From f83a4b143006577d9434633dd43f2917a6e51307 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Sat, 1 Jan 2022 12:36:06 +0100
Subject: [PATCH] readabilite: avoid double parsing of html

---
Reviewer notes (text in this section is not part of the commit message):

* What the hunk does: the removed parse() built a BeautifulSoup tree with the
  'lxml' builder, serialized it with prettify('utf-8'), and then parsed that
  output a second time with lxml.html.fromstring() using
  lxml.html.HTMLParser(remove_comments=True, encoding='utf-8').
  The added parse() makes a single call to
  lxml.html.soupparser.fromstring(data, **kwargs), forwarding
  from_encoding=encoding only when `encoding` is truthy.
* NOTE(review): remove_comments=True is no longer passed anywhere in the new
  code, so HTML comments may now be kept in the returned tree -- confirm that
  downstream code tolerates comment nodes.
* NOTE(review): the explicit 'lxml' builder argument is gone as well; the
  BeautifulSoup builder is now whatever lxml.html.soupparser selects by
  default (presumably 'html.parser' -- verify against the lxml docs), which
  may repair broken markup differently.
* NOTE(review): `from bs4 import BeautifulSoup` is kept as context; this hunk
  removes the only uses visible here -- check whether the rest of the module
  still needs the import.

 morss/readabilite.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/morss/readabilite.py b/morss/readabilite.py
index 63d6384..d34997e 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -19,19 +19,13 @@ import re
 
 import lxml.etree
 import lxml.html
+import lxml.html.soupparser
 from bs4 import BeautifulSoup
 
 
 def parse(data, encoding=None):
-    if encoding:
-        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
-
-    else:
-        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
-
-    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
-
-    return lxml.html.fromstring(data, parser=parser)
+    kwargs = {'from_encoding': encoding} if encoding else {}
+    return lxml.html.soupparser.fromstring(data, **kwargs)
 
 
 def count_words(string):