From f3d1f92b393fca9ac1238eeb4ae041d55267b232 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 7 Apr 2020 10:38:36 +0200
Subject: [PATCH] Detect encoding everytime

---
 README.md            |  1 -
 morss/feeds.py       | 16 ++++++++--------
 morss/morss.py       |  9 ++++-----
 morss/readabilite.py | 11 +++++++----
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index dc5bd04..8fde243 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,6 @@ The arguments are:
     - `mono`: disable multithreading while fetching, makes debugging easier
     - `theforce`: force download the rss feed and ignore cached http errros
     - `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
-    - `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
 - http server only
     - `callback=NAME`: for JSONP calls
     - `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
diff --git a/morss/feeds.py b/morss/feeds.py
index 7485760..5859390 100644
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -15,7 +15,7 @@
 import dateutil.parser
 from copy import deepcopy
 
 import lxml.html
-from bs4 import BeautifulSoup
+from .readabilite import parse as html_parse
 
 json.encoder.c_make_encoder = None
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()
@@ -67,7 +67,7 @@
         for path in ruleset['path']:
             if fnmatch(url, path):
                 parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                return parser(data, ruleset)
+                return parser(data, ruleset, encoding=encoding)
 
     # 2) Look for a parser based on mimetype
 
@@ -86,7 +86,7 @@
     # 'path' as they should have been caught beforehands
 
     try:
-        feed = parser(data)
+        feed = parser(data, encoding=encoding)
 
     except (ValueError):
         # parsing did not work
@@ -113,7 +113,7 @@
 
 
 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None):
+    def __init__(self, data=None, rules=None, parent=None, encoding=None):
         if rules is None:
             rules = parse_rules()[self.default_ruleset]
 
@@ -122,9 +122,10 @@ class ParserBase(object):
         if data is None:
             data = rules['base']
 
-        self.root = self.parse(data)
         self.parent = parent
+        self.encoding = encoding
 
+        self.root = self.parse(data)
 
     def parse(self, raw):
         pass
@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
     mimetype = ['text/html', 'application/xhtml+xml']
 
     def parse(self, raw):
-        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
+        return html_parse(raw, encoding=self.encoding)
 
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)
diff --git a/morss/morss.py b/morss/morss.py
index 19973a0..b0d1735 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -10,7 +10,6 @@
 import re
 
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
 
 from . import feeds
 from . import crawler
@@ -261,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
            log('non-text page')
            return True
 
-    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
     if out is not None:
         item.content = out
@@ -329,7 +328,7 @@ def FeedFetch(url, options):
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml)
+        rss = feeds.FeedHTML(xml, encoding=encoding)
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -349,7 +348,7 @@
 
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype)
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
@@ -649,7 +648,7 @@ def cgi_page(environ, start_response):
     data, con, contenttype, encoding = crawler.adv_get(url=url)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html = readabilite.parse(data, encoding=encoding)
         html.make_links_absolute(con.geturl())
 
         kill_tags = ['script', 'iframe', 'noscript']
diff --git a/morss/readabilite.py b/morss/readabilite.py
index 7cbece7..b5dad9c 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -6,11 +6,14 @@
 import re
 
 def parse(data, encoding=None):
     if encoding:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
-    else:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
 
-    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
+    else:
+        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
+
+    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+
+    return lxml.html.fromstring(data, parser=parser)
 
 def count_words(string):
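
Not part of the patch itself: a minimal, standalone sketch of the decode path that morss/readabilite.py's parse() follows after this change. BeautifulSoup applies the caller-supplied encoding (or its own detection when none is given) and re-serializes the document as UTF-8, so lxml is always handed a parser with a fixed encoding='utf-8'. The sample markup and variable names are illustrative only, not taken from the morss codebase.

import lxml.html
from bs4 import BeautifulSoup

# Hypothetical input: latin-1 bytes, as the crawler might have fetched them.
raw = '<html><body><p>caf\xe9</p></body></html>'.encode('latin-1')

# BeautifulSoup decodes using the given encoding; prettify() re-encodes to UTF-8 bytes.
data = BeautifulSoup(raw, 'lxml', from_encoding='latin-1').prettify('utf-8')

# lxml therefore only ever sees UTF-8, regardless of how the page was served.
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
root = lxml.html.fromstring(data, parser=parser)

print(root.findtext('.//p').strip())  # café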