From 3176c2a8e8028a6bbbf2752c616b115c16bd680c Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 15 Sep 2013 14:57:37 +0200 Subject: [PATCH] Fix bad characters detection Now works with any encoding, no longer restricted to utf-8. Uses regex to find encoding (not perfect, but rather fast, since it's used on a substring) --- feeds.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/feeds.py b/feeds.py index 8127e67..43892e3 100644 --- a/feeds.py +++ b/feeds.py @@ -42,10 +42,17 @@ class FeedException(Exception): pass def parse(data): - data = data.decode('utf-8', 'replace').encode('utf-8') + # encoding + match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) + if match: + enc = match.groups()[0].lower() + data = data.decode(enc, 'ignore').encode(enc) + + # parse parser = etree.XMLParser(recover=True) doc = etree.fromstring(data, parser) + # rss match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP) if len(match): mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,