From 1b7fdad6a88cd7f7857d2436149db2467014473e Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 8 Sep 2013 15:48:34 +0200 Subject: [PATCH] Improve broken XML support The TPB feed is a good example. Now supports ampersands in feeds, using the "recover" mode of etree.XMLParser. Broken UTF-8 strings in feeds are now also supported. --- feeds.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/feeds.py b/feeds.py index eb11b4d..8127e67 100644 --- a/feeds.py +++ b/feeds.py @@ -42,7 +42,10 @@ class FeedException(Exception): pass def parse(data): - doc = etree.fromstring(data) + data = data.decode('utf-8', 'replace').encode('utf-8') + parser = etree.XMLParser(recover=True) + doc = etree.fromstring(data, parser) + match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP) if len(match): mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,