From 9bc4417be380f247a815603d69b0452f1fe0e213 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 25 Sep 2013 12:32:40 +0200 Subject: [PATCH] More flexible xml caching New includes a 'type' var, to remember what we did out of it (normal, nothing, grabbed xml link, etc). xml/html mimetype are now saved in a dict, for easier editing, and consistency. --- morss.py | 66 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/morss.py b/morss.py index c67cf6a..579430f 100644 --- a/morss.py +++ b/morss.py @@ -36,6 +36,9 @@ DEBUG = False UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)' UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11' +MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], + 'html': ['text/html', 'application/xhtml+xml']} + PROTOCOL = ['http', 'https', 'ftp'] if 'REQUEST_URI' in os.environ: @@ -173,7 +176,7 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor): data = GzipFile(fileobj=StringIO(data), mode='r').read() # redirect - if resp.info().type in ['text/html', 'application/xhtml+xml']: + if resp.info().type in MIMETYPE['html']: match = re.search(r'(?i)]*?url=(http.*?)["\']', data) if match: newurl = match.groups()[0] @@ -356,40 +359,49 @@ def Gather(url, cachePath, progress=False): log(cache._hash) # fetch feed - if cache.isYoungerThan(DELAY): - if 'xml' in cache: - log('xml cached') - xml = cache.get('xml') - if 'link' in cache: - log('link cached') - return Gather(cache.get('link'), cachePath, progress) + if cache.isYoungerThan(DELAY) and 'xml' in cache and 'style' in cache: + log('xml cached') + xml = cache.get('xml') + style = cache.get('style') else: try: opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified')) - con = urllib2.build_opener(opener).open(url) + con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT) xml = con.read() except (urllib2.URLError, httplib.HTTPException, socket.timeout): return False - if xml[:5] == '