From c25aec7107b1556a4e545dceb495ef493f503e93 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 15 Sep 2013 15:33:14 +0200 Subject: [PATCH] Only perform redirects on html pages --- morss.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/morss.py b/morss.py index d8e8739..c486f15 100644 --- a/morss.py +++ b/morss.py @@ -196,19 +196,20 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor): data = GzipFile(fileobj=StringIO(data), mode='r').read() # redirect - match = re.search(r'(?i)]*?url=(http.*?)["\']', data) - if match: - newurl = match.groups()[0] - log('redirect: %s' % newurl) + if resp.info().type in ['text/html', 'application/xhtml+xml']: + match = re.search(r'(?i)]*?url=(http.*?)["\']', data) + if match: + newurl = match.groups()[0] + log('redirect: %s' % newurl) - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ('content-length', 'content-type')) - new = urllib2.Request(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) + newheaders = dict((k,v) for k,v in req.headers.items() + if k.lower() not in ('content-length', 'content-type')) + new = urllib2.Request(newurl, + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) - return self.parent.open(new, timeout=req.timeout) + return self.parent.open(new, timeout=req.timeout) # decode data = decodeHTML(resp, data)