Use etag/last-modified to fetch xml feeds

master
pictuga 2013-07-18 23:54:13 +02:00
parent 70df746416
commit 9e324465e4
1 changed file with 32 additions and 3 deletions

@@ -237,6 +237,33 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
 	https_response = http_response
 	https_request = http_request
 
+class CacheDownload(urllib2.BaseHandler):
+	def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_RSS):
+		self.cache = cache
+		self.etag = etag
+		self.lastmodified = lastmodified
+		self.useragent = useragent
+
+	def http_request(self, req):
+		req.add_unredirected_header('User-Agent', self.useragent)
+		if self.cache:
+			if self.etag:
+				req.add_unredirected_header('If-None-Match', self.etag)
+			if self.lastmodified:
+				req.add_unredirected_header('If-Modified-Since', self.lastmodified)
+		return req
+
+	def http_error_304(self, req, fp, code, msg, headers):
+		log('http cached')
+		if self.etag:
+			headers.addheader('etag', self.etag)
+		if self.lastmodified:
+			headers.addheader('last-modified', self.lastmodified)
+		resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
+		return resp
+
+	https_request = http_request
+
 def decodeHTML(con, data):
 	if con.headers.getparam('charset'):
 		log('header')
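The handler added above hooks into urllib2's opener chain: http_request attaches the cached validators (If-None-Match / If-Modified-Since) on the way out, and http_error_304 turns a 304 Not Modified reply into an ordinary 200 response built from the cached body, so callers never have to special-case the 304. A minimal usage sketch, not part of the commit: the cached values and the feed URL below are made up, and it assumes the CacheDownload class from this diff is in scope.

import urllib2

# Hypothetical leftovers from a previous fetch (illustrative values only).
cached_body = '<rss version="2.0">...</rss>'
cached_etag = '"abc123"'
cached_lastmod = 'Thu, 18 Jul 2013 21:54:13 GMT'

# The opener sends the validators and replays the cache on a 304 reply.
handler = CacheDownload(cached_body, cached_etag, cached_lastmod)
con = urllib2.build_opener(handler).open('http://example.com/feed.xml')

# Fresh body on 200, or the cached body injected by http_error_304 on 304.
xml = con.read()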
@@ -357,10 +384,12 @@ def Gather(url, cachePath, mode='feed'):
 		xml = cache.get(url)
 	else:
 		try:
-			req = urllib2.Request(url)
-			req.add_unredirected_header('User-Agent', UA_RSS)
-			xml = urllib2.urlopen(req).read()
+			opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'))
+			con = urllib2.build_opener(opener).open(url)
+			xml = con.read()
 			cache.set(url, xml)
+			cache.set('etag', con.headers.getheader('etag'))
+			cache.set('lastmodified', con.headers.getheader('last-modified'))
 		except (urllib2.HTTPError, urllib2.URLError):
 			return False
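The conditional GET only saves bandwidth if the validators returned with a 200 are persisted next to the body, which is what the two new cache.set() calls do; the next run feeds them back through CacheDownload. A rough round-trip sketch under the same assumptions, with a plain dict standing in for the project's cache object (fetch_with_cache and the dict are illustrative, not morss code):

import urllib2

def fetch_with_cache(url, cache):
	# `cache` is a plain dict here, mimicking the keys used in Gather().
	handler = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'))
	con = urllib2.build_opener(handler).open(url)
	xml = con.read()

	# Store body plus validators so the next call can issue a conditional GET.
	cache[url] = xml
	cache['etag'] = con.headers.getheader('etag')
	cache['lastmodified'] = con.headers.getheader('last-modified')
	return xml

store = {}
first = fetch_with_cache('http://example.com/feed.xml', store)   # full download
second = fetch_with_cache('http://example.com/feed.xml', store)  # 304 -> cached body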