Use etag/last-modified to fetch xml feeds
parent
70df746416
commit
9e324465e4
35
morss.py
35
morss.py
|
@ -237,6 +237,33 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
|
||||||
https_response = http_response
|
https_response = http_response
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
class CacheDownload(urllib2.BaseHandler):
|
||||||
|
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_RSS):
|
||||||
|
self.cache = cache
|
||||||
|
self.etag = etag
|
||||||
|
self.lastmodified = lastmodified
|
||||||
|
self.useragent = useragent
|
||||||
|
|
||||||
|
def http_request(self, req):
|
||||||
|
req.add_unredirected_header('User-Agent', self.useragent)
|
||||||
|
if self.cache:
|
||||||
|
if self.etag:
|
||||||
|
req.add_unredirected_header('If-None-Match', self.etag)
|
||||||
|
if self.lastmodified:
|
||||||
|
req.add_unredirected_header('If-Modified-Since', self.lastmodified)
|
||||||
|
return req
|
||||||
|
|
||||||
|
def http_error_304(self, req, fp, code, msg, headers):
|
||||||
|
log('http cached')
|
||||||
|
if self.etag:
|
||||||
|
headers.addheader('etag', self.etag)
|
||||||
|
if self.lastmodified:
|
||||||
|
headers.addheader('last-modified', self.lastmodified)
|
||||||
|
resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
|
||||||
|
return resp
|
||||||
|
|
||||||
|
https_request = http_request
|
||||||
|
|
||||||
def decodeHTML(con, data):
|
def decodeHTML(con, data):
|
||||||
if con.headers.getparam('charset'):
|
if con.headers.getparam('charset'):
|
||||||
log('header')
|
log('header')
|
||||||
|
@ -357,10 +384,12 @@ def Gather(url, cachePath, mode='feed'):
|
||||||
xml = cache.get(url)
|
xml = cache.get(url)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
req = urllib2.Request(url)
|
opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'))
|
||||||
req.add_unredirected_header('User-Agent', UA_RSS)
|
con = urllib2.build_opener(opener).open(url)
|
||||||
xml = urllib2.urlopen(req).read()
|
xml = con.read()
|
||||||
cache.set(url, xml)
|
cache.set(url, xml)
|
||||||
|
cache.set('etag', con.headers.getheader('etag'))
|
||||||
|
cache.set('lastmodified', con.headers.getheader('last-modified'))
|
||||||
except (urllib2.HTTPError, urllib2.URLError):
|
except (urllib2.HTTPError, urllib2.URLError):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue