diff --git a/morss/crawler.py b/morss/crawler.py
index fe0ec67..3bbc7fd 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -137,28 +137,49 @@ class ContentNegociationHandler(BaseHandler): #FIXME
https_request = http_request
-class MetaRedirectHandler(BaseHandler):
+class HTTPEquivHandler(BaseHandler):
+ " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
+
+ handler_order = 600
+
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and contenttype.startswith('text/'):
if contenttype in MIMETYPE['html']:
data = resp.read()
- match = re.search(b'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
- if match:
- new_url = match.groups()[0]
- new_headers = dict((k, v) for k, v in list(req.headers.items())
- if k.lower() not in ('content-length', 'content-type'))
- new = Request(new_url,
- headers=new_headers,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
- return self.parent.open(new, timeout=req.timeout)
- else:
- fp = BytesIO(data)
- old_resp = resp
- resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
- resp.msg = old_resp.msg
+ regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
+ headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
+
+ for header in headers:
+ resp.headers[header['key'].lower()] = header['value']
+
+ fp = BytesIO(data)
+ old_resp = resp
+ resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+ resp.msg = old_resp.msg
+
+ return resp
+
+ https_response = http_response
+
+
+class HTTPRefreshHandler(BaseHandler):
+ handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
+
+ def http_response(self, req, resp):
+ if 200 <= resp.code < 300:
+ if resp.headers.get('refresh'):
+ regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
+ match = re.search(regex, resp.headers.get('refresh'))
+
+ if match:
+ url = match.groupdict()['url']
+
+ if url:
+ resp.code = 302
+ resp.msg = 'Moved Temporarily'
+ resp.headers['location'] = url
return resp
diff --git a/morss/morss.py b/morss/morss.py
index 00682b0..0939a19 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -233,8 +233,8 @@ class Cache:
default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
- crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
- crawler.EncodingFixHandler()]
+ crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
+ crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
def accept_handler(*kargs):
handlers = default_handlers[:]