crawler: parse html to get http-equiv

For sure slower, but way cleaner (and probably more stable)
This commit is contained in:
pictuga 2017-03-08 17:50:57 -10:00
parent 92b4a5c57c
commit fb8825b410

View File

@@ -7,6 +7,7 @@ from gzip import GzipFile
 from io import BytesIO, StringIO
 import re
 import chardet
+import lxml.html
 import sqlite3
 import time
@@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler):
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and contenttype.startswith('text/'):
-            if contenttype in MIMETYPE['html']:
-                data = resp.read()
-                regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
-                headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
-                for header in headers:
-                    resp.headers[header['key'].lower()] = header['value']
+        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+            data = resp.read()
+            headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
+            for header in headers:
+                resp.headers[header.get('http-equiv').lower()] = header.get('content')
             fp = BytesIO(data)
             old_resp = resp
             resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
         return resp