Move custom_handler to crawler

Makes more sense and is easier to reuse. Also cleaned up the code a bit.
2017-03-18 22:51:27 -10:00
parent beec6469cc
commit 2003e2760b
2 changed files with 24 additions and 17 deletions

@@ -48,8 +48,6 @@ THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
PROTOCOL = ['http', 'https', 'ftp']
@@ -127,19 +125,6 @@ def parseOptions(options):
return out
-default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
-                    crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-                    crawler.HTTPRefreshHandler()]
-def custom_handler(accept, strict=False, delay=DELAY, encoding=None):
-    handlers = default_handlers[:]
-    handlers.append(crawler.EncodingFixHandler(encoding))
-    handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept], strict))
-    handlers.append(crawler.SQliteCacheHandler(delay))
-    return build_opener(*handlers)
def ItemFix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
@@ -267,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = -2
try:
-con = custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
except (IOError, HTTPException) as e:
@@ -368,7 +353,7 @@ def FeedFetch(url, options):
delay = 0
try:
-con = custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
+con = crawler.custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
xml = con.read()
except (HTTPError) as e:
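
For reference, a minimal sketch of what the relocated custom_handler presumably looks like once it lives in crawler.py. The crawler.py side of this commit is not shown above, so the module layout and the default DELAY value below are assumptions; the handler classes and MIMETYPE are the objects crawler already defines (they were referenced as crawler.* before the move), and DEFAULT_UA is the string that was in morss.py.

# Sketch only: custom_handler as it might read inside crawler.py after the move.
# GZIPHandler, UAHandler, AutoRefererHandler, HTTPEquivHandler, HTTPRefreshHandler,
# EncodingFixHandler, ContentNegociationHandler, SQliteCacheHandler and MIMETYPE
# are assumed to be defined earlier in crawler.py; the DELAY value is a placeholder.
try:
    from urllib.request import build_opener  # Python 3
except ImportError:
    from urllib2 import build_opener  # Python 2

DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
DELAY = 10 * 60  # placeholder default cache delay, in seconds

def custom_handler(accept, strict=False, delay=DELAY, encoding=None):
    # handlers applied to every request
    handlers = [GZIPHandler(), UAHandler(DEFAULT_UA),
                AutoRefererHandler(), HTTPEquivHandler(),
                HTTPRefreshHandler()]

    # request-specific handlers: charset fix-up, content negotiation, caching
    handlers.append(EncodingFixHandler(encoding))
    handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
    handlers.append(SQliteCacheHandler(delay))

    return build_opener(*handlers)

Call sites in morss.py then build an opener on demand, as the two hunks above show, e.g. crawler.custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2).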