From 97d9dda547621be0cd310ffdcc68385e71f3ea7a Mon Sep 17 00:00:00 2001
From: pictuga
Date: Sat, 11 Sep 2021 11:34:16 +0200
Subject: [PATCH] crawler: support 308 redirects

---
 morss/crawler.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index f882ac3..b7cd1e3 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -33,16 +33,17 @@ try:
     from urllib import quote

     import mimetools
-    from urllib2 import (BaseHandler, HTTPCookieProcessor, Request, addinfourl,
-                         build_opener, parse_http_list, parse_keqv_list)
+    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
+                         Request, addinfourl, build_opener, parse_http_list,
+                         parse_keqv_list)
     from urlparse import urlparse, urlunparse
 except ImportError:
     # python 3
     import email
     from urllib.parse import quote, urlparse, urlunparse
-    from urllib.request import (BaseHandler, HTTPCookieProcessor, Request,
-                                addinfourl, build_opener, parse_http_list,
-                                parse_keqv_list)
+    from urllib.request import (BaseHandler, HTTPCookieProcessor,
+                                HTTPRedirectHandler, Request, addinfourl,
+                                build_opener, parse_http_list, parse_keqv_list)

 try:
     # python 2
@@ -134,6 +135,7 @@ def custom_opener(follow=None, delay=None):
     handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
+    handlers.append(HTTPAllRedirectHandler())
     handlers.append(HTTPEquivHandler())
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
@@ -400,6 +402,11 @@ class HTTPEquivHandler(RespStrHandler):
             resp.headers[meta.get('http-equiv').lower()] = meta.get('content')


+class HTTPAllRedirectHandler(HTTPRedirectHandler):
+    def http_error_308(self, req, fp, code, msg, headers):
+        return self.http_error_301(req, fp, 301, msg, headers)
+
+
 class HTTPRefreshHandler(BaseHandler):
     handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
