crawler: fix regression from d6b90448f3
continuous-integration/drone/push Build is failing Details

master
pictuga 2022-02-01 23:18:16 +01:00
parent d3b623482d
commit 4d64afe9cb
1 changed file with 31 additions and 6 deletions

View File

@ -32,7 +32,7 @@ from .caching import default_cache
try: try:
# python 2 # python 2
from urllib import quote, unquote from urllib import quote
from httplib import HTTPMessage from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler, from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
@ -43,7 +43,7 @@ except ImportError:
# python 3 # python 3
from email import message_from_string from email import message_from_string
from http.client import HTTPMessage from http.client import HTTPMessage
from urllib.parse import quote, unquote, urlsplit from urllib.parse import quote, urlsplit
from urllib.request import (BaseHandler, HTTPCookieProcessor, from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl, HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list) build_opener, parse_http_list, parse_keqv_list)
@ -151,6 +151,28 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
return build_opener(*handlers) return build_opener(*handlers)
def is_ascii(string):
    """Report whether *string* contains only ASCII characters.

    Python 3 has str.isascii(), but this hand-rolled check keeps
    backward compatibility with Python 2.
    """
    try:
        string.encode('ascii')
        return True

    except UnicodeError:
        return False
def soft_quote(string):
    " url-quote only when not a valid ascii string "
    # Inline ASCII probe: a pure-ASCII string passes through untouched,
    # anything else gets percent-encoded from its UTF-8 bytes.
    try:
        string.encode('ascii')

    except UnicodeError:
        return quote(string.encode('utf-8'))

    return string
def sanitize_url(url): def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes # make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes): if isinstance(url, bytes):
@ -163,7 +185,10 @@ def sanitize_url(url):
# turns out some websites have really badly fomatted urls (fix http:/badurl) # turns out some websites have really badly fomatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url) url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape non-ascii unicode characters (also encode spaces as %20) # escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
parts = urlsplit(url) parts = urlsplit(url)
parts = parts._replace( parts = parts._replace(
@ -171,9 +196,9 @@ def sanitize_url(url):
parts.hostname, parts.hostname,
parts.hostname.encode('idna').decode('ascii') parts.hostname.encode('idna').decode('ascii')
), ),
path=quote(unquote(parts.path).encode('utf-8')), path=soft_quote(parts.path),
query=quote(unquote(parts.query).encode('utf-8')), query=soft_quote(parts.query),
fragment=quote(unquote(parts.fragment).encode('utf-8')), fragment=soft_quote(parts.fragment),
) )
return parts.geturl() return parts.geturl()