From 4d64afe9cb496d40d07f38dce63e1d0605fcff14 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 1 Feb 2022 23:18:16 +0100
Subject: [PATCH] crawler: fix regression from d6b90448f3dbfd07d8b6fdfb5bbb20f28b3a331f

---
 morss/crawler.py | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index f3343f7..83d7b20 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -32,7 +32,7 @@ from .caching import default_cache
 
 try:
     # python 2
-    from urllib import quote, unquote
+    from urllib import quote
 
     from httplib import HTTPMessage
     from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
@@ -43,7 +43,7 @@ except ImportError:
     # python 3
     from email import message_from_string
     from http.client import HTTPMessage
-    from urllib.parse import quote, unquote, urlsplit
+    from urllib.parse import quote, urlsplit
     from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                 HTTPRedirectHandler, Request, addinfourl,
                                 build_opener, parse_http_list, parse_keqv_list)
@@ -151,6 +151,28 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     return build_opener(*handlers)
 
 
+def is_ascii(string):
+    # there's a native function in py3, but home-made fix for backward compatibility
+    try:
+        string.encode('ascii')
+
+    except UnicodeError:
+        return False
+
+    else:
+        return True
+
+
+def soft_quote(string):
+    " url-quote only when not a valid ascii string "
+
+    if is_ascii(string):
+        return string
+
+    else:
+        return quote(string.encode('utf-8'))
+
+
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
@@ -163,7 +185,10 @@ def sanitize_url(url):
     # turns out some websites have really badly fomatted urls (fix http:/badurl)
     url = re.sub('^(https?):/([^/])', r'\1://\2', url)
 
-    # escape non-ascii unicode characters (also encode spaces as %20)
+    # escape spaces
+    url = url.replace(' ', '%20')
+
+    # escape non-ascii unicode characters
     parts = urlsplit(url)
 
     parts = parts._replace(
@@ -171,9 +196,9 @@ def sanitize_url(url):
             parts.hostname,
             parts.hostname.encode('idna').decode('ascii')
         ),
-        path=quote(unquote(parts.path).encode('utf-8')),
-        query=quote(unquote(parts.query).encode('utf-8')),
-        fragment=quote(unquote(parts.fragment).encode('utf-8')),
+        path=soft_quote(parts.path),
+        query=soft_quote(parts.query),
+        fragment=soft_quote(parts.fragment),
     )
 
     return parts.geturl()