crawler: randomize user agent

master
pictuga 2020-04-24 11:28:39 +02:00
parent 8187876a06
commit 6a0531ca03
1 changed file with 15 additions and 2 deletions

View File

@@ -7,6 +7,7 @@ import chardet
from cgi import parse_header from cgi import parse_header
import lxml.html import lxml.html
import time import time
import random
try: try:
# python 2 # python 2
@@ -31,7 +32,19 @@ MIMETYPE = {
'html': ['text/html', 'application/xhtml+xml', 'application/xml']} 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0' DEFAULT_UAS = [
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
def get(*args, **kwargs): def get(*args, **kwargs):
@@ -70,7 +83,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
handlers.append(GZIPHandler()) handlers.append(GZIPHandler())
handlers.append(HTTPEquivHandler()) handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler()) handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA)) handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler()) handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding)) handlers.append(EncodingFixHandler(encoding))