crawler: randomize user agent
parent
8187876a06
commit
6a0531ca03
|
@ -7,6 +7,7 @@ import chardet
|
||||||
from cgi import parse_header
|
from cgi import parse_header
|
||||||
import lxml.html
|
import lxml.html
|
||||||
import time
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
|
@ -31,7 +32,19 @@ MIMETYPE = {
|
||||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
|
DEFAULT_UAS = [
|
||||||
|
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
|
||||||
|
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
|
||||||
|
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def get(*args, **kwargs):
|
def get(*args, **kwargs):
|
||||||
|
@ -70,7 +83,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
|
||||||
handlers.append(GZIPHandler())
|
handlers.append(GZIPHandler())
|
||||||
handlers.append(HTTPEquivHandler())
|
handlers.append(HTTPEquivHandler())
|
||||||
handlers.append(HTTPRefreshHandler())
|
handlers.append(HTTPRefreshHandler())
|
||||||
handlers.append(UAHandler(DEFAULT_UA))
|
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||||
handlers.append(BrowserlyHeaderHandler())
|
handlers.append(BrowserlyHeaderHandler())
|
||||||
handlers.append(EncodingFixHandler(encoding))
|
handlers.append(EncodingFixHandler(encoding))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue