From 21480f90de2c9b856ed199dce9f360b42dafb381 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 25 Nov 2017 19:57:41 +0100 Subject: [PATCH] Move from gzip to zlib to decompress data Faster on incomplete files --- morss/crawler.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index 4d00fca..14ba904 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -3,7 +3,7 @@ import sys import ssl import socket -from gzip import GzipFile +import zlib from io import BytesIO, StringIO import re import chardet @@ -100,22 +100,9 @@ class SizeLimitHandler(BaseHandler): https_response = http_response -def UnGzip(cprss, CHUNKSIZE=64*1024): # the bigger the CHUNKSIZE, the faster +def UnGzip(data): " Supports truncated files " - gz = GzipFile(fileobj=cprss, mode='rb') - - data = b'' - chunk = gz.read(CHUNKSIZE) - - try: - while chunk: - data += chunk - chunk = gz.read(CHUNKSIZE) - - except (IOError, EOFError): - pass - - return data + return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data) class GZIPHandler(BaseHandler): @@ -128,7 +115,7 @@ class GZIPHandler(BaseHandler): if resp.headers.get('Content-Encoding') == 'gzip': data = resp.read() - data = UnGzip(BytesIO(data)) + data = UnGzip(data) resp.headers['Content-Encoding'] = 'identity'