Source code for scrapy.downloadermiddlewares.httpcompression

import warnings
from logging import getLogger

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils._compression import (
    _DecompressionMaxSizeExceeded,
    _inflate,
    _unbrotli,
    _unzstd,
)
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip

logger = getLogger(__name__)


ACCEPTED_ENCODINGS = [b'gzip', b'deflate']

try:
    import brotli
except ImportError:
    pass
else:
    ACCEPTED_ENCODINGS.append(b'br')

try:
    import zstandard
except ImportError:
    pass
else:
    ACCEPTED_ENCODINGS.append(b'zstd')
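
# Note (added for clarity, not in the original source): the encodings
# collected above are advertised verbatim in the Accept-Encoding request
# header (see process_request below) -- b"gzip, deflate" on a bare install,
# plus b"br" and b"zstd" when the optional brotli and zstandard packages
# are importable.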


class HttpCompressionMiddleware:
    """This middleware allows compressed (gzip, deflate, and optionally
    br/zstd) traffic to be sent/received from web sites"""

    def __init__(self, stats=None, *, crawler=None):
        if not crawler:
            self.stats = stats
            # Standalone defaults: 1 GiB decompression cap, 32 MiB warning
            # threshold (used when no crawler settings are available).
            self._max_size = 1073741824
            self._warn_size = 33554432
            return
        self.stats = crawler.stats
        self._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
        self._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
        crawler.signals.connect(self.open_spider, signals.spider_opened)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COMPRESSION_ENABLED'):
            raise NotConfigured
        try:
            return cls(crawler=crawler)
        except TypeError:
            # Legacy subclass whose __init__ does not accept the 'crawler'
            # keyword: warn, then configure the instance manually.
            warnings.warn(
                "HttpCompressionMiddleware subclasses must either modify "
                "their '__init__' method to support a 'crawler' parameter or "
                "reimplement their 'from_crawler' method.",
                ScrapyDeprecationWarning,
            )
            mw = cls()
            mw.stats = crawler.stats
            mw._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
            mw._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
            crawler.signals.connect(mw.open_spider, signals.spider_opened)
            return mw

    def open_spider(self, spider):
        # Per-spider size limits override the crawler-wide settings.
        if hasattr(spider, "download_maxsize"):
            self._max_size = spider.download_maxsize
        if hasattr(spider, "download_warnsize"):
            self._warn_size = spider.download_warnsize

    def process_request(self, request, spider):
        request.headers.setdefault('Accept-Encoding',
                                   b", ".join(ACCEPTED_ENCODINGS))

    def process_response(self, request, response, spider):
        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                # Decode the outermost encoding; any remaining encodings are
                # left in the Content-Encoding header.
                encoding = content_encoding.pop()
                max_size = request.meta.get("download_maxsize", self._max_size)
                warn_size = request.meta.get("download_warnsize", self._warn_size)
                try:
                    decoded_body = self._decode(
                        response.body, encoding.lower(), max_size
                    )
                except _DecompressionMaxSizeExceeded:
                    raise IgnoreRequest(
                        f"Ignored response {response} because its body "
                        f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
                        f"({max_size} B) during decompression."
                    )
                if len(response.body) < warn_size <= len(decoded_body):
                    logger.warning(
                        f"{response} body size after decompression "
                        f"({len(decoded_body)} B) is larger than the "
                        f"download warning size ({warn_size} B)."
                    )
                if self.stats:
                    self.stats.inc_value('httpcompression/response_bytes',
                                         len(decoded_body), spider=spider)
                    self.stats.inc_value('httpcompression/response_count',
                                         spider=spider)
                respcls = responsetypes.from_args(
                    headers=response.headers, url=response.url, body=decoded_body
                )
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']
        return response

    def _decode(self, body, encoding, max_size):
        # x-gzip is a legacy alias for gzip (RFC 2616).
        if encoding == b'gzip' or encoding == b'x-gzip':
            return gunzip(body, max_size=max_size)
        if encoding == b'deflate':
            return _inflate(body, max_size=max_size)
        if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
            return _unbrotli(body, max_size=max_size)
        if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
            return _unzstd(body, max_size=max_size)
        return body
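

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the original module: it drives the
# middleware directly against a hand-built gzip response. The bare
# HttpCompressionMiddleware() call relies on the no-crawler defaults in
# __init__ above; the URL and body are arbitrary placeholders.
if __name__ == "__main__":
    import gzip

    from scrapy.http import HtmlResponse, Request

    mw = HttpCompressionMiddleware()

    request = Request("https://example.com")
    mw.process_request(request, spider=None)
    # The request now advertises the supported codecs, e.g. b'gzip, deflate'.
    print(request.headers[b"Accept-Encoding"])

    compressed = gzip.compress(b"<html><body>hello</body></html>")
    response = HtmlResponse(
        "https://example.com",
        body=compressed,
        headers={"Content-Encoding": "gzip"},
    )
    decoded = mw.process_response(request, response, spider=None)
    print(decoded.body)  # b'<html><body>hello</body></html>'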