# Source code for scrapy_zyte_api.responses

from __future__ import annotations

import datetime as dt
from base64 import b64decode
from copy import copy
from typing import TYPE_CHECKING, Any, TypeAlias, cast

from scrapy.http import Headers, HtmlResponse, Response, TextResponse, XmlResponse
from scrapy.responsetypes import responsetypes

if TYPE_CHECKING:
    from scrapy import Request
    from scrapy.http.cookies import CookieJar

from scrapy_zyte_api._cookies import _process_cookies
from scrapy_zyte_api.utils import (
    _RESPONSE_HAS_ATTRIBUTES,
    _RESPONSE_HAS_IP_ADDRESS,
    _RESPONSE_HAS_PROTOCOL,
)

_DEFAULT_ENCODING = "utf-8"


class ZyteAPIMixin:
    """Mixin that builds response objects from raw Zyte API responses.

    It forwards its positional and keyword arguments to the next
    ``__init__`` in the MRO, stores the raw API response, and exposes it
    through the read-only :attr:`raw_api_response` property.
    """

    url: str

    REMOVE_HEADERS = {
        # Zyte API already decompresses the HTTP Response Body. Scrapy's
        # HttpCompressionMiddleware will error out when it attempts to
        # decompress an already decompressed body based on this header.
        "content-encoding",
    }

    def __init__(self, *args, raw_api_response: dict | None = None, **kwargs):
        """Initialize the response and remember the raw API response.

        :param raw_api_response: the Zyte API response this object was built
            from, or ``None``.

        All other arguments are passed through to the parent class.
        """
        super().__init__(*args, **kwargs)
        self._raw_api_response = raw_api_response
        if not _RESPONSE_HAS_ATTRIBUTES:
            # The parent class does not provide ``attributes`` in this case
            # (presumably an older Scrapy version; confirm against
            # scrapy_zyte_api.utils), so the tuple is defined here.
            self.attributes: tuple[str, ...] = (
                "url",
                "status",
                "headers",
                "body",
                "request",
                "flags",
                "certificate",
            )
            if _RESPONSE_HAS_IP_ADDRESS:
                self.attributes += ("ip_address",)
            if _RESPONSE_HAS_PROTOCOL:
                self.attributes += ("protocol",)
        # ``replace`` copies every attribute listed here, so the raw API
        # response is carried over to the copies it creates.
        self.attributes += ("raw_api_response",)

    def replace(self, *args, **kwargs):
        """Return a new response with the given attributes replaced.

        Attributes listed in ``self.attributes`` that are not passed as
        keyword arguments keep their current value. The ``cls`` keyword
        argument selects the class of the new object and defaults to the
        class of this object.

        :raises ValueError: if a truthy ``raw_api_response`` is passed.
        """
        if kwargs.get("raw_api_response"):
            raise ValueError("Replacing the value of 'raw_api_response' isn't allowed.")
        for attribute in self.attributes:
            kwargs.setdefault(attribute, getattr(self, attribute))
        cls = kwargs.pop("cls", self.__class__)
        return cls(*args, **kwargs)

    @property
    def raw_api_response(self) -> dict | None:
        """Contains the raw API response from Zyte API.

        For the full list of parameters, see :ref:`zapi-reference`.
        """
        return self._raw_api_response

    @staticmethod
    def _response_cookie_to_header_value(cookie):
        """Serialize a response cookie dict into a ``Set-Cookie`` header value.

        ``name`` and ``value`` are required keys. ``domain``, ``path``,
        ``expires`` (a POSIX timestamp), ``httpOnly``, ``secure`` and
        ``sameSite`` are optional and only emitted when set.
        """
        result = f"{cookie['name']}={cookie['value']}"
        domain = cookie.get("domain")
        if domain:
            result += f"; Domain={domain}"
        path = cookie.get("path")
        if path is not None:
            result += f"; Path={path}"
        expires = cookie.get("expires")
        if expires is not None:
            # Always render the expiration date in UTC, labelled as GMT.
            expires_date = dt.datetime.fromtimestamp(expires, dt.timezone.utc)
            expires_date_string = expires_date.strftime("%a, %d %b %Y %H:%M:%S GMT")
            result += f"; Expires={expires_date_string}"
        if cookie.get("httpOnly"):
            result += "; HttpOnly"
        if cookie.get("secure"):
            result += "; Secure"
        same_site = cookie.get("sameSite")
        if same_site is not None:
            result += f"; SameSite={same_site}"
        return result

    @classmethod
    def from_api_response(cls, api_response: dict, *, request: Request | None = None):
        """Alternative constructor to instantiate the response from the raw
        Zyte API response.

        The body is the base64-decoded ``httpResponseBody`` (empty when
        missing) and the status defaults to 200 when ``statusCode`` is
        missing or falsy.
        """
        return cls(
            url=api_response["url"],
            status=api_response.get("statusCode") or 200,
            body=b64decode(api_response.get("httpResponseBody") or ""),
            request=request,
            flags=["zyte-api"],
            headers=cls._prepare_headers(api_response),
            raw_api_response=api_response,
        )

    @classmethod
    def _prepare_headers(cls, api_response: dict[str, Any]):
        """Build the response headers from a raw Zyte API response.

        Headers come from ``httpResponseHeaders``, minus those named in
        :attr:`REMOVE_HEADERS` (matched case-insensitively). Headers that
        appear more than once keep all their values, in order.

        When ``experimental.responseCookies`` is set, any ``Set-Cookie``
        header from ``httpResponseHeaders`` is dropped and replaced by one
        ``Set-Cookie`` value per response cookie.

        :returns: a dict mapping header names to lists of values.
        """
        result: dict[str, list[str]] = {}
        input_headers: list[dict[str, str]] | None = api_response.get(
            "httpResponseHeaders"
        )
        # ``or {}`` also covers an explicit ``"experimental": None``.
        response_cookies: list[dict[str, str]] | None = (
            api_response.get("experimental") or {}
        ).get("responseCookies")
        if input_headers:
            headers_to_remove = copy(cls.REMOVE_HEADERS)
            if response_cookies:
                headers_to_remove.add("set-cookie")
            for header in input_headers:
                name = header["name"]
                if name.lower() in headers_to_remove:
                    continue
                # Accumulate instead of overwriting so that repeated headers
                # (e.g. several Set-Cookie lines) are all preserved.
                result.setdefault(name, []).append(header["value"])
        if response_cookies:
            result["Set-Cookie"] = [
                cls._response_cookie_to_header_value(cookie)
                for cookie in response_cookies
            ]
        return result


class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse):
    """Text (HTML) response built from a raw Zyte API response."""

    @classmethod
    def from_api_response(cls, api_response: dict, *, request: Request | None = None):
        """Alternative constructor to instantiate the response from the raw
        Zyte API response.

        ``browserHtml`` takes precedence over ``httpResponseBody``. When
        ``browserHtml`` is used, the body is its UTF-8 encoding and the
        response encoding is set accordingly; otherwise the encoding is left
        as ``None`` and the body is the base64-decoded ``httpResponseBody``
        (or ``None`` when neither field is set).
        """
        body = None
        encoding = None

        if api_response.get("browserHtml"):
            encoding = _DEFAULT_ENCODING  # Zyte API has "utf-8" by default
            body = api_response["browserHtml"].encode(encoding)
        elif api_response.get("httpResponseBody"):
            body = b64decode(api_response["httpResponseBody"])

        return cls(
            url=api_response["url"],
            status=api_response.get("statusCode") or 200,
            body=body,
            encoding=encoding,
            request=request,
            flags=["zyte-api"],
            headers=cls._prepare_headers(api_response),
            raw_api_response=api_response,
        )

    def replace(self, *args, **kwargs):
        """Return a copy with the given attributes replaced, keeping the
        current encoding unless ``encoding`` is passed explicitly.
        """
        kwargs.setdefault("encoding", self.encoding)
        return ZyteAPIMixin.replace(self, *args, **kwargs)
class ZyteAPIXmlResponse(ZyteAPIMixin, XmlResponse):
    """XML response built from a raw Zyte API response."""

    pass
# JsonResponse is imported defensively: when the installed Scrapy does not
# provide it, both names below are set to None so that callers can check for
# availability.
try:
    from scrapy.http import JsonResponse as _JsonResponse

    class ZyteAPIJsonResponse(ZyteAPIMixin, _JsonResponse):
        """JSON response built from a raw Zyte API response."""

        pass

    _SCRAPY_JSON_CLS: type | None = _JsonResponse
except ImportError:
    ZyteAPIJsonResponse = None  # type: ignore[assignment, misc]
    _SCRAPY_JSON_CLS = None
class ZyteAPIResponse(ZyteAPIMixin, Response):
    """Generic (non-text) response built from a raw Zyte API response."""

    pass
_IMMUTABLE_JSON: TypeAlias = None | str | int | float | bool
_JSON: TypeAlias = (
    None | str | int | float | bool | list["_JSON"] | dict[_IMMUTABLE_JSON, "_JSON"]
)
_API_RESPONSE: TypeAlias = dict[str, _JSON]

# Maps the Scrapy text response class detected for a response to the matching
# Zyte API response class. Text classes not listed here fall back to
# ZyteAPITextResponse in _process_response().
_SCRAPY_TO_ZYTE_RESPONSE: dict[type[TextResponse], type[ZyteAPIMixin]] = {
    XmlResponse: ZyteAPIXmlResponse,
}
if _SCRAPY_JSON_CLS is not None and ZyteAPIJsonResponse is not None:
    _SCRAPY_TO_ZYTE_RESPONSE[_SCRAPY_JSON_CLS] = ZyteAPIJsonResponse


def _process_response(
    api_response: _API_RESPONSE,
    request: Request,
    cookie_jars: dict[Any, CookieJar] | None,
) -> (
    ZyteAPITextResponse
    | ZyteAPIXmlResponse
    | ZyteAPIJsonResponse
    | ZyteAPIResponse
    | None
):
    """Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
    this returns the most appropriate Zyte API response class based on the
    response content type.

    ``_process_cookies`` is called first with the API response, the request
    and ``cookie_jars``. Then:

    - if ``browserHtml`` is set, a :class:`ZyteAPITextResponse` is returned;
    - otherwise, if both ``httpResponseHeaders`` and ``httpResponseBody`` are
      set and Scrapy detects a text response type, the matching class from
      ``_SCRAPY_TO_ZYTE_RESPONSE`` is used (:class:`ZyteAPITextResponse` by
      default);
    - in any other case, a :class:`ZyteAPIResponse` is returned.
    """
    # NOTES: Currently, Zyte API does NOT allow both 'browserHtml' and
    # 'httpResponseBody' to be present at the same time. The support for both
    # will be addressed in the future. Reference:
    # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460
    # For now, at least one of them should be present.

    _process_cookies(api_response, request, cookie_jars)

    if api_response.get("browserHtml"):
        # Using TextResponse because browserHtml always returns a browser-rendered page
        # even when requesting files (like images)
        return ZyteAPITextResponse.from_api_response(api_response, request=request)

    if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"):
        # a plain dict here doesn't work correctly on Scrapy < 2.1
        scrapy_headers = Headers()
        for header in cast("list[dict[str, str]]", api_response["httpResponseHeaders"]):
            scrapy_headers[header["name"].encode()] = header["value"].encode()

        # These headers are only used to let Scrapy guess the response type;
        # the headers of the returned response are built by from_api_response().
        response_cls = responsetypes.from_args(
            headers=scrapy_headers,
            url=cast("str", api_response["url"]),
            # FIXME: update this when python-zyte-api supports base64 decoding
            body=b64decode(api_response["httpResponseBody"]),  # type: ignore[arg-type]
        )
        if issubclass(response_cls, TextResponse):
            zyte_cls = _SCRAPY_TO_ZYTE_RESPONSE.get(response_cls, ZyteAPITextResponse)
            return zyte_cls.from_api_response(api_response, request=request)

    return ZyteAPIResponse.from_api_response(api_response, request=request)