from base64 import b64decode
from copy import copy
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from scrapy import Request
from scrapy.http import Headers, HtmlResponse, Response, TextResponse
from scrapy.http.cookies import CookieJar
from scrapy.responsetypes import responsetypes
from scrapy_zyte_api._cookies import _process_cookies
from scrapy_zyte_api.utils import (
_RESPONSE_HAS_ATTRIBUTES,
_RESPONSE_HAS_IP_ADDRESS,
_RESPONSE_HAS_PROTOCOL,
)
_DEFAULT_ENCODING = "utf-8"
class ZyteAPIMixin:
REMOVE_HEADERS = {
# Zyte API already decompresses the HTTP Response Body. Scrapy's
# HttpCompressionMiddleware will error out when it attempts to
# decompress an already decompressed body based on this header.
"content-encoding",
}
def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs):
super().__init__(*args, **kwargs)
self._raw_api_response = raw_api_response
if not _RESPONSE_HAS_ATTRIBUTES:
self.attributes: Tuple[str, ...] = (
"url",
"status",
"headers",
"body",
"request",
"flags",
"certificate",
)
if _RESPONSE_HAS_IP_ADDRESS:
self.attributes += ("ip_address",)
if _RESPONSE_HAS_PROTOCOL:
self.attributes += ("protocol",)
self.attributes += ("raw_api_response",)
def replace(self, *args, **kwargs):
if kwargs.get("raw_api_response"):
raise ValueError("Replacing the value of 'raw_api_response' isn't allowed.")
for attribute in self.attributes:
kwargs.setdefault(attribute, getattr(self, attribute))
cls = kwargs.pop("cls", self.__class__)
return cls(*args, **kwargs)
@property
def raw_api_response(self) -> Optional[Dict]:
"""Contains the raw API response from Zyte API.
For the full list of parameters, see :ref:`zapi-reference`.
"""
return self._raw_api_response
@staticmethod
def _response_cookie_to_header_value(cookie):
result = f"{cookie['name']}={cookie['value']}"
domain = cookie.get("domain")
if domain:
result += f"; Domain={cookie['domain']}"
path = cookie.get("path")
if path is not None:
result += f"; Path={path}"
expires = cookie.get("expires")
if expires is not None:
expires_date = datetime.utcfromtimestamp(expires)
expires_date_string = expires_date.strftime("%a, %d %b %Y %H:%M:%S GMT")
result += f"; Expires={expires_date_string}"
if cookie.get("httpOnly"):
result += "; HttpOnly"
if cookie.get("secure"):
result += "; Secure"
same_site = cookie.get("sameSite")
if same_site is not None:
result += f"; SameSite={same_site}"
return result
@classmethod
def _prepare_headers(cls, api_response: Dict[str, Any]):
result: Dict[str, List[str]] = {}
input_headers: Optional[List[Dict[str, str]]] = api_response.get(
"httpResponseHeaders"
)
response_cookies: Optional[List[Dict[str, str]]] = api_response.get(
"experimental", {}
).get("responseCookies")
if input_headers:
headers_to_remove = copy(cls.REMOVE_HEADERS)
if response_cookies:
headers_to_remove.add("set-cookie")
result = {
h["name"]: [h["value"]]
for h in input_headers
if h["name"].lower() not in headers_to_remove
}
if response_cookies:
result["Set-Cookie"] = []
for cookie in response_cookies:
result["Set-Cookie"].append(
cls._response_cookie_to_header_value(cookie)
)
return result
[docs]
class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse):
@classmethod
def from_api_response(
cls, api_response: Dict, *, request: Optional[Request] = None
):
"""Alternative constructor to instantiate the response from the raw
Zyte API response.
"""
body = None
encoding = None
if api_response.get("browserHtml"):
encoding = _DEFAULT_ENCODING # Zyte API has "utf-8" by default
body = api_response["browserHtml"].encode(encoding)
elif api_response.get("httpResponseBody"):
body = b64decode(api_response["httpResponseBody"])
return cls(
url=api_response["url"],
status=api_response.get("statusCode") or 200,
body=body,
encoding=encoding,
request=request,
flags=["zyte-api"],
headers=cls._prepare_headers(api_response),
raw_api_response=api_response,
)
def replace(self, *args, **kwargs):
kwargs.setdefault("encoding", self.encoding)
return ZyteAPIMixin.replace(self, *args, **kwargs)
[docs]
class ZyteAPIResponse(ZyteAPIMixin, Response):
@classmethod
def from_api_response(
cls, api_response: Dict, *, request: Optional[Request] = None
):
"""Alternative constructor to instantiate the response from the raw
Zyte API response.
"""
return cls(
url=api_response["url"],
status=api_response.get("statusCode") or 200,
body=b64decode(api_response.get("httpResponseBody") or ""),
request=request,
flags=["zyte-api"],
headers=cls._prepare_headers(api_response),
raw_api_response=api_response,
)
_IMMUTABLE_JSON = Union[None, str, int, float, bool]
_JSON = Union[
None, str, int, float, bool, List["_JSON"], Dict[_IMMUTABLE_JSON, "_JSON"]
]
_API_RESPONSE = Dict[str, _JSON]
def _process_response(
api_response: _API_RESPONSE,
request: Request,
cookie_jars: Optional[Dict[Any, CookieJar]],
) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
"""Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending
on which if it can properly decode the HTTP Body or have access to browserHtml.
"""
# NOTES: Currently, Zyte API does NOT only allow both 'browserHtml' and
# 'httpResponseBody' to be present at the same time. The support for both
# will be addressed in the future. Reference:
# - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460
# For now, at least one of them should be present.
_process_cookies(api_response, request, cookie_jars)
if api_response.get("browserHtml"):
# Using TextResponse because browserHtml always returns a browser-rendered page
# even when requesting files (like images)
return ZyteAPITextResponse.from_api_response(api_response, request=request)
if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"):
# a plain dict here doesn't work correctly on Scrapy < 2.1
scrapy_headers = Headers()
for header in cast(List[Dict[str, str]], api_response["httpResponseHeaders"]):
scrapy_headers[header["name"].encode()] = header["value"].encode()
response_cls = responsetypes.from_args(
headers=scrapy_headers,
url=cast(str, api_response["url"]),
# FIXME: update this when python-zyte-api supports base64 decoding
body=b64decode(api_response["httpResponseBody"]), # type: ignore
)
if issubclass(response_cls, TextResponse):
return ZyteAPITextResponse.from_api_response(api_response, request=request)
return ZyteAPIResponse.from_api_response(api_response, request=request)