diff --git a/CHANGES.rst b/CHANGES.rst index 53ed7f11..b0adbb88 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +TBD +--- + +* Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses + of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. + These new response classes hold the raw Zyte Data API response in the + ``raw_api_response`` attribute. + 0.1.0 (2022-02-03) ------------------ diff --git a/README.rst b/README.rst index 3ef63fc9..25b12288 100644 --- a/README.rst +++ b/README.rst @@ -33,8 +33,8 @@ Installation This package requires Python 3.7+. -How to configure ----------------- +Configuration +------------- Replace the default ``http`` and ``https`` in Scrapy's `DOWNLOAD_HANDLERS `_ @@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor `_ in the ``settings.py`` file as well: -Here's example of the things needed inside a Scrapy project's ``settings.py`` file: +Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: .. code-block:: python @@ -60,37 +60,83 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" -How to use ----------- +Usage +----- -Set the ``zyte_api`` `Request.meta -`_ -key to download a request using Zyte API. Full list of parameters is provided in the -`Zyte API Specification `_. +To enable a ``scrapy.Request`` to go through Zyte Data API, the ``zyte_api`` key in +`Request.meta `_ +must be present and have dict-like contents. -.. code-block:: python +To set the default parameters for Zyte API enabled requests, you can set the +following in the ``settings.py`` file or `any other settings within Scrapy +`_: - import scrapy +..
code-block:: python + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } - class TestSpider(scrapy.Spider): - name = "test" +You can see the full list of parameters in the `Zyte Data API Specification +`_. - def start_requests(self): +Note that ``ZYTE_API_DEFAULT_PARAMS`` only takes effect if the ``zyte_api`` +key in `Request.meta `_ +is set. Parameters given in that key override the ones set in the +``ZYTE_API_DEFAULT_PARAMS`` setting. - yield scrapy.Request( - url="http://books.toscrape.com/", - callback=self.parse, - meta={ - "zyte_api": { - "browserHtml": True, - # You can set any GEOLocation region you want. - "geolocation": "US", - "javascript": True, - "echoData": {"something": True}, - } - }, - ) +.. code-block:: python - def parse(self, response): - yield {"URL": response.url, "status": response.status, "HTML": response.body} + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + custom_settings = { + "ZYTE_API_DEFAULT_PARAMS": { + "geolocation": "US", # You can set any Geolocation region you want. + } + } + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + callback=self.parse, + meta={ + "zyte_api": { + "browserHtml": True, + "javascript": True, + "echoData": {"some_value_I_could_track": 123}, + } + }, + ) + + def parse(self, response): + yield {"URL": response.url, "status": response.status, "HTML": response.body} + + print(response.raw_api_response) + # { + # 'url': 'https://quotes.toscrape.com/', + # 'browserHtml': ' ...
', + # 'echoData': {'some_value_I_could_track': 123}, + # } + + print(response.request.meta) + # { + # 'zyte_api': { + # 'browserHtml': True, + # 'geolocation': 'US', + # 'javascript': True, + # 'echoData': {'some_value_I_could_track': 123} + # }, + # 'download_timeout': 180.0, + # 'download_slot': 'quotes.toscrape.com' + # } + +The raw Zyte Data API response can be accessed via the ``raw_api_response`` attribute +of the response object. Note that such responses are of ``ZyteAPIResponse`` and +``ZyteAPITextResponse`` types, which are respectively subclasses of ``scrapy.http.Response`` +and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte Data API +responses. diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 92035c41..e192d5c5 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,14 +1,13 @@ import json import logging import os -from base64 import b64decode -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Dict, Generator, Optional, Union from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest, NotConfigured -from scrapy.http import Request, Response, TextResponse +from scrapy.http import Request from scrapy.settings import Settings from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import verify_installed_reactor @@ -16,6 +15,8 @@ from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.errors import RequestError +from .responses import ZyteAPIResponse, ZyteAPITextResponse, _process_response + logger = logging.getLogger(__name__) @@ -30,8 +31,8 @@ def __init__( ) self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") + self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session() - self._encoding = "utf-8" @classmethod def 
from_crawler(cls, crawler):
@@ -48,19 +49,48 @@ def from_crawler(cls, crawler):
         return cls(crawler.settings, crawler, client)
 
     def download_request(self, request: Request, spider: Spider) -> Deferred:
-        if request.meta.get("zyte_api"):
-            return deferred_from_coro(self._download_request(request, spider))
-        else:
-            return super().download_request(request, spider)
+        api_params = self._prepare_api_params(request)
+        if api_params:
+            return deferred_from_coro(
+                self._download_request(api_params, request, spider)
+            )
+        return super().download_request(request, spider)
+
+    def _prepare_api_params(self, request: Request) -> Optional[dict]:
+        """Return the Zyte API parameters to send for ``request``.
+
+        ``None`` is returned when the ``zyte_api`` meta key is missing or has
+        a falsy value other than ``{}``, so that the caller can fall back to the
+        regular HTTP download handler. Otherwise, the handler-wide default
+        parameters are copied and overridden with the ones from the request
+        meta (``True`` means "use the defaults as they are").
+
+        Raises ``IgnoreRequest`` if the meta value cannot be merged as a dict.
+        """
+        meta_params = request.meta.get("zyte_api")
+        if not meta_params and meta_params != {}:
+            return None
+
+        if meta_params is True:
+            meta_params = {}
 
-    async def _download_request(self, request: Request, spider: Spider) -> Response:
-        api_params: Dict[str, Any] = request.meta["zyte_api"]
-        if not isinstance(api_params, dict):
+        # Merge into a copy: updating the shared defaults in place would leak
+        # the parameters of this request into every later request.
+        api_params: Dict[str, Any] = dict(self._zyte_api_default_params or {})
+        try:
+            api_params.update(meta_params)
+        except (TypeError, ValueError):
             logger.error(
-                "zyte_api parameters in the request meta should be "
-                f"provided as dictionary, got {type(api_params)} instead ({request.url})."
+                f"zyte_api parameters in the request meta should be "
+                f"provided as dictionary, got {type(request.meta.get('zyte_api'))} "
+                f"instead ({request.url})."
             )
             raise IgnoreRequest()
+        return api_params
+
+    async def _download_request(
+        self, api_params: dict, request: Request, spider: Spider
+    ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
         # Define url by default
         api_data = {**{"url": request.url}, **api_params}
         if self._job_id is not None:
@@ -80,31 +110,9 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: f"Got an error when processing Zyte API request ({request.url}): {er}" ) raise IgnoreRequest() + self._stats.inc_value("scrapy-zyte-api/request_count") - headers = self._prepare_headers(api_response.get("httpResponseHeaders")) - # browserHtml and httpResponseBody are not allowed at the same time, - # but at least one of them should be present - if api_response.get("browserHtml"): - # Using TextResponse because browserHtml always returns a browser-rendered page - # even when requesting files (like images) - return TextResponse( - url=api_response["url"], - status=200, - body=api_response["browserHtml"].encode(self._encoding), - encoding=self._encoding, - request=request, - flags=["zyte-api"], - headers=headers, - ) - else: - return Response( - url=api_response["url"], - status=200, - body=b64decode(api_response["httpResponseBody"]), - request=request, - flags=["zyte-api"], - headers=headers, - ) + return _process_response(api_response, request) @inlineCallbacks def close(self) -> Generator: @@ -129,9 +137,3 @@ def _get_request_error_message(error: RequestError) -> str: if error_data.get("detail"): return error_data["detail"] return base_message - - @staticmethod - def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): - if not init_headers: - return None - return {h["name"]: h["value"] for h in init_headers} diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py new file mode 100644 index 00000000..ab8ef992 --- /dev/null +++ b/scrapy_zyte_api/responses.py @@ -0,0 +1,128 @@ +from base64 import b64decode +from typing import Dict, List,
Optional, Tuple, Union
+
+from scrapy import Request
+from scrapy.http import Headers, Response, TextResponse
+from scrapy.responsetypes import responsetypes
+
+_DEFAULT_ENCODING = "utf-8"
+
+
+class ZyteAPIMixin:
+
+    REMOVE_HEADERS = {
+        # Zyte API already decompresses the HTTP Response Body. Scrapy's
+        # HttpCompressionMiddleware will error out when it attempts to
+        # decompress an already decompressed body based on this header.
+        "content-encoding"
+    }
+
+    def __init__(self, *args, raw_api_response: Dict = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._raw_api_response = raw_api_response
+
+    def replace(self, *args, **kwargs):
+        if kwargs.get("raw_api_response"):
+            raise ValueError("Replacing the value of 'raw_api_response' isn't allowed.")
+        return super().replace(*args, **kwargs)
+
+    @property
+    def raw_api_response(self) -> Optional[Dict]:
+        """Contains the raw API response from Zyte API.
+
+        To see the full list of parameters and their description, kindly refer to the
+        `Zyte API Specification `_.
+        """
+        return self._raw_api_response
+
+    @classmethod
+    def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]):
+        if not init_headers:
+            return None
+        return {
+            h["name"]: h["value"]
+            for h in init_headers
+            if h["name"].lower() not in cls.REMOVE_HEADERS
+        }
+
+
+class ZyteAPITextResponse(ZyteAPIMixin, TextResponse):
+
+    attributes: Tuple[str, ...] = TextResponse.attributes + ("raw_api_response",)
+
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response.
+        """
+        body = None
+        encoding = None
+
+        if api_response.get("browserHtml"):
+            encoding = _DEFAULT_ENCODING  # Zyte API has "utf-8" by default
+            body = api_response["browserHtml"].encode(encoding)
+        elif api_response.get("httpResponseBody"):
+            body = b64decode(api_response["httpResponseBody"])
+
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=body,
+            encoding=encoding,
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            raw_api_response=api_response,
+        )
+
+
+class ZyteAPIResponse(ZyteAPIMixin, Response):
+
+    attributes: Tuple[str, ...] = Response.attributes + ("raw_api_response",)
+
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response.
+        """
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=b64decode(api_response.get("httpResponseBody") or ""),
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            raw_api_response=api_response,
+        )
+
+
+def _process_response(
+    api_response: Dict[str, Union[List[Dict], str]], request: Request
+) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
+    """Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
+    this returns a ``ZyteAPITextResponse`` when browserHtml is present or the
+    HTTP body is identified as text, and a ``ZyteAPIResponse`` otherwise.
+    """
+    # NOTE: Currently, Zyte API does NOT allow both 'browserHtml' and
+    # 'httpResponseBody' to be present at the same time, but at least one of
+    # them should be present. Support for both will be addressed later. Reference:
+    # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460
+
+    if api_response.get("browserHtml"):
+        # Using TextResponse because browserHtml always returns a browser-rendered page
+        # even when requesting files (like images)
+        return ZyteAPITextResponse.from_api_response(api_response, request=request)
+
+    raw_headers = api_response.get("httpResponseHeaders")
+    if raw_headers and api_response.get("httpResponseBody"):
+        response_cls = responsetypes.from_args(
+            # Wrap in ``Headers``: responsetypes can't look names up in a raw list
+            headers=Headers(ZyteAPIMixin._prepare_headers(raw_headers)),  # type: ignore
+            url=api_response["url"],
+            # FIXME: update this when python-zyte-api supports base64 decoding
+            body=b64decode(api_response["httpResponseBody"]),  # type: ignore
+        )
+        if issubclass(response_cls, TextResponse):
+            return ZyteAPITextResponse.from_api_response(api_response, request=request)
+
+    return ZyteAPIResponse.from_api_response(api_response, request=request)
diff --git a/tests/mockserver.py b/tests/mockserver.py index 6ef5f5a4..e202860b 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -45,7 +45,7 @@ def do_POST(self): # NOQA {"name": "test_header", "value": "test_value"} ] if post_data.get("jobId") is None: - browser_html = "" + browser_html = "Hello

World!

" else: browser_html = f"{post_data['jobId']}" if post_data.get("browserHtml"): diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index f6cca9e7..84ddb342 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,12 +1,15 @@ import os +import sys from asyncio import iscoroutine from typing import Any, Dict +from unittest import mock import pytest from _pytest.logging import LogCaptureFixture # NOQA from scrapy import Request, Spider -from scrapy.exceptions import IgnoreRequest, NotConfigured +from scrapy.exceptions import IgnoreRequest, NotConfigured, NotSupported from scrapy.http import Response, TextResponse +from scrapy.utils.defer import deferred_to_future from scrapy.utils.test import get_crawler from twisted.internet.asyncioreactor import install as install_asyncio_reactor from twisted.internet.defer import Deferred @@ -23,6 +26,23 @@ class TestAPI: + @staticmethod + async def produce_request_response(meta, custom_settings=None): + with MockServer() as server: + async with make_handler(custom_settings, server.urljoin("/")) as handler: + req = Request( + "http://example.com", + method="POST", + meta=meta, + ) + coro_or_deferred = handler.download_request(req, None) + if iscoroutine(coro_or_deferred): + resp = await coro_or_deferred # type: ignore + else: + resp = await deferred_to_future(coro_or_deferred) + + return req, resp + @pytest.mark.parametrize( "meta", [ @@ -34,25 +54,16 @@ class TestAPI: ) @pytest.mark.asyncio async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # NOQA - - assert isinstance(resp, TextResponse) - assert resp.request is req - assert resp.url == req.url - assert 
resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.text == "" + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, TextResponse) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"Hello

World!

" + assert resp.text == "Hello

World!

" + assert resp.css("h1 ::text").get() == "World!" + assert resp.xpath("//body/text()").getall() == ["Hello"] @pytest.mark.parametrize( "meta", @@ -71,24 +82,18 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): ) @pytest.mark.asyncio async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # NOQA + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, Response) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"Hello

World!

" - assert isinstance(resp, Response) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" + with pytest.raises(NotSupported): + assert resp.css("h1 ::text").get() == "World!" + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()").getall() == ["Hello"] @pytest.mark.parametrize( "meta", @@ -99,24 +104,93 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) ) @pytest.mark.asyncio async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # NOQA + req, resp = await self.produce_request_response(meta) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"Hello

World!

" + assert resp.headers == {b"Test_Header": [b"test_value"]} - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.skipif( + sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks" + ) + @pytest.mark.parametrize( + "meta,custom_settings,expected,use_zyte_api", + [ + ({}, {}, {}, False), + ({"zyte_api": {}}, {}, {}, False), + ({"zyte_api": True}, {}, {}, False), + ({"zyte_api": False}, {}, {}, False), + ( + {}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": False}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {}, + False, + ), + ( + {"zyte_api": None}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {}, + False, + ), + ( + {"zyte_api": {}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": True}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + True, + ), + ], + ) + @mock.patch("tests.AsyncClient") + @pytest.mark.asyncio + async def test_zyte_api_request_meta( + self, + mock_client, + meta: Dict[str, Dict[str, Any]], + custom_settings: Dict[str, str], + expected: Dict[str, str], + use_zyte_api: bool, + ): + try: + # This would always error out since the mocked client doesn't + # return the expected API response. 
+ await self.produce_request_response(meta, custom_settings=custom_settings) + except Exception: + pass + + # What we're interested in is the Request call in the API + request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] + + if not use_zyte_api: + assert request_call == [] + return + + elif not request_call: + pytest.fail("The client's request_raw() method was not called.") + + args_used = request_call[0].args[0] + args_used.pop("url") + + assert args_used == expected @pytest.mark.parametrize( "meta, api_relevant", @@ -124,15 +198,18 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any ({"zyte_api": {"waka": True}}, True), ({"zyte_api": True}, True), ({"zyte_api": {"browserHtml": True}}, True), - ({"zyte_api": {}}, False), + ({"zyte_api": {}}, True), + ({"zyte_api": None}, False), ({"randomParameter": True}, False), ({}, False), + (None, False), ], ) @pytest.mark.asyncio async def test_coro_handling( self, meta: Dict[str, Dict[str, Any]], api_relevant: bool ): + custom_settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} with MockServer() as server: async with make_handler({}, server.urljoin("/")) as handler: req = Request( @@ -140,13 +217,14 @@ async def test_coro_handling( method="POST", meta=meta, ) + handler._zyte_api_default_params = custom_settings if api_relevant: coro = handler.download_request(req, Spider("test")) assert not iscoroutine(coro) assert isinstance(coro, Deferred) else: # Non-API requests won't get into handle, but run HTTPDownloadHandler.download_request instead - # But because they're Deffered - they won't run because event loop is closed + # But because they're Deferred - they won't run because event loop is closed with pytest.raises(RuntimeError, match="Event loop is closed"): handler.download_request(req, Spider("test")) @@ -160,13 +238,6 @@ async def test_coro_handling( "Got an error when processing Zyte API request (http://example.com): " "Object of type Request is not 
JSON serializable", ), - ( - {"zyte_api": True}, - "/", - IgnoreRequest, - "zyte_api parameters in the request meta should be provided as " - "dictionary, got instead (http://example.com)", - ), ( {"zyte_api": {"browserHtml": True}}, "/exception/", @@ -188,8 +259,12 @@ async def test_exceptions( with MockServer() as server: async with make_handler({}, server.urljoin(server_path)) as handler: req = Request("http://example.com", method="POST", meta=meta) + api_params = handler._prepare_api_params(req) + with pytest.raises(exception_type): # NOQA - await handler._download_request(req, Spider("test")) # NOQA + await handler._download_request( + api_params, req, Spider("test") + ) # NOQA assert exception_text in caplog.text @pytest.mark.parametrize( @@ -205,7 +280,10 @@ async def test_job_id(self, job_id): method="POST", meta={"zyte_api": {"browserHtml": True}}, ) - resp = await handler._download_request(req, Spider("test")) # NOQA + api_params = handler._prepare_api_params(req) + resp = await handler._download_request( + api_params, req, Spider("test") + ) # NOQA assert resp.request is req assert resp.url == req.url diff --git a/tests/test_responses.py b/tests/test_responses.py new file mode 100644 index 00000000..cc34553a --- /dev/null +++ b/tests/test_responses.py @@ -0,0 +1,385 @@ +from base64 import b64encode + +import pytest +from scrapy import Request +from scrapy.exceptions import NotSupported +from scrapy.http import Response, TextResponse + +from scrapy_zyte_api.responses import ( + ZyteAPIResponse, + ZyteAPITextResponse, + _process_response, +) + +PAGE_CONTENT = "The cake is a lie!" 
+URL = "https://example.com" + + +def raw_api_response_browser(): + return { + "url": URL, + "browserHtml": PAGE_CONTENT, + "javascript": True, + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + ], + } + + +def raw_api_response_body(): + return { + "url": "https://example.com", + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + ], + } + + +EXPECTED_HEADERS = {b"Content-Type": [b"text/html"], b"Content-Length": [b"44"]} +EXPECTED_BODY = PAGE_CONTENT.encode("utf-8") + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), + ], +) +def test_init(api_response, cls): + response = cls(URL, raw_api_response=api_response()) + assert response.raw_api_response == api_response() + + assert response.url == URL + assert response.status == 200 + assert not response.headers + assert response.body == b"" + assert not response.flags + assert response.request is None + assert response.certificate is None + assert response.ip_address is None + assert response.protocol is None + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), + ], +) +def test_text_from_api_response(api_response, cls): + response = cls.from_api_response(api_response()) + assert response.raw_api_response == api_response() + + assert response.url == URL + assert response.status == 200 + assert response.headers == EXPECTED_HEADERS + assert response.body == EXPECTED_BODY + assert response.flags == ["zyte-api"] + assert response.request is None + assert response.certificate is None + assert response.ip_address is None + 
assert response.protocol is None + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), + ], +) +def test_response_replace(api_response, cls): + orig_response = cls.from_api_response(api_response()) + + # It should still work the same way + new_response = orig_response.replace(status=404) + assert new_response.status == 404 + + new_response = orig_response.replace(url="https://new-example.com") + assert new_response.url == "https://new-example.com" + + # Ensure that the Zyte API response is intact + assert new_response.raw_api_response == api_response() + + new_raw_api_response = { + "url": "https://another-website.com", + "httpResponseHeaders": {"name": "Content-Type", "value": "application/json"}, + } + + # Attempting to replace the raw_api_response value would raise an error + with pytest.raises(ValueError): + orig_response.replace(raw_api_response=new_raw_api_response) + + +def test_non_utf8_response(): + content = "Some non-ASCII ✨ chars" + sample_raw_api_response = { + "url": URL, + "browserHtml": content, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html; charset=iso-8859-1"}, + {"name": "Content-Length", "value": len(content)}, + ], + } + + # Encoding inference should not kick in under the hood for + # ``scrapy.http.TextResponse`` since ``ZyteAPITextResponse`` using "utf-8" + # for it. This is the default encoding for the "browserHtml" contents from + # Zyte API. Thus, even if the Response Headers or tags indicate a + # different encoding, it should still be treated as "utf-8". + response = ZyteAPITextResponse.from_api_response(sample_raw_api_response) + assert response.text == content + assert response.encoding == "utf-8" + + +BODY = "Hello

World!✨

" + + +def format_to_httpResponseBody(body, encoding="utf-8"): + return b64encode(body.encode(encoding)).decode("utf-8") + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), + ], +) +def test_response_headers_removal(api_response, cls): + """Headers like 'Content-Encoding' should be removed later in the response + instance returned to Scrapy. + + However, it should still be present inside 'raw_api_response.headers'. + """ + additional_headers = [ + {"name": "Content-Encoding", "value": "gzip"}, + {"name": "X-Some-Other-Value", "value": "123"}, + ] + raw_response = api_response() + raw_response["httpResponseHeaders"] = additional_headers + + response = cls.from_api_response(raw_response) + + assert response.headers == {b"X-Some-Other-Value": [b"123"]} + assert ( + response.raw_api_response["httpResponseHeaders"] + == raw_response["httpResponseHeaders"] + ) + + +def test__process_response_no_body(): + """The _process_response() function should handle missing 'browserHtml' or + 'httpResponseBody'. + """ + api_response = {"url": "https://example.com", "product": {"name": "shoes"}} + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + assert resp.body == b"" + + +def test__process_response_body_only(): + """Having the Body but with no Headers won't allow us to decode the contents + with the proper encoding. + + Thus, we won't have access to css/xpath selectors. 
+ """ + encoding = "utf-8" + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + } + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + with pytest.raises(NotSupported): + assert resp.css("h1 ::text") + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()") + + +@pytest.mark.xfail(reason="encoding inference is not supported for now") +def test__process_response_body_only_infer_encoding(): + """The ``scrapy.TextResponse`` class has the ability to check the encoding + by inferring it in the HTML body. + + However, this is a bit tricky since we need to somehow ensure that the body + we're receiving is "text/html". We can't fully determine that without the + headers. + """ + encoding = "gb18030" + body = ( + "" + '' + "Some ✨ contents" + "" + ) + + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(body, encoding=encoding), + } + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("body ::text").get() == "Some ✨ contents" + assert resp.xpath("//body/text()").getall() == ["Some ✨ contents"] + + +@pytest.mark.parametrize( + "encoding,content_type", + [ + ("utf-8", "text/html; charset=UTF-8"), + ("gb18030", "text/html; charset=gb2312"), + ], +) +def test__process_response_body_and_headers(encoding, content_type): + """Having access to the Headers allow us to properly decode the contents + and will have access to the css/xpath selectors. 
+ """ + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + "httpResponseHeaders": [{"name": "Content-Type", "value": content_type}], + } + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() == "World!✨" + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == encoding + + +@pytest.mark.parametrize( + "body,expected,actual_encoding,inferred_encoding", + [ + ("plain", "plain", "cp1252", "cp1252"), + ( + "✨", + "✨", + "utf-8", + "utf-8", + ), + ( + "✨", + "✨", + "utf-16", + "utf-16-le", + ), + ( + """ + ✨""", + "✨", + "gb18030", + None, + ), + ], +) +def test__process_response_body_and_headers_but_no_encoding( + body, expected, actual_encoding, inferred_encoding +): + """Should both the body and headers are present but no 'Content-Type' encoding + can be derived, it should infer from the body contents. + """ + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(body, encoding=actual_encoding), + "httpResponseHeaders": [{"name": "X-Value", "value": "some_value"}], + } + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + + if inferred_encoding: + assert resp.css("body ::text").get() == expected + assert resp.xpath("//body/text()").get() == expected + assert resp.encoding == inferred_encoding + + # Scrapy's ``TextResponse`` built-in inference only works on "utf-8" and + # "Latin-1" based encodings. + else: + assert resp.css("body ::text").get() != expected + assert resp.xpath("//body/text()").get() != expected + assert resp.encoding == "ascii" + + +def test__process_response_body_and_headers_mismatch(): + """If the actual contents have a mismatch in terms of its encoding, we won't + properly decode the ✨ emoji. 
+ """ + encoding = "utf-8" + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html; charset=gb2312"} + ], + } + + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() != "World!✨" # mismatch + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == "gb18030" + + +def test__process_response_non_text(): + """Non-textual responses like images, files, etc. won't have access to the + css/xpath selectors. + """ + api_response = { + "url": "https://example.com/sprite.gif", + "httpResponseBody": b"", + "httpResponseHeaders": [ + { + "name": "Content-Type", + "value": "image/gif", + } + ], + } + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + with pytest.raises(NotSupported): + assert resp.css("h1 ::text") + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()") + + +@pytest.mark.parametrize( + "api_response", + [ + {"url": "https://example.com", "browserHtml": BODY}, + { + "url": "https://example.com", + "browserHtml": BODY, + "httpResponseHeaders": [ + { + "name": "Content-Type", + "value": "text/html; charset=UTF-8", + } + ], + }, + ], +) +def test__process_response_browserhtml(api_response): + resp = _process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() == "World!✨" + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == "utf-8" # Zyte API is consistent with this on browserHtml