From 2736224e7780a502cc613a82f2b1a1318c626916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Nov 2023 17:48:46 +0100 Subject: [PATCH 01/16] WIP --- CHANGES.rst | 19 +- README.rst | 46 ++- scrapy_zyte_api/_cookies.py | 15 +- scrapy_zyte_api/_params.py | 239 ++++++++----- scrapy_zyte_api/_request_fingerprinter.py | 3 + scrapy_zyte_api/responses.py | 10 +- tests/test_api_requests.py | 407 ++++++++++++++-------- 7 files changed, 479 insertions(+), 260 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8680a62b..23ca0eb6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,23 @@ Changes ======= +0.12.3 (unreleased) +------------------- + +* Cookie support is no longer experimental: + + * The ``responseCookies`` response parameter is now handled the same as + ``experimental.responseCookies``; the latter still works but is deprecated. + + * The ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` setting is now deprecated. + When enabled, however, the ``experimental`` name space is still used for + automatic request parameter mapping. + + * If the ``COOKIES_ENABLED`` setting is ``True`` (default), automatic request + parameter mapping now sets ``responseCookies`` to ``True`` and maps request + cookies to ``requestCookies``. + + 0.12.2 (2023-10-19) ------------------- @@ -36,7 +53,7 @@ Changes Experimental is treated as a namespace, and its parameters are the ones counted, i.e. there is no ``scrapy-zyte-api/request_args/experimental`` stat, but there are stats like - ``scrapy-zyte-api/request_args/experimental.responseCookies``. + ``scrapy-zyte-api/request_args/experimental.foo``. 0.11.1 (2023-08-25) diff --git a/README.rst b/README.rst index 533c43fb..a5f89e9f 100644 --- a/README.rst +++ b/README.rst @@ -323,10 +323,10 @@ possible: - ``statusCode`` becomes ``response.status``. -- ``httpResponseHeaders`` and ``experimental.responseCookies`` become +- ``httpResponseHeaders`` and ``responseCookies`` become ``response.headers``. -- ``experimental.responseCookies`` is also mapped into the request cookiejar. +- ``responseCookies`` is also mapped into the request cookiejar. - ``browserHtml`` and ``httpResponseBody`` are mapped into both ``response.text`` (``str``) and ``response.body`` (``bytes``). @@ -412,17 +412,15 @@ parameters are chosen as follows by default: - ``Request.body`` becomes ``httpRequestBody``. -- If the ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` Scrapy setting is - ``True``, the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and - provided request metadata does not set dont_merge_cookies_ to ``True``: +- If the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and provided + request metadata does not set dont_merge_cookies_ to ``True``: .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies - - ``experimental.responseCookies`` is set to ``True``. + - ``responseCookies`` is set to ``True``. - - Cookies from the request `cookie jar`_ become - ``experimental.requestCookies``. + - Cookies from the request `cookie jar`_ become ``requestCookies``. .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar @@ -434,10 +432,10 @@ parameters are chosen as follows by default: If the cookies to be set exceed the limit defined in the ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, and only as many cookies as the limit allows are set for the target - request. To silence this warning, set ``experimental.requestCookies`` - manually, e.g. to an empty dict. Alternatively, if Zyte API starts - supporting more than 100 request cookies, update the - ``ZYTE_API_MAX_COOKIES`` setting accordingly. + request. To silence this warning, set ``requestCookies`` manually, e.g. + to an empty dict. Alternatively, if Zyte API starts supporting more + than 100 request cookies, update the ``ZYTE_API_MAX_COOKIES`` setting + accordingly. If you are using a custom downloader middleware to handle request cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to @@ -511,20 +509,18 @@ following parameters: "value": "application/json" } ], - "experimental": { - "requestCookies": [ - { - "name": "a", - "value": "b", - "domain": "" - } - ], - "responseCookies": true - }, "httpResponseBody": true, "httpResponseHeaders": true, "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", "httpRequestMethod": "POST", + "requestCookies": [ + { + "name": "a", + "value": "b", + "domain": "" + } + ], + "responseCookies": true, "url": "https://httpbin.org/anything" } @@ -553,10 +549,8 @@ following parameters: { "browserHtml": true, - "experimental": { - "responseCookies": true - }, "requestHeaders": {"referer": "https://example.com/"}, + "responseCookies": true, "url": "https://quotes.toscrape.com" } @@ -815,7 +809,7 @@ The following Zyte API parameters are *not* taken into account for request fingerprinting: - Request header parameters (``customHttpRequestHeaders``, - ``requestHeaders``) + ``requestHeaders``, ``requestCookies``) - Metadata parameters (``echoData``, ``jobId``) diff --git a/scrapy_zyte_api/_cookies.py b/scrapy_zyte_api/_cookies.py index 60ae268b..df1eea70 100644 --- a/scrapy_zyte_api/_cookies.py +++ b/scrapy_zyte_api/_cookies.py @@ -1,6 +1,7 @@ from http.cookiejar import Cookie from typing import Any, Dict, List, Optional from urllib.parse import urlparse +from warnings import warn from scrapy.http import Request from scrapy.http.cookies import CookieJar @@ -30,9 +31,21 @@ def _process_cookies( ): if not cookie_jars: return - response_cookies = api_response.get("experimental", {}).get("responseCookies") + old_response_cookies = api_response.get("experimental", {}).get("responseCookies") + response_cookies = api_response.get("responseCookies", old_response_cookies) if not response_cookies: return + if old_response_cookies: + warn( + "Found experimental.responseCookies in a Zyte API response. The " + "experimental name space is deprecated for that parameter. " + "Please, make sure you are no longer using the deprecated " + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting, and remove the " + "experimental name space from the responseCookies and " + "requestCookies parameters in your code, both when building " + "requests and when parsing responses.", + DeprecationWarning, + ) cookie_jar = _get_cookie_jar(request, cookie_jars) for response_cookie in response_cookies: rest = {} diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 00e06351..38f01fcc 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -204,11 +204,17 @@ def _set_http_response_headers_from_request( def _set_http_response_cookies_from_request( *, api_params: Dict[str, Any], + experimental: bool, ): - api_params.setdefault("experimental", {}) - api_params["experimental"].setdefault("responseCookies", True) - if api_params["experimental"]["responseCookies"] is False: - del api_params["experimental"]["responseCookies"] + if not experimental: + api_params.setdefault("responseCookies", True) + if api_params["responseCookies"] is False: + del api_params["responseCookies"] + else: + api_params.setdefault("experimental", {}) + api_params["experimental"].setdefault("responseCookies", True) + if api_params["experimental"]["responseCookies"] is False: + del api_params["experimental"]["responseCookies"] def _set_http_request_cookies_from_request( @@ -217,61 +223,117 @@ def _set_http_request_cookies_from_request( request: Request, cookie_jars: Dict[Any, CookieJar], max_cookies: int, + experimental: bool, ): - api_params.setdefault("experimental", {}) - if "requestCookies" in api_params["experimental"]: - request_cookies = api_params["experimental"]["requestCookies"] - if request_cookies is False: - del api_params["experimental"]["requestCookies"] - elif not request_cookies and isinstance(request_cookies, list): + if not experimental: + if "requestCookies" in api_params: + request_cookies = api_params["requestCookies"] + if request_cookies is False: + del api_params["requestCookies"] + elif not request_cookies and isinstance(request_cookies, list): + logger.warning( + ( + "Request %(request)r is overriding automatic request " + "cookie mapping by explicitly setting " + "requestCookies to []. If this was your intention, " + "please use False instead of []. Otherwise, stop " + "defining requestCookies in your request to let " + "automatic mapping work." + ), + { + "request": request, + }, + ) + return + output_cookies = [] + input_cookies = _get_all_cookies(request, cookie_jars) + input_cookie_count = len(input_cookies) + if input_cookie_count > max_cookies: logger.warning( ( - "Request %(request)r is overriding automatic request " - "cookie mapping by explicitly setting " - "experimental.requestCookies to []. If this was your " - "intention, please use False instead of []. Otherwise, " - "stop defining experimental.requestCookies in your " - "request to let automatic mapping work." + "Request %(request)r would get %(count)r cookies, but " + "request cookie automatic mapping is limited to %(max)r " + "cookies (see the ZYTE_API_MAX_COOKIES setting), so only " + "%(max)r cookies have been added to this request. To " + "silence this warning, set the request cookies manually " + "through the requestCookies Zyte API parameter instead. " + "Alternatively, if Zyte API starts supporting more than " + "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " + "setting accordingly." ), { "request": request, + "count": input_cookie_count, + "max": max_cookies, }, ) - return - output_cookies = [] - input_cookies = _get_all_cookies(request, cookie_jars) - input_cookie_count = len(input_cookies) - if input_cookie_count > max_cookies: - logger.warning( - ( - "Request %(request)r would get %(count)r cookies, but request " - "cookie automatic mapping is limited to %(max)r cookies " - "(see the ZYTE_API_MAX_COOKIES setting), so only %(max)r " - "cookies have been added to this request. To silence this " - "warning, set the request cookies manually through the " - "experimental.requestCookies Zyte API parameter instead. " - "Alternatively, if Zyte API starts supporting more than " - "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " - "setting accordingly." - ), - { - "request": request, - "count": input_cookie_count, - "max": max_cookies, - }, - ) - input_cookies = input_cookies[:max_cookies] - for input_cookie in input_cookies: - output_cookie = { - "name": input_cookie.name, - "value": input_cookie.value, - "domain": input_cookie.domain, - } - if input_cookie.path_specified: - output_cookie["path"] = input_cookie.path - output_cookies.append(output_cookie) - if output_cookies: - api_params["experimental"]["requestCookies"] = output_cookies + input_cookies = input_cookies[:max_cookies] + for input_cookie in input_cookies: + output_cookie = { + "name": input_cookie.name, + "value": input_cookie.value, + "domain": input_cookie.domain, + } + if input_cookie.path_specified: + output_cookie["path"] = input_cookie.path + output_cookies.append(output_cookie) + if output_cookies: + api_params["requestCookies"] = output_cookies + else: + api_params.setdefault("experimental", {}) + if "requestCookies" in api_params["experimental"]: + request_cookies = api_params["experimental"]["requestCookies"] + if request_cookies is False: + del api_params["experimental"]["requestCookies"] + elif not request_cookies and isinstance(request_cookies, list): + logger.warning( + ( + "Request %(request)r is overriding automatic request " + "cookie mapping by explicitly setting " + "experimental.requestCookies to []. If this was your " + "intention, please use False instead of []. Otherwise, " + "stop defining experimental.requestCookies in your " + "request to let automatic mapping work." + ), + { + "request": request, + }, + ) + return + output_cookies = [] + input_cookies = _get_all_cookies(request, cookie_jars) + input_cookie_count = len(input_cookies) + if input_cookie_count > max_cookies: + logger.warning( + ( + "Request %(request)r would get %(count)r cookies, but request " + "cookie automatic mapping is limited to %(max)r cookies " + "(see the ZYTE_API_MAX_COOKIES setting), so only %(max)r " + "cookies have been added to this request. To silence this " + "warning, set the request cookies manually through the " + "experimental.requestCookies Zyte API parameter instead. " + "Alternatively, if Zyte API starts supporting more than " + "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " + "setting accordingly." + ), + { + "request": request, + "count": input_cookie_count, + "max": max_cookies, + }, + ) + input_cookies = input_cookies[:max_cookies] + for input_cookie in input_cookies: + output_cookie = { + "name": input_cookie.name, + "value": input_cookie.value, + "domain": input_cookie.domain, + } + if input_cookie.path_specified: + output_cookie["path"] = input_cookie.path + output_cookies.append(output_cookie) + if output_cookies: + api_params["experimental"]["requestCookies"] = output_cookies def _set_http_request_method_from_request( @@ -348,6 +410,7 @@ def _update_api_params_from_request( cookies_enabled: bool, cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, + experimental_cookies: bool, ): _set_http_response_body_from_request(api_params=api_params, request=request) _set_http_response_headers_from_request( @@ -365,14 +428,17 @@ def _update_api_params_from_request( _set_http_request_body_from_request(api_params=api_params, request=request) if cookies_enabled: assert cookie_jars is not None # typing - _set_http_response_cookies_from_request(api_params=api_params) + _set_http_response_cookies_from_request( + api_params=api_params, experimental=experimental_cookies + ) _set_http_request_cookies_from_request( api_params=api_params, request=request, cookie_jars=cookie_jars, max_cookies=max_cookies, + experimental=experimental_cookies, ) - if not api_params["experimental"]: + if experimental_cookies and not api_params["experimental"]: del api_params["experimental"] _unset_unneeded_api_params( api_params=api_params, request=request, default_params=default_params @@ -478,6 +544,7 @@ def _get_automap_params( cookies_enabled: bool, cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, + experimental_cookies: bool, ): meta_params = request.meta.get("zyte_api_automap", default_enabled) if meta_params is False: @@ -507,6 +574,7 @@ def _get_automap_params( cookies_enabled=cookies_enabled, cookie_jars=cookie_jars, max_cookies=max_cookies, + experimental_cookies=experimental_cookies, ) return params @@ -524,6 +592,7 @@ def _get_api_params( cookies_enabled: bool, cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, + experimental_cookies: bool, ) -> Optional[dict]: """Returns a dictionary of API parameters that must be sent to Zyte API for the specified request, or None if the request should not be sent through @@ -539,6 +608,7 @@ def _get_api_params( cookies_enabled=cookies_enabled, cookie_jars=cookie_jars, max_cookies=max_cookies, + experimental_cookies=experimental_cookies, ) if api_params is None: return None @@ -597,19 +667,38 @@ def __init__(self, crawler, cookies_enabled=None): self._job_id = environ.get("SHUB_JOBKEY", None) self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) self._skip_headers = _load_skip_headers(settings) - self._warn_on_cookies = False + self._experimental_cookies = settings.getbool( + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED" + ) + if self._experimental_cookies: + warn( + "The deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting " + "is set to True. Please, remove the deprecated " + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting, and remove " + "the experimental name space from the responseCookies and " + "requestCookies parameters in your code (if any), both when " + "building requests and when parsing responses.", + DeprecationWarning, + ) if cookies_enabled is not None: self._cookies_enabled = cookies_enabled - elif settings.getbool("ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED") is True: + else: self._cookies_enabled = settings.getbool("COOKIES_ENABLED") - if not self._cookies_enabled: + if not self._cookies_enabled and self._experimental_cookies: logger.warning( - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, but it " - "will have no effect because COOKIES_ENABLED is False." + "The deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED " + "setting is True, but it will have no effect because the " + "COOKIES_ENABLED setting is False. To silence this " + "warning, remove the deprecated " + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting. To enable " + "automatic cookie mapping, set COOKIES_ENABLED to True. " + "Please, consider removing the deprecated " + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting, and " + "removing the experimental name space from the " + "responseCookies and requestCookies parameters in your " + "code (if any), both when building requests and when " + "parsing responses.", ) - else: - self._cookies_enabled = False - self._warn_on_cookies = settings.getbool("COOKIES_ENABLED") self._max_cookies = settings.getint("ZYTE_API_MAX_COOKIES", 100) self._crawler = crawler self._cookie_jars = None @@ -629,32 +718,6 @@ def parse(self, request): cookies_enabled=cookies_enabled, cookie_jars=self._cookie_jars, max_cookies=self._max_cookies, + experimental_cookies=self._experimental_cookies, ) - if not dont_merge_cookies and self._warn_on_cookies: - self._handle_warn_on_cookies(request, params) return params - - def _handle_warn_on_cookies(self, request, params): - if params and params.get("experimental", {}).get("requestCookies") is not None: - return - if self._cookie_jars is None: - return - input_cookies = _get_all_cookies(request, self._cookie_jars) - if len(input_cookies) <= 0: - return - logger.warning( - ( - "Cookies are enabled for request %(request)r, and there are " - "cookies in the cookiejar, but " - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is False, so automatic " - "mapping will not map cookies for this or any other request. " - "To silence this warning, disable cookies for all requests " - "that use automated mapping, either with the " - "COOKIES_ENABLED setting or with the dont_merge_cookies " - "request metadata key." - ), - { - "request": request, - }, - ) - self._warn_on_cookies = False diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py index 6523df2b..7e189ce9 100644 --- a/scrapy_zyte_api/_request_fingerprinter.py +++ b/scrapy_zyte_api/_request_fingerprinter.py @@ -37,10 +37,13 @@ def __init__(self, crawler): self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary() self._param_parser = _ParamParser(crawler, cookies_enabled=False) self._skip_keys = ( + "cookieManagement", "customHttpRequestHeaders", "echoData", "jobId", + "requestCookies", "requestHeaders", + "responseCookies", "experimental", ) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 18b63401..fbbf16e4 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -96,9 +96,14 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): for h in input_headers if h["name"].lower() not in cls.REMOVE_HEADERS } - input_cookies: Optional[List[Dict[str, str]]] = api_response.get( + old_input_cookies: Optional[List[Dict[str, str]]] = api_response.get( "experimental", {} ).get("responseCookies") + input_cookies: Optional[List[Dict[str, str]]] = api_response.get( + "responseCookies", old_input_cookies + ) + # Note: We do not warn about deprecated experimental cookie use because + # _process_cookies is called earlier and already takes care of that. if input_cookies: result["Set-Cookie"] = [] for cookie in input_cookies: @@ -173,11 +178,10 @@ def _process_response( on which if it can properly decode the HTTP Body or have access to browserHtml. """ - # NOTES: Currently, Zyte API does NOT only allow both 'browserHtml' and + # NOTES: Currently, Zyte API does NOT allow both 'browserHtml' and # 'httpResponseBody' to be present at the same time. The support for both # will be addressed in the future. Reference: # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460 - # For now, at least one of them should be present. _process_cookies(api_response, request, cookie_jars) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 53b9de9b..764f99f8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -273,6 +273,7 @@ async def parse(self, response): JOB_ID = None COOKIES_ENABLED = False MAX_COOKIES = 100 +EXPERIMENTAL_COOKIES = True GET_API_PARAMS_KWARGS = { "default_params": DEFAULT_PARAMS, "transparent_mode": TRANSPARENT_MODE, @@ -282,6 +283,7 @@ async def parse(self, response): "job_id": JOB_ID, "cookies_enabled": COOKIES_ENABLED, "max_cookies": MAX_COOKIES, + "experimental_cookies": EXPERIMENTAL_COOKIES, } @@ -314,6 +316,7 @@ async def test_param_parser_input_custom(mockserver): assert parser._max_cookies == 1 assert parser._skip_headers == {b"a"} assert parser._transparent_mode is True + assert parser._experimental_cookies is True @ensureDeferred @@ -1633,8 +1636,11 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "settings,cookies,meta,params,expected,warnings,cookie_jar", [ # Cookies, both for requests and for responses, are enabled based on - # both ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (default: False) and # COOKIES_ENABLED (default: True). + # + # ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (deprecated, default: False) + # forces the experimental name space to be used when enabled, and + # triggers a deprecation warning. *( ( settings, @@ -1645,34 +1651,37 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "httpResponseBody": True, "httpResponseHeaders": True, }, - setup_warnings - or ( - run_time_warnings - if cast(Dict, settings).get("COOKIES_ENABLED", True) - else [] - ), + warnings, [], ) - for input_cookies, run_time_warnings in ( + for input_cookies in ( + REQUEST_INPUT_COOKIES_EMPTY, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + ) + for settings, warnings in ( ( - REQUEST_INPUT_COOKIES_EMPTY, + {}, [], ), ( - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - [ - "there are cookies in the cookiejar, but ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is False", - ], + { + "COOKIES_ENABLED": True, + }, + [], ), - ) - for settings, setup_warnings in ( ( - {}, - [], + { + "COOKIES_ENABLED": True, + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], ), ( { "COOKIES_ENABLED": True, + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": False, }, [], ), @@ -1688,7 +1697,8 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, }, [ - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, but it will have no effect because COOKIES_ENABLED is False.", + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "will have no effect", ], ), ( @@ -1700,6 +1710,8 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ), ) ), + # When COOKIES_ENABLED is True, responseCookies is set to True, and + # requestCookies is filled automatically if there are cookies. *( ( settings, @@ -1709,10 +1721,8 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "httpResponseBody": True, "httpResponseHeaders": True, - "experimental": { - "responseCookies": True, - **cast(Dict, output_cookies), - }, + "responseCookies": True, + **cast(Dict, output_cookies), }, [], [], @@ -1728,83 +1738,57 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ), ) for settings in ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - { - "COOKIES_ENABLED": True, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, + {}, + {"COOKIES_ENABLED": True}, ) ), - # Do not warn about request cookies not being mapped if cookies are - # manually set. + # When ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is also True, + # responseCookies and requestCookies are defined within the + # experimental name space, and a deprecation warning is issued. *( ( settings, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, + input_cookies, + {}, {}, - { - "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - } - }, { "httpResponseBody": True, "httpResponseHeaders": True, "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + **cast(Dict, output_cookies), }, }, - [], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], [], ) + for input_cookies, output_cookies in ( + ( + REQUEST_INPUT_COOKIES_EMPTY, + {}, + ), + ( + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {"requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL}, + ), + ) for settings in ( - {}, + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, { "COOKIES_ENABLED": True, + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, }, ) ), # dont_merge_cookies=True on request metadata disables cookies. ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_EMPTY, - { - "dont_merge_cookies": True, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - { - "dont_merge_cookies": True, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - [], - ), - # Do not warn about request cookies not being mapped if - # dont_merge_cookies=True is set on request metadata. - *( ( settings, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, + input_cookies, { "dont_merge_cookies": True, }, @@ -1813,80 +1797,221 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "httpResponseBody": True, "httpResponseHeaders": True, }, + warnings, [], - [ + ) + for input_cookies in ( + REQUEST_INPUT_COOKIES_EMPTY, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + ) + for settings, warnings in ( + ( + {}, + [], + ), + ( { - "name": "foo", - "value": "bar", - "domain": "example.com", - } - ], + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), ) - for settings in ( + ), + # Cookies can be disabled setting the corresponding Zyte API parameter + # to False. + # + # By default, setting experimental parameters to False has no effect. + # If ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, then only + # experimental parameters are taken into account instead. + *( + ( + settings, + input_cookies, {}, + input_params, { - "COOKIES_ENABLED": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + **output_params, }, + warnings, + [], + ) + for settings, input_cookies, input_params, output_params, warnings in ( + # No cookies, responseCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "responseCookies": False, + }, + {}, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "responseCookies": False, + } + }, + { + "responseCookies": True, + "experimental": { + "responseCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "responseCookies": False, + }, + { + "responseCookies": False, + "experimental": { + "responseCookies": False, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "responseCookies": False, + } + }, + {}, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + # No cookies, requestCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "requestCookies": False, + }, + { + "responseCookies": True, + }, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "requestCookies": False, + } + }, + { + "responseCookies": True, + "experimental": { + "requestCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "requestCookies": False, + }, + { + "requestCookies": False, + "experimental": {"responseCookies": True}, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "requestCookies": False, + } + }, + { + "experimental": {"responseCookies": True}, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + # No cookies, requestCookies and responseCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "requestCookies": False, + "responseCookies": False, + }, + {}, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "requestCookies": False, + "responseCookies": False, + } + }, + { + "responseCookies": True, + "experimental": { + "requestCookies": False, + "responseCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "requestCookies": False, + "responseCookies": False, + }, + { + "requestCookies": False, + "responseCookies": False, + "experimental": {"responseCookies": True}, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + { + "experimental": { + "requestCookies": False, + "responseCookies": False, + } + }, + {}, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), ) ), - # Cookies can be disabled setting the corresponding Zyte API parameter - # to False. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_EMPTY, - {}, - { - "experimental": { - "responseCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_EMPTY, - {}, - { - "experimental": { - "requestCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": {"responseCookies": True}, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_EMPTY, - {}, - { - "experimental": { - "responseCookies": False, - "requestCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - [], - ), + # TODO: Continue adapting the test scenarios from here. ( { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, From a46bb146c33a5d212edb9f390a0239d9bd344694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 13:00:04 +0100 Subject: [PATCH 02/16] Update tests/test_api_requests.py::test_automap_cookies expectations --- scrapy_zyte_api/_params.py | 4 +- tests/test_api_requests.py | 642 ++++++++++++++++++++++++------------- 2 files changed, 416 insertions(+), 230 deletions(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 38f01fcc..a2a92be3 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -19,6 +19,7 @@ "article", "articleList", "articleNavigation", + "jobPosting", "product", "productList", "productNavigation", @@ -671,14 +672,13 @@ def __init__(self, crawler, cookies_enabled=None): "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED" ) if self._experimental_cookies: - warn( + logger.warning( "The deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting " "is set to True. Please, remove the deprecated " "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting, and remove " "the experimental name space from the responseCookies and " "requestCookies parameters in your code (if any), both when " "building requests and when parsing responses.", - DeprecationWarning, ) if cookies_enabled is not None: self._cookies_enabled = cookies_enabled diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 764f99f8..491b50d0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -447,7 +447,7 @@ def test_transparent_mode_toggling(setting, meta, expected): api_params = func() if api_params is not None: api_params.pop("url") - assert api_params == expected + assert expected == api_params @pytest.mark.parametrize("meta", [None, 0, "", b"", [], ()]) @@ -584,7 +584,7 @@ def test_default_params_merging( for key in ignore_keys: api_params.pop(key) api_params.pop("url") - assert api_params == expected + assert expected == api_params if warnings: for warning in warnings: assert warning in caplog.text @@ -679,7 +679,7 @@ def _test_automap( with caplog.at_level("WARNING"): api_params = param_parser.parse(request) api_params.pop("url") - assert api_params == expected + assert expected == api_params if warnings: for warning in warnings: assert warning in caplog.text @@ -1636,11 +1636,13 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "settings,cookies,meta,params,expected,warnings,cookie_jar", [ # Cookies, both for requests and for responses, are enabled based on - # COOKIES_ENABLED (default: True). + # COOKIES_ENABLED (default: True). Disabling cookie mapping at the + # spider level requires setting COOKIES_ENABLED to False. # - # ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (deprecated, default: False) - # forces the experimental name space to be used when enabled, and - # triggers a deprecation warning. + # ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (deprecated, default: False), + # when enabled, triggers a deprecation warning, and forces the + # experimental name space to be used for automatic cookie parameters if + # COOKIES_ENABLED is also True. *( ( settings, @@ -1659,35 +1661,16 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca REQUEST_INPUT_COOKIES_MINIMAL_DICT, ) for settings, warnings in ( - ( - {}, - [], - ), - ( - { - "COOKIES_ENABLED": True, - }, - [], - ), - ( - { - "COOKIES_ENABLED": True, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - [ - "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", - ], - ), ( { - "COOKIES_ENABLED": True, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": False, + "COOKIES_ENABLED": False, }, [], ), ( { "COOKIES_ENABLED": False, + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": False, }, [], ), @@ -1701,13 +1684,6 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "will have no effect", ], ), - ( - { - "COOKIES_ENABLED": False, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": False, - }, - [], - ), ) ), # When COOKIES_ENABLED is True, responseCookies is set to True, and @@ -1785,7 +1761,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ) ), # dont_merge_cookies=True on request metadata disables cookies. - ( + *( ( settings, input_cookies, @@ -1875,7 +1851,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "responseCookies": False, "experimental": { - "responseCookies": False, + "responseCookies": True, }, }, ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], @@ -2009,175 +1985,357 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca {}, ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], ), + # Cookies, responseCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "responseCookies": False, + }, + { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "responseCookies": False, + } + }, + { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + "experimental": { + "responseCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "responseCookies": False, + }, + { + "responseCookies": False, + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "responseCookies": False, + } + }, + { + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + # Cookies, requestCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "requestCookies": False, + }, + { + "responseCookies": True, + }, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "requestCookies": False, + } + }, + { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + "experimental": { + "requestCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "requestCookies": False, + }, + { + "requestCookies": False, + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "requestCookies": False, + } + }, + { + "experimental": { + "responseCookies": True, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + # Cookies, requestCookies and responseCookies disabled. + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "requestCookies": False, + "responseCookies": False, + }, + {}, + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "requestCookies": False, + "responseCookies": False, + } + }, + { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + "experimental": { + "requestCookies": False, + "responseCookies": False, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "requestCookies": False, + "responseCookies": False, + }, + { + "requestCookies": False, + "responseCookies": False, + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + { + "experimental": { + "requestCookies": False, + "responseCookies": False, + } + }, + {}, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), ) ), - # TODO: Continue adapting the test scenarios from here. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "experimental": { - "responseCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - }, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "experimental": { - "requestCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": {"responseCookies": True}, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "experimental": { - "responseCookies": False, - "requestCookies": False, - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - [], - ), + # requestCookies, if set manually, prevents automatic mapping. + # # Setting requestCookies to [] disables automatic mapping, but logs a # a warning recommending to either use False to achieve the same or # remove the parameter to let automated mapping work. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "experimental": { - "requestCookies": [], - } - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": { - "requestCookies": [], - "responseCookies": True, - }, - }, - [ - "is overriding automatic request cookie mapping", - ], - [], + *( + ( + settings, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + input_params, + output_params, + warnings, + [], + ) + for override_cookies, override_warnings in ( + ( + [], + ["is overriding automatic request cookie mapping"], + ), + ( + REQUEST_OUTPUT_COOKIES_MAXIMAL, + [], + ), + ) + for settings, input_params, output_params, warnings in ( + ( + {}, + { + "requestCookies": override_cookies, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestCookies": override_cookies, + "responseCookies": True, + }, + override_warnings, + ), + ( + {}, + { + "experimental": { + "requestCookies": override_cookies, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + "experimental": { + "requestCookies": override_cookies, + }, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "experimental": { + "requestCookies": override_cookies, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "requestCookies": override_cookies, + "responseCookies": True, + }, + }, + [ + *override_warnings, + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "requestCookies": override_cookies, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestCookies": override_cookies, + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "responseCookies": True, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ) ), # Cookies work for browser and automatic extraction requests as well. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "browserHtml": True, - }, - { - "browserHtml": True, - "experimental": { - "responseCookies": True, - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + *( + ( + settings, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + params, + { + **params, + **extra_output_params, }, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "screenshot": True, - }, - { - "screenshot": True, - "experimental": { - "responseCookies": True, - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + warnings, + [], + ) + for params in ( + { + "browserHtml": True, }, - }, - [], - [], - ), - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - EXTRACT_KEY: True, - }, - { - EXTRACT_KEY: True, - "experimental": { - "responseCookies": True, - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + { + "screenshot": True, }, - }, - [], - [], + { + EXTRACT_KEY: True, + }, + ) + for settings, extra_output_params, warnings in ( + ( + {}, + { + "responseCookies": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "experimental": { + "responseCookies": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ) ), # Cookies are mapped correctly, both with minimum and maximum cookie # parameters. *( ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - input, + settings, + input_cookies, {}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": { - "responseCookies": True, - "requestCookies": output, - }, - }, - [], + output_params, + warnings, [], ) - for input, output in ( + for input_cookies, output_cookies in ( ( REQUEST_INPUT_COOKIES_MINIMAL_DICT, REQUEST_OUTPUT_COOKIES_MINIMAL, @@ -2191,51 +2349,79 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca REQUEST_OUTPUT_COOKIES_MAXIMAL, ), ) - ), - # requestCookies, if set manually, prevents automatic mapping. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - REQUEST_INPUT_COOKIES_MINIMAL_DICT, - {}, - { - "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MAXIMAL, - }, - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": { - "responseCookies": True, - "requestCookies": REQUEST_OUTPUT_COOKIES_MAXIMAL, - }, - }, - [], - [], + for settings, output_params, warnings in ( + ( + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + "requestCookies": output_cookies, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + "requestCookies": output_cookies, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ) ), # Mapping multiple cookies works. - ( - { - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, - }, - {"a": "b", "c": "d"}, - {}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "experimental": { - "responseCookies": True, - "requestCookies": [ + *( + ( + settings, + input_cookies, + {}, + {}, + output_params, + warnings, + [], + ) + for input_cookies, output_cookies in ( + ( + {"a": "b", "c": "d"}, + [ {"name": "a", "value": "b", "domain": "example.com"}, {"name": "c", "value": "d", "domain": "example.com"}, ], - }, - }, - [], - [], + ), + ) + for settings, output_params, warnings in ( + ( + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + "requestCookies": output_cookies, + }, + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + "requestCookies": output_cookies, + }, + }, + ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + ), + ) ), ], ) @@ -2764,7 +2950,7 @@ def test_default_params_automap(default_params, meta, expected, warnings, caplog with caplog.at_level("WARNING"): api_params = param_parser.parse(request) api_params.pop("url") - assert api_params == expected + assert expected == api_params if warnings: for warning in warnings: assert warning in caplog.text From bd680e70285ff101dad3ebc5851268cbfd5e56a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 13:53:19 +0100 Subject: [PATCH 03/16] Update remaining test expectations --- tests/test_api_requests.py | 280 +++++++++++++--------------- tests/test_downloader_middleware.py | 3 +- tests/test_request_fingerprinter.py | 4 +- tests/test_responses.py | 12 +- 4 files changed, 139 insertions(+), 160 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 491b50d0..4f51371e 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -271,9 +271,9 @@ async def parse(self, response): TRANSPARENT_MODE = False SKIP_HEADERS = {b"cookie", b"user-agent"} JOB_ID = None -COOKIES_ENABLED = False +COOKIES_ENABLED = True MAX_COOKIES = 100 -EXPERIMENTAL_COOKIES = True +EXPERIMENTAL_COOKIES = False GET_API_PARAMS_KWARGS = { "default_params": DEFAULT_PARAMS, "transparent_mode": TRANSPARENT_MODE, @@ -293,7 +293,7 @@ async def test_params_parser_input_default(mockserver): for key in GET_API_PARAMS_KWARGS: actual = getattr(handler._param_parser, f"_{key}") expected = GET_API_PARAMS_KWARGS[key] - assert actual == expected, key + assert expected == actual, key @ensureDeferred @@ -353,6 +353,7 @@ async def test_param_parser_output_side_effects(output, uses_zyte_api, mockserve DEFAULT_AUTOMAP_PARAMS: Dict[str, Any] = { "httpResponseBody": True, "httpResponseHeaders": True, + "responseCookies": True, } @@ -554,7 +555,7 @@ async def test_default_params_none(mockserver, caplog): ( "ZYTE_API_AUTOMAP_PARAMS", "zyte_api_automap", - {"httpResponseBody", "httpResponseHeaders"}, + DEFAULT_AUTOMAP_PARAMS.keys(), ), ], ) @@ -692,12 +693,11 @@ def _test_automap( [ # If no other known main output is specified in meta, httpResponseBody # is requested. - ({}, {"httpResponseBody": True, "httpResponseHeaders": True}, []), + ({}, DEFAULT_AUTOMAP_PARAMS, []), ( {"unknownMainOutput": True}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "unknownMainOutput": True, }, [], @@ -707,29 +707,29 @@ def _test_automap( # may stop working for binary responses in the future. ( {"httpResponseBody": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, + DEFAULT_AUTOMAP_PARAMS, [], ), - # If other main outputs are specified in meta, httpRequestBody is not - # set. + # If other main outputs are specified in meta, httpResponseBody and + # httpResponseHeaders are not set. ( {"browserHtml": True}, - {"browserHtml": True}, + {"browserHtml": True, "responseCookies": True}, [], ), ( {"screenshot": True}, - {"screenshot": True}, + {"screenshot": True, "responseCookies": True}, [], ), ( {EXTRACT_KEY: True}, - {EXTRACT_KEY: True}, + {EXTRACT_KEY: True, "responseCookies": True}, [], ), ( {"browserHtml": True, "screenshot": True}, - {"browserHtml": True, "screenshot": True}, + {"browserHtml": True, "screenshot": True, "responseCookies": True}, [], ), # If no known main output is specified, and httpResponseBody is @@ -737,12 +737,12 @@ def _test_automap( # is added. ( {"httpResponseBody": False}, - {}, + {"responseCookies": True}, [], ), ( {"httpResponseBody": False, "unknownMainOutput": True}, - {"unknownMainOutput": True}, + {"unknownMainOutput": True, "responseCookies": True}, [], ), # We allow httpResponseBody and browserHtml to be both set to True, in @@ -751,8 +751,7 @@ def _test_automap( {"httpResponseBody": True, "browserHtml": True}, { "browserHtml": True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -773,22 +772,22 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): # not be implicitly set to True, it is passed as such. ( {"httpResponseBody": False, "httpResponseHeaders": True}, - {"httpResponseHeaders": True}, + {"httpResponseHeaders": True, "responseCookies": True}, [], ), ( {"browserHtml": True, "httpResponseHeaders": True}, - {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True, "httpResponseHeaders": True, "responseCookies": True}, [], ), ( {"screenshot": True, "httpResponseHeaders": True}, - {"screenshot": True, "httpResponseHeaders": True}, + {"screenshot": True, "httpResponseHeaders": True, "responseCookies": True}, [], ), ( {EXTRACT_KEY: True, "httpResponseHeaders": True}, - {EXTRACT_KEY: True, "httpResponseHeaders": True}, + {EXTRACT_KEY: True, "httpResponseHeaders": True, "responseCookies": True}, [], ), ( @@ -797,7 +796,11 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "httpResponseBody": False, "httpResponseHeaders": True, }, - {"unknownMainOutput": True, "httpResponseHeaders": True}, + { + "unknownMainOutput": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, [], ), # Setting httpResponseHeaders to True where it would be already True @@ -807,12 +810,20 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): # stops being set to True by default in those scenarios. ( {"httpResponseHeaders": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, [], ), ( {"httpResponseBody": True, "httpResponseHeaders": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, [], ), ( @@ -825,6 +836,7 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "browserHtml": True, "httpResponseBody": True, "httpResponseHeaders": True, + "responseCookies": True, }, [], ), @@ -834,16 +846,21 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "unknownMainOutput": True, "httpResponseBody": True, "httpResponseHeaders": True, + "responseCookies": True, }, [], ), # If httpResponseHeaders is set to False, httpResponseHeaders is not # defined, even if httpResponseBody is set to True, implicitly or # explicitly. - ({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), + ( + {"httpResponseHeaders": False}, + {"httpResponseBody": True, "responseCookies": True}, + [], + ), ( {"httpResponseBody": True, "httpResponseHeaders": False}, - {"httpResponseBody": True}, + {"httpResponseBody": True, "responseCookies": True}, [], ), ( @@ -852,12 +869,16 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "browserHtml": True, "httpResponseHeaders": False, }, - {"browserHtml": True, "httpResponseBody": True}, + {"browserHtml": True, "httpResponseBody": True, "responseCookies": True}, [], ), ( {"unknownMainOutput": True, "httpResponseHeaders": False}, - {"unknownMainOutput": True, "httpResponseBody": True}, + { + "unknownMainOutput": True, + "httpResponseBody": True, + "responseCookies": True, + }, [], ), # If httpResponseHeaders is unnecessarily set to False where @@ -866,22 +887,22 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): # logged. ( {"httpResponseBody": False, "httpResponseHeaders": False}, - {}, + {"responseCookies": True}, ["do not need to set httpResponseHeaders to False"], ), ( {"browserHtml": True, "httpResponseHeaders": False}, - {"browserHtml": True}, + {"browserHtml": True, "responseCookies": True}, ["do not need to set httpResponseHeaders to False"], ), ( {"screenshot": True, "httpResponseHeaders": False}, - {"screenshot": True}, + {"screenshot": True, "responseCookies": True}, ["do not need to set httpResponseHeaders to False"], ), ( {EXTRACT_KEY: True, "httpResponseHeaders": False}, - {EXTRACT_KEY: True}, + {EXTRACT_KEY: True, "responseCookies": True}, ["do not need to set httpResponseHeaders to False"], ), ( @@ -890,7 +911,7 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "httpResponseBody": False, "httpResponseHeaders": False, }, - {"unknownMainOutput": True}, + {"unknownMainOutput": True, "responseCookies": True}, ["do not need to set httpResponseHeaders to False"], ), ], @@ -906,10 +927,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): ( "GET", {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, [], ), # Other HTTP methods, regardless of whether they are supported, @@ -920,8 +938,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): method, {}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestMethod": method, }, [], @@ -946,8 +963,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): request_method, {"httpRequestMethod": meta_method}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestMethod": meta_method, }, ["Use Request.method"], @@ -965,8 +981,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): request_method, {"httpRequestMethod": meta_method}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestMethod": meta_method, }, [ @@ -987,6 +1002,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): { "browserHtml": True, "httpRequestMethod": "POST", + "responseCookies": True, }, [], ), @@ -996,6 +1012,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): { "screenshot": True, "httpRequestMethod": "POST", + "responseCookies": True, }, [], ), @@ -1005,6 +1022,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): { EXTRACT_KEY: True, "httpRequestMethod": "POST", + "responseCookies": True, }, [], ), @@ -1026,8 +1044,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1039,6 +1056,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "a"}, + "responseCookies": True, }, [], ), @@ -1048,6 +1066,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "requestHeaders": {"referer": "a"}, "screenshot": True, + "responseCookies": True, }, [], ), @@ -1057,6 +1076,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "requestHeaders": {"referer": "a"}, EXTRACT_KEY: True, + "responseCookies": True, }, [], ), @@ -1072,8 +1092,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, }, [], @@ -1085,8 +1104,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, "screenshot": True, }, @@ -1099,8 +1117,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, EXTRACT_KEY: True, }, @@ -1114,8 +1131,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, "screenshot": True, }, @@ -1137,8 +1153,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "unknownMainOutput": True, }, [], @@ -1152,6 +1167,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"httpResponseBody": False}, { "requestHeaders": {"referer": "a"}, + "responseCookies": True, }, [], ), @@ -1161,6 +1177,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "requestHeaders": {"referer": "a"}, "unknownMainOutput": True, + "responseCookies": True, }, [], ), @@ -1168,10 +1185,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): ( {"Referer": "a"}, {"customHttpRequestHeaders": False}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, [], ), ( @@ -1179,6 +1193,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True, "requestHeaders": False}, { "browserHtml": True, + "responseCookies": True, }, [], ), @@ -1191,8 +1206,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, { "browserHtml": True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, }, [], @@ -1205,8 +1219,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1220,8 +1233,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, { "browserHtml": True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1233,8 +1245,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, }, [], @@ -1248,6 +1259,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"name": "Referer", "value": "a"}, ], "requestHeaders": {"referer": "a"}, + "responseCookies": True, }, [], ), @@ -1255,10 +1267,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): ( {"Referer": None}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, [], ), ( @@ -1266,6 +1275,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, [], ), @@ -1274,8 +1284,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True, "httpResponseBody": True}, { "browserHtml": True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1284,6 +1293,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"screenshot": True}, { "screenshot": True, + "responseCookies": True, }, [], ), @@ -1292,6 +1302,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {EXTRACT_KEY: True}, { EXTRACT_KEY: True, + "responseCookies": True, }, [], ), @@ -1300,8 +1311,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"screenshot": True, "httpResponseBody": True}, { "screenshot": True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1310,8 +1320,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {EXTRACT_KEY: True, "httpResponseBody": True}, { EXTRACT_KEY: True, - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, [], ), @@ -1319,8 +1328,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"Referer": None}, {"unknownMainOutput": True}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "unknownMainOutput": True, }, [], @@ -1330,13 +1338,14 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"unknownMainOutput": True, "httpResponseBody": False}, { "unknownMainOutput": True, + "responseCookies": True, }, [], ), ( {"Referer": None}, {"httpResponseBody": False}, - {}, + {"responseCookies": True}, [], ), # Warn if header parameters are used in meta, even if the values match @@ -1353,8 +1362,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, ["Use Request.headers instead"], ), @@ -1367,6 +1375,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "a"}, + "responseCookies": True, }, ["Use Request.headers instead"], ), @@ -1381,8 +1390,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "b"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, ["Use Request.headers instead"], ), @@ -1395,6 +1403,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "b"}, + "responseCookies": True, }, ["Use Request.headers instead"], ), @@ -1409,8 +1418,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, }, ["Use Request.headers instead"], ), @@ -1423,6 +1431,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "a"}, + "responseCookies": True, }, ["Use Request.headers instead"], ), @@ -1440,8 +1449,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "requestHeaders": {"referer": "a"}, }, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "requestHeaders": {"referer": "a"}, }, [], @@ -1459,6 +1467,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], + "responseCookies": True, }, [], ), @@ -1470,6 +1479,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, ["cannot be mapped"], ), @@ -1479,6 +1489,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, ["cannot be mapped"], ), @@ -1486,10 +1497,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): ( {"user-Agent": ""}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, ["cannot be mapped"], ), # The Accept, Accept-Encoding and Accept-Language headers, when @@ -1506,6 +1514,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, [], ), @@ -1515,6 +1524,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, ["cannot be mapped"], ) @@ -1536,19 +1546,13 @@ def test_automap_method(method, meta, expected, warnings, caplog): ( {"User-Agent": DEFAULT_USER_AGENT}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, [], ), ( {"User-Agent": ""}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, ["cannot be mapped"], ), ( @@ -1556,6 +1560,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, [], ), @@ -1564,6 +1569,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, + "responseCookies": True, }, ["cannot be mapped"], ), @@ -1588,8 +1594,7 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): }, {}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "customHttpRequestHeaders": [ {"name": "User-Agent", "value": ""}, ], @@ -1611,6 +1616,7 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"userAgent": ""}, + "responseCookies": True, }, [], ), @@ -2452,7 +2458,6 @@ def test_automap_all_cookies(meta): the target URL domain.""" settings: Dict[str, Any] = { **SETTINGS, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_TRANSPARENT_MODE": True, } crawler = get_crawler(settings) @@ -2483,7 +2488,7 @@ def test_automap_all_cookies(meta): ) cookie_middleware.process_request(request1, spider=None) api_params = param_parser.parse(request1) - assert api_params["experimental"]["requestCookies"] == [ + assert api_params["requestCookies"] == [ {"name": "a", "value": "b", "domain": "a.example"}, # https://github.com/scrapy/scrapy/issues/5841 # {"name": "c", "value": "d", "domain": "b.example"}, @@ -2527,9 +2532,7 @@ def test_automap_all_cookies(meta): cookie_middleware.process_request(request2, spider=None) api_params = param_parser.parse(request2) - assert sort_dict_list( - api_params["experimental"]["requestCookies"] - ) == sort_dict_list( + assert sort_dict_list(api_params["requestCookies"]) == sort_dict_list( [ {"name": "e", "value": "f", "domain": ".c.example"}, {"name": "i", "value": "j", "domain": ".d.example"}, @@ -2560,7 +2563,6 @@ def test_automap_cookie_jar(meta): request4 = Request(url="https://example.com/4", meta={**meta, "cookiejar": "a"}) settings: Dict[str, Any] = { **SETTINGS, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_TRANSPARENT_MODE": True, } crawler = get_crawler(settings) @@ -2570,20 +2572,18 @@ def test_automap_cookie_jar(meta): cookie_middleware.process_request(request1, spider=None) api_params = param_parser.parse(request1) - assert api_params["experimental"]["requestCookies"] == [ + assert api_params["requestCookies"] == [ {"name": "z", "value": "y", "domain": "example.com"} ] cookie_middleware.process_request(request2, spider=None) api_params = param_parser.parse(request2) - assert "requestCookies" not in api_params["experimental"] + assert "requestCookies" not in api_params cookie_middleware.process_request(request3, spider=None) api_params = param_parser.parse(request3) - assert sort_dict_list( - api_params["experimental"]["requestCookies"] - ) == sort_dict_list( + assert sort_dict_list(api_params["requestCookies"]) == sort_dict_list( [ {"name": "x", "value": "w", "domain": "example.com"}, {"name": "z", "value": "y", "domain": "example.com"}, @@ -2592,9 +2592,7 @@ def test_automap_cookie_jar(meta): cookie_middleware.process_request(request4, spider=None) api_params = param_parser.parse(request4) - assert sort_dict_list( - api_params["experimental"]["requestCookies"] - ) == sort_dict_list( + assert sort_dict_list(api_params["requestCookies"]) == sort_dict_list( [ {"name": "x", "value": "w", "domain": "example.com"}, {"name": "z", "value": "y", "domain": "example.com"}, @@ -2612,7 +2610,6 @@ def test_automap_cookie_jar(meta): def test_automap_cookie_limit(meta, caplog): settings: Dict[str, Any] = { **SETTINGS, - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_MAX_COOKIES": 1, "ZYTE_API_TRANSPARENT_MODE": True, } @@ -2632,7 +2629,7 @@ def test_automap_cookie_limit(meta, caplog): cookie_middleware.process_request(request, spider=None) with caplog.at_level("WARNING"): api_params = param_parser.parse(request) - assert api_params["experimental"]["requestCookies"] == [ + assert api_params["requestCookies"] == [ {"name": "z", "value": "y", "domain": "example.com"} ] assert not caplog.records @@ -2649,7 +2646,7 @@ def test_automap_cookie_limit(meta, caplog): cookie_middleware.process_request(request, spider=None) with caplog.at_level("WARNING"): api_params = param_parser.parse(request) - assert api_params["experimental"]["requestCookies"] in [ + assert api_params["requestCookies"] in [ [{"name": "z", "value": "y", "domain": "example.com"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] @@ -2674,7 +2671,7 @@ def test_automap_cookie_limit(meta, caplog): cookie_middleware.process_request(request, spider=None) with caplog.at_level("WARNING"): api_params = param_parser.parse(request) - assert api_params["experimental"]["requestCookies"] in [ + assert api_params["requestCookies"] in [ [{"name": "z", "value": "y", "domain": "example.com"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] @@ -2698,7 +2695,7 @@ def test_automap_cookie_limit(meta, caplog): cookie_middleware.process_request(request, spider=None) with caplog.at_level("WARNING"): api_params = param_parser.parse(request) - assert api_params["experimental"]["requestCookies"] in [ + assert api_params["requestCookies"] in [ [{"name": "z", "value": "y", "domain": "other.example"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] @@ -2747,7 +2744,6 @@ def test_automap_custom_cookie_middleware(): f"{mw_cls.__module__}.{mw_cls.__qualname__}": 700, }, "ZYTE_API_COOKIE_MIDDLEWARE": f"{mw_cls.__module__}.{mw_cls.__qualname__}", - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_TRANSPARENT_MODE": True, } crawler = get_crawler(settings) @@ -2758,7 +2754,7 @@ def test_automap_custom_cookie_middleware(): request = Request(url="https://example.com/1") cookie_middleware.process_request(request, spider=None) api_params = param_parser.parse(request) - assert api_params["experimental"]["requestCookies"] == [ + assert api_params["requestCookies"] == [ {"name": "z", "value": "y", "domain": "example.com"} ] @@ -2771,8 +2767,7 @@ def test_automap_custom_cookie_middleware(): "a", {}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestBody": "YQ==", }, [], @@ -2783,8 +2778,7 @@ def test_automap_custom_cookie_middleware(): "a", {"httpRequestBody": "Yg=="}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestBody": "Yg==", }, [ @@ -2798,8 +2792,7 @@ def test_automap_custom_cookie_middleware(): "a", {"httpRequestBody": "YQ=="}, { - "httpResponseBody": True, - "httpResponseHeaders": True, + **DEFAULT_AUTOMAP_PARAMS, "httpRequestBody": "YQ==", }, ["Use Request.body instead"], @@ -2811,6 +2804,7 @@ def test_automap_custom_cookie_middleware(): { "browserHtml": True, "httpRequestBody": "YQ==", + "responseCookies": True, }, [], ), @@ -2820,6 +2814,7 @@ def test_automap_custom_cookie_middleware(): { "httpRequestBody": "YQ==", "screenshot": True, + "responseCookies": True, }, [], ), @@ -2829,6 +2824,7 @@ def test_automap_custom_cookie_middleware(): { "httpRequestBody": "YQ==", EXTRACT_KEY: True, + "responseCookies": True, }, [], ), @@ -2852,6 +2848,7 @@ def test_automap_body(body, meta, expected, warnings, caplog): }, { "browserHtml": True, + "responseCookies": True, }, ["unnecessarily defines"], ), @@ -2859,20 +2856,14 @@ def test_automap_body(body, meta, expected, warnings, caplog): { "browserHtml": False, }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, ["unnecessarily defines"], ), ( { "screenshot": False, }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, ["unnecessarily defines"], ), ( @@ -2882,6 +2873,7 @@ def test_automap_body(body, meta, expected, warnings, caplog): }, { "screenshot": True, + "responseCookies": True, }, ["do not need to set httpResponseHeaders to False"], ), @@ -2889,10 +2881,7 @@ def test_automap_body(body, meta, expected, warnings, caplog): { EXTRACT_KEY: False, }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, ["unnecessarily defines"], ), ( @@ -2902,6 +2891,7 @@ def test_automap_body(body, meta, expected, warnings, caplog): }, { EXTRACT_KEY: True, + "responseCookies": True, }, ["do not need to set httpResponseHeaders to False"], ), @@ -2919,16 +2909,14 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): {"screenshot": True, "browserHtml": False}, { "screenshot": True, + "responseCookies": True, }, [], ), ( {}, {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, + DEFAULT_AUTOMAP_PARAMS, [], ), ], diff --git a/tests/test_downloader_middleware.py b/tests/test_downloader_middleware.py index 63ff44c6..f4c3090b 100644 --- a/tests/test_downloader_middleware.py +++ b/tests/test_downloader_middleware.py @@ -64,8 +64,7 @@ async def test_autothrottle_handling(): async def test_cookies(): """Make sure that the downloader middleware does not crash on Zyte API requests with cookies.""" - settings = {"ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True} - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler() await crawler.crawl("a") spider = crawler.spider middleware = create_instance( diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 9784d342..af898231 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -228,9 +228,7 @@ def test_only_end_parameters_matter(): "zyte_api": { "httpResponseBody": True, "httpResponseHeaders": True, - "experimental": { - "responseCookies": True, - }, + "responseCookies": True, } }, ) diff --git a/tests/test_responses.py b/tests/test_responses.py index 9705a6fc..cb33d9cd 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -66,9 +66,7 @@ def raw_api_response_browser(): {"name": "Content-Length", "value": len(PAGE_CONTENT)}, ], "statusCode": 200, - "experimental": { - "responseCookies": INPUT_COOKIES, - }, + "responseCookies": INPUT_COOKIES, } @@ -82,9 +80,7 @@ def raw_api_response_body(): {"name": "Content-Length", "value": len(PAGE_CONTENT)}, ], "statusCode": 200, - "experimental": { - "responseCookies": INPUT_COOKIES, - }, + "responseCookies": INPUT_COOKIES, } @@ -99,9 +95,7 @@ def raw_api_response_mixed(): {"name": "Content-Length", "value": len(PAGE_CONTENT_2)}, ], "statusCode": 200, - "experimental": { - "responseCookies": INPUT_COOKIES, - }, + "responseCookies": INPUT_COOKIES, } From 7abcc28b9aaff57e302ee1da18f7ac3bbc982747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 14:03:30 +0100 Subject: [PATCH 04/16] Address mypy issues --- tests/test_api_requests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 4f51371e..2cfd2e45 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -5,7 +5,7 @@ from functools import partial from http.cookiejar import Cookie from inspect import isclass -from typing import Any, Dict, cast +from typing import Any, Dict, List, cast from unittest import mock from unittest.mock import patch @@ -1814,7 +1814,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "httpResponseBody": True, "httpResponseHeaders": True, - **output_params, + **cast(Dict, output_params), }, warnings, [], @@ -2257,7 +2257,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca }, }, [ - *override_warnings, + *cast(List, override_warnings), "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", ], ), @@ -2290,7 +2290,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca params, { **params, - **extra_output_params, + **cast(Dict, extra_output_params), }, warnings, [], From 8586ef6bffb101bb1932ed48039da08f5fdf26b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 14:58:44 +0100 Subject: [PATCH 05/16] Warn about the use of deprecated experimental fields --- scrapy_zyte_api/_params.py | 23 +++++++++ tests/test_api_requests.py | 99 +++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 7 deletions(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index a2a92be3..0a7c351e 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -702,6 +702,14 @@ def __init__(self, crawler, cookies_enabled=None): self._max_cookies = settings.getint("ZYTE_API_MAX_COOKIES", 100) self._crawler = crawler self._cookie_jars = None + if not self._experimental_cookies: + self._unreported_deprecated_experimental_fields = { + "requestCookies", + "responseCookies", + "cookieManagement", + } + else: + self._unreported_deprecated_experimental_fields = set() def parse(self, request): dont_merge_cookies = request.meta.get("dont_merge_cookies", False) @@ -720,4 +728,19 @@ def parse(self, request): max_cookies=self._max_cookies, experimental_cookies=self._experimental_cookies, ) + if ( + params + and self._unreported_deprecated_experimental_fields + and "experimental" in params + ): + for field in list(self._unreported_deprecated_experimental_fields): + if field in params["experimental"]: + self._unreported_deprecated_experimental_fields.remove(field) + logger.warning( + f"Zyte API parameters for request {request} include " + f"experimental.{field}, which is deprecated. Please, " + f"replace it with {field}, both in request parameters " + f"and in any response parsing logic that might rely " + f"on the old parameter." + ) return params diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 2cfd2e45..e5e88319 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1844,7 +1844,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, }, }, - [], + ["experimental.responseCookies, which is deprecated"], ), ( { @@ -1901,7 +1901,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, }, }, - [], + ["experimental.requestCookies, which is deprecated"], ), ( { @@ -1959,7 +1959,10 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, }, }, - [], + [ + "experimental.responseCookies, which is deprecated", + "experimental.requestCookies, which is deprecated", + ], ), ( { @@ -2018,7 +2021,9 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, }, }, - [], + [ + "experimental.responseCookies, which is deprecated", + ], ), ( { @@ -2081,7 +2086,9 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, }, }, - [], + [ + "experimental.requestCookies, which is deprecated", + ], ), ( { @@ -2145,7 +2152,10 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, }, }, - [], + [ + "experimental.responseCookies, which is deprecated", + "experimental.requestCookies, which is deprecated", + ], ), ( { @@ -2237,7 +2247,9 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": override_cookies, }, }, - [], + [ + "experimental.requestCookies, which is deprecated", + ], ), ( { @@ -2966,3 +2978,76 @@ def test_default_params_false(default_params): param_parser = handler._param_parser api_params = param_parser.parse(request) assert api_params is None + + +# https://stackoverflow.com/a/6037657 +def unflatten(dictionary): + resultDict: Dict[Any, Any] = dict() + for key, value in dictionary.items(): + parts = key.split(".") + d = resultDict + for part in parts[:-1]: + if part not in d: + d[part] = dict() + d = d[part] + d[parts[-1]] = value + return resultDict + + +@pytest.mark.parametrize( + "old_field,new_field", + [ + ( + f"experimental.{field}", + field, + ) + for field in ( + "responseCookies", + "requestCookies", + "cookieManagement", + ) + ], +) +def test_field_deprecation_warnings(old_field, new_field, caplog): + input_params = unflatten({old_field: "foo"}) + + # Raw + raw_request = Request( + url="https://example.com", + meta={"zyte_api": input_params}, + ) + crawler = get_crawler(SETTINGS) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser + with caplog.at_level("WARNING"): + output_params = param_parser.parse(raw_request) + output_params.pop("url") + assert input_params == output_params + assert f"{old_field}, which is deprecated" in caplog.text + caplog.clear() + with caplog.at_level("WARNING"): + # Only warn once per field. + param_parser.parse(raw_request) + assert not caplog.text + caplog.clear() + + # Automap + raw_request = Request( + url="https://example.com", + meta={"zyte_api_automap": input_params}, + ) + crawler = get_crawler(SETTINGS) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser + with caplog.at_level("WARNING"): + output_params = param_parser.parse(raw_request) + output_params.pop("url") + for key, value in input_params.items(): + assert output_params[key] == value + assert f"{old_field}, which is deprecated" in caplog.text + caplog.clear() + with caplog.at_level("WARNING"): + # Only warn once per field. + param_parser.parse(raw_request) + assert not caplog.text + caplog.clear() From 318d38292c0e71ee046b3982a7717e4aab8bfad2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 15:04:51 +0100 Subject: [PATCH 06/16] Test that cookies do not affect request fingerprinting --- tests/test_request_fingerprinter.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index af898231..17d6b356 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -81,6 +81,32 @@ def test_headers(): assert fingerprint1 == fingerprint2 +def test_cookies(): + crawler = get_crawler() + fingerprinter = create_instance( + ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler + ) + request1 = Request( + "https://example.com", + meta={ + "zyte_api": { + "responseCookies": True, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + "experimental": { + "responseCookies": True, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + }, + } + }, + ) + request2 = Request("https://example.com", meta={"zyte_api": True}) + fingerprint1 = fingerprinter.fingerprint(request1) + fingerprint2 = fingerprinter.fingerprint(request2) + assert fingerprint1 == fingerprint2 + + @pytest.mark.parametrize( "url,params,fingerprint", ( From a01a93a25ed81cd2593ae0a2549d2563706f28ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 15:08:56 +0100 Subject: [PATCH 07/16] Test that experimental.foo does not get a warning --- tests/test_api_requests.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index e5e88319..abd1bdb8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -3051,3 +3051,40 @@ def test_field_deprecation_warnings(old_field, new_field, caplog): param_parser.parse(raw_request) assert not caplog.text caplog.clear() + + +def test_field_deprecation_warnings_false_positives(caplog): + """Make sure that the code tested by test_field_deprecation_warnings does + not trigger for unrelated fields that just happen to share their name space + (experimental).""" + + input_params = {"experimental": {"foo": "bar"}} + + # Raw + raw_request = Request( + url="https://example.com", + meta={"zyte_api": input_params}, + ) + crawler = get_crawler(SETTINGS) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser + with caplog.at_level("WARNING"): + output_params = param_parser.parse(raw_request) + output_params.pop("url") + assert input_params == output_params + assert not caplog.text + + # Automap + raw_request = Request( + url="https://example.com", + meta={"zyte_api_automap": input_params}, + ) + crawler = get_crawler(SETTINGS) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser + with caplog.at_level("WARNING"): + output_params = param_parser.parse(raw_request) + output_params.pop("url") + for key, value in input_params.items(): + assert output_params[key] == value + assert not caplog.text From 6ad1bdeb46cd6ce1caf71a39d96ce8b0cd36204a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 15:20:48 +0100 Subject: [PATCH 08/16] Test response parsing of cookies --- tests/test_responses.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_responses.py b/tests/test_responses.py index cb33d9cd..fc92d5db 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -481,3 +481,44 @@ def test_status_code(base_kwargs_func, kwargs, expected_status_code): response = _process_response(api_response, Request(api_response["url"])) assert response is not None assert response.status == expected_status_code + + +@pytest.mark.parametrize( + "api_response", + [ + { + "url": "https://example.com", + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "statusCode": 200, + "responseCookies": INPUT_COOKIES, + }, + { + "url": "https://example.com", + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "statusCode": 200, + "experimental": { + "responseCookies": INPUT_COOKIES, + }, + }, + { + "url": "https://example.com", + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "statusCode": 200, + "responseCookies": INPUT_COOKIES, + "experimental": { + "responseCookies": [ + { + "name": "foo", + "value": "bar", + }, + ], + }, + }, + ], +) +def test_cookies(api_response): + response = _process_response(api_response, Request(api_response["url"])) + assert response is not None + assert response.headers == { + **OUTPUT_COOKIE_HEADERS, + } From 519f880f3c9a070a2f938f0e9ce36cc3f84fcde7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 15:30:24 +0100 Subject: [PATCH 09/16] Do not warn about experimental.responseCookies on response parsing --- scrapy_zyte_api/_cookies.py | 12 ------------ tests/test_responses.py | 8 ++++++-- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/scrapy_zyte_api/_cookies.py b/scrapy_zyte_api/_cookies.py index df1eea70..f20518ca 100644 --- a/scrapy_zyte_api/_cookies.py +++ b/scrapy_zyte_api/_cookies.py @@ -1,7 +1,6 @@ from http.cookiejar import Cookie from typing import Any, Dict, List, Optional from urllib.parse import urlparse -from warnings import warn from scrapy.http import Request from scrapy.http.cookies import CookieJar @@ -35,17 +34,6 @@ def _process_cookies( response_cookies = api_response.get("responseCookies", old_response_cookies) if not response_cookies: return - if old_response_cookies: - warn( - "Found experimental.responseCookies in a Zyte API response. The " - "experimental name space is deprecated for that parameter. " - "Please, make sure you are no longer using the deprecated " - "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting, and remove the " - "experimental name space from the responseCookies and " - "requestCookies parameters in your code, both when building " - "requests and when parsing responses.", - DeprecationWarning, - ) cookie_jar = _get_cookie_jar(request, cookie_jars) for response_cookie in response_cookies: rest = {} diff --git a/tests/test_responses.py b/tests/test_responses.py index fc92d5db..48400a71 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -516,9 +516,13 @@ def test_status_code(base_kwargs_func, kwargs, expected_status_code): }, ], ) -def test_cookies(api_response): - response = _process_response(api_response, Request(api_response["url"])) +def test_cookies(api_response, caplog): + with caplog.at_level("WARNING"): + response = _process_response(api_response, Request(api_response["url"])) assert response is not None assert response.headers == { **OUTPUT_COOKIE_HEADERS, } + # Do not warn about the deprecated experimental.responseCookies response + # parameter, we already warn about it when found among request parameters. + assert not caplog.text From 2f6044419a9ede2d1f4c924ccdc10bc3cefcbceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 21 Nov 2023 14:27:42 +0100 Subject: [PATCH 10/16] Fix merge leftovers --- README.rst | 1918 ---------------------------------------------------- 1 file changed, 1918 deletions(-) diff --git a/README.rst b/README.rst index 41965a56..ba8a178b 100644 --- a/README.rst +++ b/README.rst @@ -26,1923 +26,5 @@ Scrapy plugin for seamless `Zyte API`_ integration. .. description ends -<<<<<<< HEAD -Requirements -============ - -* Python 3.7+ -* Scrapy 2.0.1+ - -scrapy-poet integration requires more recent software: - -* Python 3.8+ -* Scrapy 2.6+ - -Installation -============ - -.. code-block:: - - pip install scrapy-zyte-api - - -Quick start -=========== - -Get a `Zyte API`_ key, and add it to your project settings.py: - -.. code-block:: python - - ZYTE_API_KEY = "YOUR_API_KEY" - -Instead of adding API key to setting.py you can also set -``ZYTE_API_KEY`` environment variable. - -Then, set up the scrapy-zyte-api integration: - -.. code-block:: python - - DOWNLOAD_HANDLERS = { - "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - } - DOWNLOADER_MIDDLEWARES = { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, - } - REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" - TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" - -By default, scrapy-zyte-api doesn't change the spider behavior. -To switch your spider to use Zyte API for all requests, -set the following option: - -.. code-block:: python - - ZYTE_API_TRANSPARENT_MODE = True - -Configuration -============= - -To enable this plugin: - -- Set the ``http`` and ``https`` keys in the `DOWNLOAD_HANDLERS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIDownloadHandler"``. - -- Add ``"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware"`` to the - `DOWNLOADER_MIDDLEWARES - `_ - Scrapy setting with any value, e.g. ``1000``. - -- Set the `REQUEST_FINGERPRINTER_CLASS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"``. - -- Set the `TWISTED_REACTOR - `_ - Scrapy setting to - ``"twisted.internet.asyncioreactor.AsyncioSelectorReactor"``. - - .. note:: On existing projects that were not using the asyncio Twisted - reactor, your existing code may need changes, such as: - - - `Handling a pre-installed Twisted reactor - `_. - - Some Twisted imports install the default, non-asyncio Twisted - reactor as a side effect. Once a reactor is installed, it cannot be - changed for the whole run time. - - - `Converting Twisted Deferreds into asyncio Futures - `_. - - Note that you might be using Deferreds without realizing it through - some Scrapy functions and methods. For example, when you yield the - return value of ``self.crawler.engine.download()`` from a spider - callback, you are yielding a Deferred. - -- Set `your Zyte API key - `_ as - either the ``ZYTE_API_KEY`` Scrapy setting or as an environment variable of - the same name. - -The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to -``False`` to disable this plugin. - -If you want to use scrapy-poet integration, add a provider to -``SCRAPY_POET_PROVIDERS`` (see `scrapy-poet integration`_): - -.. code-block:: python - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Usage -===== - -You can send requests through Zyte API in one of the following ways: - -- Send all request through Zyte API by default, letting Zyte API parameters - be chosen automatically based on your Scrapy request parameters. See - `Using transparent mode`_. - -- Send specific requests through Zyte API, setting all Zyte API parameters - manually, keeping full control of what is sent to Zyte API. - See `Sending requests with manually-defined parameters`_. - -- Send specific requests through Zyte API, letting Zyte API parameters be - chosen automatically based on your Scrapy request parameters. - See `Sending requests with automatically-mapped parameters`_. - -Zyte API response parameters are mapped into Scrapy response parameters where -possible. See `Response mapping`_ for details. - - -Using transparent mode ----------------------- - -Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle -Scrapy requests as follows: - -.. _Scrapy setting: https://docs.scrapy.org/en/latest/topics/settings.html - -- By default, requests are sent through Zyte API with automatically-mapped - parameters. See `Sending requests with automatically-mapped parameters`_ - for details about automatic request parameter mapping. - - You do not need to set the ``zyte_api_automap`` request meta key to - ``True``, but you can set it to a dictionary to extend your Zyte API - request parameters. - -- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent - through Zyte API with manually-defined parameters. - See `Sending requests with manually-defined parameters`_. - -- Requests with the ``zyte_api_automap`` request meta key set to ``False`` - are *not* sent through Zyte API. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - start_urls = ["https://quotes.toscrape.com/"] - - custom_settings = { - "ZYTE_API_TRANSPARENT_MODE": True, - } - - def parse(self, response): - print(response.text) - # "…" - - -Sending requests with manually-defined parameters -------------------------------------------------- - -To send a Scrapy request through Zyte API with manually-defined parameters, -define your Zyte API parameters in the ``zyte_api`` key in -`Request.meta `_ -as a ``dict``. - -The only exception is the ``url`` parameter, which should not be defined as a -Zyte API parameter. The value from ``Request.url`` is used automatically. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "browserHtml": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -Note that response headers are necessary for raw response decoding. When -defining parameters manually and requesting ``httpResponseBody`` extraction, -remember to also request ``httpResponseHeaders`` extraction: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -To learn more about Zyte API parameters, see the `data extraction usage`_ and -`API reference`_ pages of the `Zyte API documentation`_. - -.. _API reference: https://docs.zyte.com/zyte-api/openapi.html -.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html -.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html - - -Sending requests with automatically-mapped parameters ------------------------------------------------------ - -To send a Scrapy request through Zyte API letting Zyte API parameters be -automatically chosen based on the parameters of that Scrapy request, set the -``zyte_api_automap`` key in -`Request.meta `_ -to ``True``. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api_automap": True, - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -See also `Using transparent mode`_ and `Automated request parameter mapping`_. - - -Response mapping ----------------- - -Zyte API responses are mapped with one of the following classes: - -- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of - ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses - with ``browserHtml`` or responses with both ``httpResponseBody`` and - ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). - -- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of - ``scrapy.http.Response``, is used to map any other response. - -Zyte API response parameters are mapped into response class attributes where -possible: - -- ``url`` becomes ``response.url``. - -- ``statusCode`` becomes ``response.status``. - -- ``httpResponseHeaders`` and ``responseCookies`` become - ``response.headers``. - -- ``responseCookies`` is also mapped into the request cookiejar. - -- ``browserHtml`` and ``httpResponseBody`` are mapped into both - ``response.text`` (``str``) and ``response.body`` (``bytes``). - - If none of these parameters were present, e.g. if the only requested output - was ``screenshot``, ``response.text`` and ``response.body`` would be empty. - - If a future version of Zyte API supported requesting both outputs on the - same request, and both parameters were present, ``browserHtml`` would be - the one mapped into ``response.text`` and ``response.body``. - -Both response classes have a ``raw_api_response`` attribute that contains a -``dict`` with the complete, raw response from Zyte API, where you can find all -Zyte API response parameters, including those that are not mapped into other -response class atttributes. - -For example, for a request for ``httpResponseBody`` and -``httpResponseHeaders``, you would get: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {b"Content-Type": [b"text/html"], …} - print(response.text) - # "…" - print(response.body) - # b"…" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", - # "httpResponseHeaders": […], - # } - -For a request for ``screenshot``, on the other hand, the response would look -as follows: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {} - print(response.text) - # "" - print(response.body) - # b"" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "screenshot": "iVBORw0KGgoAAAANSUh…", - # } - from base64 import b64decode - print(b64decode(response.raw_api_response["screenshot"])) - # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' - - -Automated request parameter mapping ------------------------------------ - -When you enable automated request parameter mapping, be it through transparent -mode (see `Using transparent mode`_) or for a specific request (see -`Sending requests with automatically-mapped parameters`_), Zyte API -parameters are chosen as follows by default: - -- ``Request.url`` becomes ``url``, same as in requests with manually-defined - parameters. - -- If ``Request.method`` is something other than ``"GET"``, it becomes - ``httpRequestMethod``. - -- ``Request.headers`` become ``customHttpRequestHeaders``. - -- ``Request.body`` becomes ``httpRequestBody``. - -- If the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and provided - request metadata does not set dont_merge_cookies_ to ``True``: - - .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED - .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies - - - ``responseCookies`` is set to ``True``. - - - Cookies from the request `cookie jar`_ become ``requestCookies``. - - .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar - - All cookies from the cookie jar are set, regardless of their cookie - domain. This is because Zyte API requests may involve requests to - different domains (e.g. when following cross-domain redirects, or - during browser rendering). - - If the cookies to be set exceed the limit defined in the - ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, - and only as many cookies as the limit allows are set for the target - request. To silence this warning, set ``requestCookies`` manually, e.g. - to an empty dict. Alternatively, if Zyte API starts supporting more - than 100 request cookies, update the ``ZYTE_API_MAX_COOKIES`` setting - accordingly. - - If you are using a custom downloader middleware to handle request - cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to - its import path to make scrapy-zyte-api work with it. The downloader - middleware is expected to have a ``jars`` property with the same - signature as in the built-in Scrapy downloader middleware for cookie - handling. - -- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. - - This is subject to change without prior notice in future versions of - scrapy-zyte-api, so please account for the following: - - - If you are requesting a binary resource, such as a PDF file or an - image file, set ``httpResponseBody`` to ``True`` explicitly in your - requests: - - .. code-block:: python - - Request( - url="https://toscrape.com/img/zyte.png", - meta={ - "zyte_api_automap": {"httpResponseBody": True}, - }, - ) - - In the future, we may stop setting ``httpResponseBody`` to ``True`` by - default, and instead use a different, new Zyte API parameter that only - works for non-binary responses (e.g. HMTL, JSON, plain text). - - - If you need to access response headers, be it through - ``response.headers`` or through - ``response.raw_api_response["httpResponseHeaders"]``, set - ``httpResponseHeaders`` to ``True`` explicitly in your requests: - - .. code-block:: python - - Request( - url="https://toscrape.com/", - meta={ - "zyte_api_automap": {"httpResponseHeaders": True}, - }, - ) - - At the moment we request response headers because some response headers - are necessary to properly decode the response body as text. In the - future, Zyte API may be able to handle this decoding automatically, so - we would stop setting ``httpResponseHeaders`` to ``True`` by default. - -For example, the following Scrapy request: - -.. code-block:: python - - Request( - method="POST" - url="https://httpbin.org/anything", - headers={"Content-Type": "application/json"}, - body=b'{"foo": "bar"}', - cookies={"a": "b"}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. code-block:: javascript - - { - "customHttpRequestHeaders": [ - { - "name": "Content-Type", - "value": "application/json" - } - ], - "httpResponseBody": true, - "httpResponseHeaders": true, - "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", - "httpRequestMethod": "POST", - "requestCookies": [ - { - "name": "a", - "value": "b", - "domain": "" - } - ], - "responseCookies": true, - "url": "https://httpbin.org/anything" - } - -You may set the ``zyte_api_automap`` key in -`Request.meta `_ -to a ``dict`` of Zyte API parameters to extend or override choices made by -automated request parameter mapping. - -Enabling ``browserHtml``, ``screenshot``, or an automatic extraction property, -unsets ``httpResponseBody`` and ``httpResponseHeaders``, and makes -``Request.headers`` become ``requestHeaders`` instead of -``customHttpRequestHeaders``. For example, the following Scrapy request: - -.. code-block:: python - - Request( - url="https://quotes.toscrape.com", - headers={"Referer": "https://example.com/"}, - meta={"zyte_api_automap": {"browserHtml": True}}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. code-block:: javascript - - { - "browserHtml": true, - "requestHeaders": {"referer": "https://example.com/"}, - "responseCookies": true, - "url": "https://quotes.toscrape.com" - } - -When mapping headers, headers not supported by Zyte API are excluded from the -mapping by default. Use the following `Scrapy settings`_ to change which -headers are included or excluded from header mapping: - -.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html - -- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as - ``customHttpRequestHeaders``, and its default value is: - - .. code-block:: python - - ["User-Agent"] - -- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as - ``requestHeaders``. It is a ``dict``, where keys are header names and - values are the key that represents them in ``requestHeaders``. Its default - value is: - - .. code-block:: python - - {"Referer": "referer"} - -To maximize support for potential future changes in Zyte API, automated -request parameter mapping allows some parameter values and parameter -combinations that Zyte API does not currently support, and may never support: - -- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ - ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. - - .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method - -- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` - to force their mapping from ``Request.headers`` in scenarios where they - would not be mapped otherwise. - - Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` - to ``False`` to prevent their mapping from ``Request.headers``. - -- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` - is unset. - -- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), - and not set ``browserHtml`` or ``screenshot`` to ``True``. In this case, - ``Request.headers`` is mapped as ``requestHeaders``. - -- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml`` - or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped - both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and - ``browserHtml`` is used as the Scrapy response body. - - -Setting default parameters -========================== - -Often the same configuration needs to be used for all Zyte API requests. For -example, all requests may need to set the same geolocation, or the spider only -uses ``browserHtml`` requests. - -The following settings allow you to define Zyte API parameters to be included -in all requests: - -- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with - manually-defined parameters. See `Sending requests with manually-defined parameters`_. - - You may set the ``zyte_api`` request meta key to an empty ``dict`` to only - use default parameters for that request. - -- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with - automatically-mapped parameters. - See `Sending requests with automatically-mapped parameters`_. - -For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to -``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, -``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. - -Parameters in these settings are merged with request-specific parameters, with -request-specific parameters taking precedence. - -``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated -request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on -requests that use manually-defined parameters. - -When using transparent mode (see `Using transparent mode`_), be careful -of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``. In -transparent mode, all Scrapy requests go through Zyte API, even requests that -Scrapy sends automatically, such as those for ``robots.txt`` files when -ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap -spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not -meant to be used for every single request. - -If the ``zyte_api_default_params`` request meta key is set to ``False``, the -value of the ``ZYTE_API_DEFAULT_PARAMS`` setting for this request is ignored. - -.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey -.. _sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider - - -Customizing the retry policy -============================ - -API requests are retried automatically using the default retry policy of -`python-zyte-api`_. - -API requests that exceed retries are dropped. You cannot manage API request -retries through Scrapy downloader middlewares. - -Use the ``ZYTE_API_RETRY_POLICY`` setting or the ``zyte_api_retry_policy`` -request meta key to override the default `python-zyte-api`_ retry policy with a -custom retry policy. - -A custom retry policy must be an instance of `tenacity.AsyncRetrying`_. - -Scrapy settings must be picklable, which `retry policies are not -`_, so you cannot assign retry -policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use -their import path string instead. - -When setting a retry policy through request meta, you can assign the -``zyte_api_retry_policy`` request meta key either the retry policy object -itself or its import path string. If you need your requests to be serializable, -however, you may also need to use the import path string. - -For example, to increase the maximum number of retries to 10 before dropping -the API request, you can subclass RetryFactory_ as follows: - -.. code-block:: python - - # project/retry_policies.py - from tenacity import stop_after_attempt - from zyte_api.aio.retry import RetryFactory - - class CustomRetryFactory(RetryFactory): - temporary_download_error_stop = stop_after_attempt(10) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - - -To extend this retry policy, so it will also retry HTTP 521 errors, the same -as HTTP 520 errors, you can implement: - -.. code-block:: python - - # project/retry_policies.py - from tenacity import retry_if_exception, RetryCallState, stop_after_attempt - from zyte_api.aio.errors import RequestError - from zyte_api.aio.retry import RetryFactory - - def is_http_521(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 521 - - class CustomRetryFactory(RetryFactory): - - retry_condition = ( - RetryFactory.retry_condition - | retry_if_exception(is_http_521) - ) - temporary_download_error_stop = stop_after_attempt(10) - - def wait(self, retry_state: RetryCallState) -> float: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_wait(retry_state=retry_state) - return super().wait(retry_state) - - def stop(self, retry_state: RetryCallState) -> bool: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_stop(retry_state) - return super().stop(retry_state) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - -.. _python-zyte-api: https://github.com/zytedata/python-zyte-api -.. _RetryFactory: https://github.com/zytedata/python-zyte-api/blob/main/zyte_api/aio/retry.py -.. _tenacity.AsyncRetrying: https://tenacity.readthedocs.io/en/latest/api.html#tenacity.AsyncRetrying - - -Misc settings -============= - -- ``ZYTE_API_MAX_REQUESTS`` - - Default: ``None`` - - When set to an integer value > 0, the spider will close when the number of - successful Zyte API requests reaches it. Note that in some cases, the actual - number of successful Zyte API requests would be below this number if some of - the in-progress requests fail or error out. - - -Stats -===== - -Stats from python-zyte-api_ are exposed as Scrapy stats with the -``scrapy-zyte-api`` prefix. - -For example, ``scrapy-zyte-api/status_codes/`` stats indicate the -status code of Zyte API responses (e.g. ``429`` for `rate limiting -`_ or -``520`` for `temporary download errors -`_). - -.. note:: The actual status code that is received from the target website, i.e. - the `statusCode - `_ - response field of a `Zyte API successful response - `_, - is accounted for in the ``downloader/response_status_count/`` - stat, as with any other Scrapy response. - - -Request fingerprinting -====================== - -The request fingerprinter class of this plugin ensures that Scrapy 2.7 and -later generate unique `request fingerprints -`_ -for Zyte API requests based on some of their parameters. - -For example, a request for ``browserHtml`` and a request for ``screenshot`` -with the same target URL are considered different requests. Similarly, requests -with the same target URL but different ``actions`` are also considered -different requests. - -Zyte API parameters that affect request fingerprinting ------------------------------------------------------- - -The request fingerprinter class of this plugin generates request fingerprints -for Zyte API requests based on the following Zyte API parameters: - -- ``url`` (`canonicalized `_) - - For URLs that include a URL fragment, like ``https://example.com#foo``, URL - canonicalization keeps the URL fragment if ``browserHtml`` or - ``screenshot`` are enabled. - -- Request attribute parameters (``httpRequestBody``, - ``httpRequestMethod``) - -- Output parameters (``browserHtml``, ``httpResponseBody``, - ``httpResponseHeaders``, ``screenshot``) - -- Rendering option parameters (``actions``, ``javascript``, - ``screenshotOptions``) - -- ``geolocation`` - -The following Zyte API parameters are *not* taken into account for request -fingerprinting: - -- Request header parameters (``customHttpRequestHeaders``, - ``requestHeaders``, ``requestCookies``) - -- Metadata parameters (``echoData``, ``jobId``) - -- Experimental parameters (``experimental``) - - -Changing the fingerprinting of non-Zyte-API requests ----------------------------------------------------- - -You can assign a request fingerprinter class to the -``ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` Scrapy setting to configure -a custom request fingerprinter class to use for requests that do not go through -Zyte API: - -.. code-block:: python - - ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS = "custom.RequestFingerprinter" - -By default, requests that do not go through Zyte API use the default request -fingerprinter class of the installed Scrapy version. - - -Request fingerprinting before Scrapy 2.7 ----------------------------------------- - -If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not -taken into account for request fingerprinting. This can cause some Scrapy -components, like the filter of duplicate requests or the HTTP cache extension, -to interpret 2 different requests as being the same. - -To avoid most issues, use automated request parameter mapping, either through -transparent mode or setting ``zyte_api_automap`` to ``True`` in -``Request.meta``, and then use ``Request`` attributes instead of -``Request.meta`` as much as possible. Unlike ``Request.meta``, ``Request`` -attributes do affect request fingerprints in Scrapy versions older than Scrapy -2.7. - -For requests that must have the same ``Request`` attributes but should still -be considered different, such as browser-based requests with different URL -fragments, you can set ``dont_filter`` to ``True`` on ``Request.meta`` to -prevent the duplicate filter of Scrapy to filter any of them out. For example: - -.. code-block:: python - - yield Request( - "https://toscrape.com#1", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - yield Request( - "https://toscrape.com#2", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - -Note, however, that for other Scrapy components, like the HTTP cache -extensions, these 2 requests would still be considered identical. - - -Logging request parameters -========================== - -Set the ``ZYTE_API_LOG_REQUESTS`` setting to ``True`` and the ``LOG_LEVEL`` -setting to ``"DEBUG"`` to enable the logging of debug messages that indicate -the JSON object sent on every extract request to Zyte API. - -For example:: - - Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} - -The ``ZYTE_API_LOG_REQUESTS_TRUNCATE``, 64 by default, determines the maximum -length of any string value in the logged JSON object, excluding object keys. To -disable truncation, set it to 0. - -scrapy-poet integration -======================= - -``scrapy-zyte-api`` includes a `scrapy-poet provider`_ that you can use to get -data from Zyte API in page objects. It requires additional dependencies which -you can get by installing the optional ``provider`` feature: -``pip install scrapy-zyte-api[provider]``. Enable the provider in the Scrapy -settings:: - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Request some supported dependencies in the page object:: - - @attrs.define - class ProductPage(BasePage): - response: BrowserResponse - product: Product - - - class ZyteApiSpider(scrapy.Spider): - ... - - def parse_page(self, response: DummyResponse, page: ProductPage): - ... - -Or request them directly in the callback:: - - class ZyteApiSpider(scrapy.Spider): - ... - - def parse_page(self, - response: DummyResponse, - browser_response: BrowserResponse, - product: Product, - ): - ... - -The currently supported dependencies are: - -* ``web_poet.BrowserHtml`` -* ``web_poet.BrowserResponse`` -* ``zyte_common_items.Product`` -* ``zyte_common_items.ProductList`` -* ``zyte_common_items.ProductNavigation`` -* ``zyte_common_items.Article`` -* ``zyte_common_items.ArticleList`` -* ``zyte_common_items.ArticleNavigation`` - -The provider will make a request to Zyte API using the ``ZYTE_API_KEY`` and -``ZYTE_API_URL`` settings. - -The provider will ignore the transparent mode and parameter mapping settings. -To add extra parameters to all Zyte API requests sent by the provider, set them -as a dictionary through the ``ZYTE_API_PROVIDER_PARAMS`` setting, for example -in ``settings.py``:: - - ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"} - -When the ``ZYTE_API_PROVIDER_PARAMS`` setting includes one of the Zyte API -extraction options (e.g. ``productOptions`` for ``product``), but the -final Zyte API request doesn't include the corresponding data type, the -unused options are automatically removed. So, it's safe to use -``ZYTE_API_PROVIDER_PARAMS`` to set the default options for various extraction -types, e.g.:: - - ZYTE_API_PROVIDER_PARAMS = { - "productOptions": {"extractFrom": "httpResponseBody"}, - "productNavigationOptions": {"extractFrom": "httpResponseBody"}, - } - -Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a -priority of 2000, so when you have page objects producing -``zyte_common_items.Product`` items you should use higher values for -``ZyteApiProvider`` if you want these items to come from these page objects, -and lower values if you want them to come from Zyte API. - -Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``, -it may make more requests than is optimal: the normal Scrapy response will be -always requested even when using a ``DummyResponse`` annotation, and in some -dependency combinations two Zyte API requests will be made for the same page. -We are planning to solve these problems in the future releases of -``scrapy-poet`` and ``scrapy-zyte-api``. - -.. _scrapy-poet provider: https://scrapy-poet.readthedocs.io/en/stable/providers.html - - -Running behind a proxy -====================== - -If you require a proxy to access Zyte API (e.g. a corporate proxy), configure -the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and -set the ``ZYTE_API_USE_ENV_PROXY`` setting to ``True``. -||||||| 5e205ef -Requirements -============ - -* Python 3.7+ -* Scrapy 2.0.1+ - -scrapy-poet integration requires more recent software: - -* Python 3.8+ -* Scrapy 2.6+ - -Installation -============ - -.. code-block:: - - pip install scrapy-zyte-api - - -Quick start -=========== - -Get a `Zyte API`_ key, and add it to your project settings.py: - -.. code-block:: python - - ZYTE_API_KEY = "YOUR_API_KEY" - -Instead of adding API key to setting.py you can also set -``ZYTE_API_KEY`` environment variable. - -Then, set up the scrapy-zyte-api integration: - -.. code-block:: python - - DOWNLOAD_HANDLERS = { - "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - } - DOWNLOADER_MIDDLEWARES = { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, - } - REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" - TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" - -By default, scrapy-zyte-api doesn't change the spider behavior. -To switch your spider to use Zyte API for all requests, -set the following option: - -.. code-block:: python - - ZYTE_API_TRANSPARENT_MODE = True - -Configuration -============= - -To enable this plugin: - -- Set the ``http`` and ``https`` keys in the `DOWNLOAD_HANDLERS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIDownloadHandler"``. - -- Add ``"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware"`` to the - `DOWNLOADER_MIDDLEWARES - `_ - Scrapy setting with any value, e.g. ``1000``. - -- Set the `REQUEST_FINGERPRINTER_CLASS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"``. - -- Set the `TWISTED_REACTOR - `_ - Scrapy setting to - ``"twisted.internet.asyncioreactor.AsyncioSelectorReactor"``. - - .. note:: On existing projects that were not using the asyncio Twisted - reactor, your existing code may need changes, such as: - - - `Handling a pre-installed Twisted reactor - `_. - - Some Twisted imports install the default, non-asyncio Twisted - reactor as a side effect. Once a reactor is installed, it cannot be - changed for the whole run time. - - - `Converting Twisted Deferreds into asyncio Futures - `_. - - Note that you might be using Deferreds without realizing it through - some Scrapy functions and methods. For example, when you yield the - return value of ``self.crawler.engine.download()`` from a spider - callback, you are yielding a Deferred. - -- Set `your Zyte API key - `_ as - either the ``ZYTE_API_KEY`` Scrapy setting or as an environment variable of - the same name. - -The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to -``False`` to disable this plugin. - -If you want to use scrapy-poet integration, add a provider to -``SCRAPY_POET_PROVIDERS`` (see `scrapy-poet integration`_): - -.. code-block:: python - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Usage -===== - -You can send requests through Zyte API in one of the following ways: - -- Send all request through Zyte API by default, letting Zyte API parameters - be chosen automatically based on your Scrapy request parameters. See - `Using transparent mode`_. - -- Send specific requests through Zyte API, setting all Zyte API parameters - manually, keeping full control of what is sent to Zyte API. - See `Sending requests with manually-defined parameters`_. - -- Send specific requests through Zyte API, letting Zyte API parameters be - chosen automatically based on your Scrapy request parameters. - See `Sending requests with automatically-mapped parameters`_. - -Zyte API response parameters are mapped into Scrapy response parameters where -possible. See `Response mapping`_ for details. - - -Using transparent mode ----------------------- - -Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle -Scrapy requests as follows: - -.. _Scrapy setting: https://docs.scrapy.org/en/latest/topics/settings.html - -- By default, requests are sent through Zyte API with automatically-mapped - parameters. See `Sending requests with automatically-mapped parameters`_ - for details about automatic request parameter mapping. - - You do not need to set the ``zyte_api_automap`` request meta key to - ``True``, but you can set it to a dictionary to extend your Zyte API - request parameters. - -- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent - through Zyte API with manually-defined parameters. - See `Sending requests with manually-defined parameters`_. - -- Requests with the ``zyte_api_automap`` request meta key set to ``False`` - are *not* sent through Zyte API. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - start_urls = ["https://quotes.toscrape.com/"] - - custom_settings = { - "ZYTE_API_TRANSPARENT_MODE": True, - } - - def parse(self, response): - print(response.text) - # "…" - - -Sending requests with manually-defined parameters -------------------------------------------------- - -To send a Scrapy request through Zyte API with manually-defined parameters, -define your Zyte API parameters in the ``zyte_api`` key in -`Request.meta `_ -as a ``dict``. - -The only exception is the ``url`` parameter, which should not be defined as a -Zyte API parameter. The value from ``Request.url`` is used automatically. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "browserHtml": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -Note that response headers are necessary for raw response decoding. When -defining parameters manually and requesting ``httpResponseBody`` extraction, -remember to also request ``httpResponseHeaders`` extraction: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -To learn more about Zyte API parameters, see the `data extraction usage`_ and -`API reference`_ pages of the `Zyte API documentation`_. - -.. _API reference: https://docs.zyte.com/zyte-api/openapi.html -.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html -.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html - - -Sending requests with automatically-mapped parameters ------------------------------------------------------ - -To send a Scrapy request through Zyte API letting Zyte API parameters be -automatically chosen based on the parameters of that Scrapy request, set the -``zyte_api_automap`` key in -`Request.meta `_ -to ``True``. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api_automap": True, - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -See also `Using transparent mode`_ and `Automated request parameter mapping`_. - - -Response mapping ----------------- - -Zyte API responses are mapped with one of the following classes: - -- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of - ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses - with ``browserHtml`` or responses with both ``httpResponseBody`` and - ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). - -- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of - ``scrapy.http.Response``, is used to map any other response. - -Zyte API response parameters are mapped into response class attributes where -possible: - -- ``url`` becomes ``response.url``. - -- ``statusCode`` becomes ``response.status``. - -- ``httpResponseHeaders`` and ``experimental.responseCookies`` become - ``response.headers``. - -- ``experimental.responseCookies`` is also mapped into the request cookiejar. - -- ``browserHtml`` and ``httpResponseBody`` are mapped into both - ``response.text`` (``str``) and ``response.body`` (``bytes``). - - If none of these parameters were present, e.g. if the only requested output - was ``screenshot``, ``response.text`` and ``response.body`` would be empty. - - If a future version of Zyte API supported requesting both outputs on the - same request, and both parameters were present, ``browserHtml`` would be - the one mapped into ``response.text`` and ``response.body``. - -Both response classes have a ``raw_api_response`` attribute that contains a -``dict`` with the complete, raw response from Zyte API, where you can find all -Zyte API response parameters, including those that are not mapped into other -response class atttributes. - -For example, for a request for ``httpResponseBody`` and -``httpResponseHeaders``, you would get: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {b"Content-Type": [b"text/html"], …} - print(response.text) - # "…" - print(response.body) - # b"…" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", - # "httpResponseHeaders": […], - # } - -For a request for ``screenshot``, on the other hand, the response would look -as follows: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {} - print(response.text) - # "" - print(response.body) - # b"" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "screenshot": "iVBORw0KGgoAAAANSUh…", - # } - from base64 import b64decode - print(b64decode(response.raw_api_response["screenshot"])) - # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' - - -Automated request parameter mapping ------------------------------------ - -When you enable automated request parameter mapping, be it through transparent -mode (see `Using transparent mode`_) or for a specific request (see -`Sending requests with automatically-mapped parameters`_), Zyte API -parameters are chosen as follows by default: - -- ``Request.url`` becomes ``url``, same as in requests with manually-defined - parameters. - -- If ``Request.method`` is something other than ``"GET"``, it becomes - ``httpRequestMethod``. - -- ``Request.headers`` become ``customHttpRequestHeaders``. - -- ``Request.body`` becomes ``httpRequestBody``. - -- If the ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` Scrapy setting is - ``True``, the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and - provided request metadata does not set dont_merge_cookies_ to ``True``: - - .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED - .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies - - - ``experimental.responseCookies`` is set to ``True``. - - - Cookies from the request `cookie jar`_ become - ``experimental.requestCookies``. - - .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar - - All cookies from the cookie jar are set, regardless of their cookie - domain. This is because Zyte API requests may involve requests to - different domains (e.g. when following cross-domain redirects, or - during browser rendering). - - If the cookies to be set exceed the limit defined in the - ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, - and only as many cookies as the limit allows are set for the target - request. To silence this warning, set ``experimental.requestCookies`` - manually, e.g. to an empty dict. Alternatively, if Zyte API starts - supporting more than 100 request cookies, update the - ``ZYTE_API_MAX_COOKIES`` setting accordingly. - - If you are using a custom downloader middleware to handle request - cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to - its import path to make scrapy-zyte-api work with it. The downloader - middleware is expected to have a ``jars`` property with the same - signature as in the built-in Scrapy downloader middleware for cookie - handling. - -- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. - - This is subject to change without prior notice in future versions of - scrapy-zyte-api, so please account for the following: - - - If you are requesting a binary resource, such as a PDF file or an - image file, set ``httpResponseBody`` to ``True`` explicitly in your - requests: - - .. code-block:: python - - Request( - url="https://toscrape.com/img/zyte.png", - meta={ - "zyte_api_automap": {"httpResponseBody": True}, - }, - ) - - In the future, we may stop setting ``httpResponseBody`` to ``True`` by - default, and instead use a different, new Zyte API parameter that only - works for non-binary responses (e.g. HMTL, JSON, plain text). - - - If you need to access response headers, be it through - ``response.headers`` or through - ``response.raw_api_response["httpResponseHeaders"]``, set - ``httpResponseHeaders`` to ``True`` explicitly in your requests: - - .. code-block:: python - - Request( - url="https://toscrape.com/", - meta={ - "zyte_api_automap": {"httpResponseHeaders": True}, - }, - ) - - At the moment we request response headers because some response headers - are necessary to properly decode the response body as text. In the - future, Zyte API may be able to handle this decoding automatically, so - we would stop setting ``httpResponseHeaders`` to ``True`` by default. - -For example, the following Scrapy request: - -.. code-block:: python - - Request( - method="POST" - url="https://httpbin.org/anything", - headers={"Content-Type": "application/json"}, - body=b'{"foo": "bar"}', - cookies={"a": "b"}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. code-block:: javascript - - { - "customHttpRequestHeaders": [ - { - "name": "Content-Type", - "value": "application/json" - } - ], - "experimental": { - "requestCookies": [ - { - "name": "a", - "value": "b", - "domain": "" - } - ], - "responseCookies": true - }, - "httpResponseBody": true, - "httpResponseHeaders": true, - "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", - "httpRequestMethod": "POST", - "url": "https://httpbin.org/anything" - } - -You may set the ``zyte_api_automap`` key in -`Request.meta `_ -to a ``dict`` of Zyte API parameters to extend or override choices made by -automated request parameter mapping. - -Enabling ``browserHtml``, ``screenshot``, or an automatic extraction property, -unsets ``httpResponseBody`` and ``httpResponseHeaders``, and makes -``Request.headers`` become ``requestHeaders`` instead of -``customHttpRequestHeaders``. For example, the following Scrapy request: - -.. code-block:: python - - Request( - url="https://quotes.toscrape.com", - headers={"Referer": "https://example.com/"}, - meta={"zyte_api_automap": {"browserHtml": True}}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. code-block:: javascript - - { - "browserHtml": true, - "experimental": { - "responseCookies": true - }, - "requestHeaders": {"referer": "https://example.com/"}, - "url": "https://quotes.toscrape.com" - } - -When mapping headers, headers not supported by Zyte API are excluded from the -mapping by default. Use the following `Scrapy settings`_ to change which -headers are included or excluded from header mapping: - -.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html - -- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as - ``customHttpRequestHeaders``, and its default value is: - - .. code-block:: python - - ["User-Agent"] - -- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as - ``requestHeaders``. It is a ``dict``, where keys are header names and - values are the key that represents them in ``requestHeaders``. Its default - value is: - - .. code-block:: python - - {"Referer": "referer"} - -To maximize support for potential future changes in Zyte API, automated -request parameter mapping allows some parameter values and parameter -combinations that Zyte API does not currently support, and may never support: - -- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ - ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. - - .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method - -- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` - to force their mapping from ``Request.headers`` in scenarios where they - would not be mapped otherwise. - - Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` - to ``False`` to prevent their mapping from ``Request.headers``. - -- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` - is unset. - -- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), - and not set ``browserHtml`` or ``screenshot`` to ``True``. In this case, - ``Request.headers`` is mapped as ``requestHeaders``. - -- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml`` - or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped - both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and - ``browserHtml`` is used as the Scrapy response body. - - -Setting default parameters -========================== - -Often the same configuration needs to be used for all Zyte API requests. For -example, all requests may need to set the same geolocation, or the spider only -uses ``browserHtml`` requests. - -The following settings allow you to define Zyte API parameters to be included -in all requests: - -- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with - manually-defined parameters. See `Sending requests with manually-defined parameters`_. - - You may set the ``zyte_api`` request meta key to an empty ``dict`` to only - use default parameters for that request. - -- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with - automatically-mapped parameters. - See `Sending requests with automatically-mapped parameters`_. - -For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to -``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, -``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. - -Parameters in these settings are merged with request-specific parameters, with -request-specific parameters taking precedence. - -``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated -request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on -requests that use manually-defined parameters. - -When using transparent mode (see `Using transparent mode`_), be careful -of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``. In -transparent mode, all Scrapy requests go through Zyte API, even requests that -Scrapy sends automatically, such as those for ``robots.txt`` files when -ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap -spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not -meant to be used for every single request. - -If the ``zyte_api_default_params`` request meta key is set to ``False``, the -value of the ``ZYTE_API_DEFAULT_PARAMS`` setting for this request is ignored. - -.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey -.. _sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider - - -Customizing the retry policy -============================ - -API requests are retried automatically using the default retry policy of -`python-zyte-api`_. - -API requests that exceed retries are dropped. You cannot manage API request -retries through Scrapy downloader middlewares. - -Use the ``ZYTE_API_RETRY_POLICY`` setting or the ``zyte_api_retry_policy`` -request meta key to override the default `python-zyte-api`_ retry policy with a -custom retry policy. - -A custom retry policy must be an instance of `tenacity.AsyncRetrying`_. - -Scrapy settings must be picklable, which `retry policies are not -`_, so you cannot assign retry -policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use -their import path string instead. - -When setting a retry policy through request meta, you can assign the -``zyte_api_retry_policy`` request meta key either the retry policy object -itself or its import path string. If you need your requests to be serializable, -however, you may also need to use the import path string. - -For example, to increase the maximum number of retries to 10 before dropping -the API request, you can subclass RetryFactory_ as follows: - -.. code-block:: python - - # project/retry_policies.py - from tenacity import stop_after_attempt - from zyte_api.aio.retry import RetryFactory - - class CustomRetryFactory(RetryFactory): - temporary_download_error_stop = stop_after_attempt(10) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - - -To extend this retry policy, so it will also retry HTTP 521 errors, the same -as HTTP 520 errors, you can implement: - -.. code-block:: python - - # project/retry_policies.py - from tenacity import retry_if_exception, RetryCallState, stop_after_attempt - from zyte_api.aio.errors import RequestError - from zyte_api.aio.retry import RetryFactory - - def is_http_521(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 521 - - class CustomRetryFactory(RetryFactory): - - retry_condition = ( - RetryFactory.retry_condition - | retry_if_exception(is_http_521) - ) - temporary_download_error_stop = stop_after_attempt(10) - - def wait(self, retry_state: RetryCallState) -> float: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_wait(retry_state=retry_state) - return super().wait(retry_state) - - def stop(self, retry_state: RetryCallState) -> bool: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_stop(retry_state) - return super().stop(retry_state) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - -.. _python-zyte-api: https://github.com/zytedata/python-zyte-api -.. _RetryFactory: https://github.com/zytedata/python-zyte-api/blob/main/zyte_api/aio/retry.py -.. _tenacity.AsyncRetrying: https://tenacity.readthedocs.io/en/latest/api.html#tenacity.AsyncRetrying - - -Misc settings -============= - -- ``ZYTE_API_MAX_REQUESTS`` - - Default: ``None`` - - When set to an integer value > 0, the spider will close when the number of - successful Zyte API requests reaches it. Note that in some cases, the actual - number of successful Zyte API requests would be below this number if some of - the in-progress requests fail or error out. - - -Stats -===== - -Stats from python-zyte-api_ are exposed as Scrapy stats with the -``scrapy-zyte-api`` prefix. - -For example, ``scrapy-zyte-api/status_codes/`` stats indicate the -status code of Zyte API responses (e.g. ``429`` for `rate limiting -`_ or -``520`` for `temporary download errors -`_). - -.. note:: The actual status code that is received from the target website, i.e. - the `statusCode - `_ - response field of a `Zyte API successful response - `_, - is accounted for in the ``downloader/response_status_count/`` - stat, as with any other Scrapy response. - - -Request fingerprinting -====================== - -The request fingerprinter class of this plugin ensures that Scrapy 2.7 and -later generate unique `request fingerprints -`_ -for Zyte API requests based on some of their parameters. - -For example, a request for ``browserHtml`` and a request for ``screenshot`` -with the same target URL are considered different requests. Similarly, requests -with the same target URL but different ``actions`` are also considered -different requests. - -Zyte API parameters that affect request fingerprinting ------------------------------------------------------- - -The request fingerprinter class of this plugin generates request fingerprints -for Zyte API requests based on the following Zyte API parameters: - -- ``url`` (`canonicalized `_) - - For URLs that include a URL fragment, like ``https://example.com#foo``, URL - canonicalization keeps the URL fragment if ``browserHtml`` or - ``screenshot`` are enabled. - -- Request attribute parameters (``httpRequestBody``, - ``httpRequestMethod``) - -- Output parameters (``browserHtml``, ``httpResponseBody``, - ``httpResponseHeaders``, ``screenshot``) - -- Rendering option parameters (``actions``, ``javascript``, - ``screenshotOptions``) - -- ``geolocation`` - -The following Zyte API parameters are *not* taken into account for request -fingerprinting: - -- Request header parameters (``customHttpRequestHeaders``, - ``requestHeaders``) - -- Metadata parameters (``echoData``, ``jobId``) - -- Experimental parameters (``experimental``) - - -Changing the fingerprinting of non-Zyte-API requests ----------------------------------------------------- - -You can assign a request fingerprinter class to the -``ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` Scrapy setting to configure -a custom request fingerprinter class to use for requests that do not go through -Zyte API: - -.. code-block:: python - - ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS = "custom.RequestFingerprinter" - -By default, requests that do not go through Zyte API use the default request -fingerprinter class of the installed Scrapy version. - - -Request fingerprinting before Scrapy 2.7 ----------------------------------------- - -If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not -taken into account for request fingerprinting. This can cause some Scrapy -components, like the filter of duplicate requests or the HTTP cache extension, -to interpret 2 different requests as being the same. - -To avoid most issues, use automated request parameter mapping, either through -transparent mode or setting ``zyte_api_automap`` to ``True`` in -``Request.meta``, and then use ``Request`` attributes instead of -``Request.meta`` as much as possible. Unlike ``Request.meta``, ``Request`` -attributes do affect request fingerprints in Scrapy versions older than Scrapy -2.7. - -For requests that must have the same ``Request`` attributes but should still -be considered different, such as browser-based requests with different URL -fragments, you can set ``dont_filter`` to ``True`` on ``Request.meta`` to -prevent the duplicate filter of Scrapy to filter any of them out. For example: - -.. code-block:: python - - yield Request( - "https://toscrape.com#1", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - yield Request( - "https://toscrape.com#2", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - -Note, however, that for other Scrapy components, like the HTTP cache -extensions, these 2 requests would still be considered identical. - - -Logging request parameters -========================== - -Set the ``ZYTE_API_LOG_REQUESTS`` setting to ``True`` and the ``LOG_LEVEL`` -setting to ``"DEBUG"`` to enable the logging of debug messages that indicate -the JSON object sent on every extract request to Zyte API. - -For example:: - - Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} - -The ``ZYTE_API_LOG_REQUESTS_TRUNCATE``, 64 by default, determines the maximum -length of any string value in the logged JSON object, excluding object keys. To -disable truncation, set it to 0. - -scrapy-poet integration -======================= - -``scrapy-zyte-api`` includes a `scrapy-poet provider`_ that you can use to get -data from Zyte API in page objects. It requires additional dependencies which -you can get by installing the optional ``provider`` feature: -``pip install scrapy-zyte-api[provider]``. Enable the provider in the Scrapy -settings:: - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Request some supported dependencies in the page object:: - - @attrs.define - class ProductPage(BasePage): - response: BrowserResponse - product: Product - - - class ZyteApiSpider(scrapy.Spider): - ... - - def parse_page(self, response: DummyResponse, page: ProductPage): - ... - -Or request them directly in the callback:: - - class ZyteApiSpider(scrapy.Spider): - ... - - def parse_page(self, - response: DummyResponse, - browser_response: BrowserResponse, - product: Product, - ): - ... - -The currently supported dependencies are: - -* ``web_poet.BrowserHtml`` -* ``web_poet.BrowserResponse`` -* ``zyte_common_items.Product`` -* ``zyte_common_items.ProductList`` -* ``zyte_common_items.ProductNavigation`` -* ``zyte_common_items.Article`` -* ``zyte_common_items.ArticleList`` -* ``zyte_common_items.ArticleNavigation`` - -The provider will make a request to Zyte API using the ``ZYTE_API_KEY`` and -``ZYTE_API_URL`` settings. - -The provider will ignore the transparent mode and parameter mapping settings. -To add extra parameters to all Zyte API requests sent by the provider, set them -as a dictionary through the ``ZYTE_API_PROVIDER_PARAMS`` setting, for example -in ``settings.py``:: - - ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"} - -When the ``ZYTE_API_PROVIDER_PARAMS`` setting includes one of the Zyte API -extraction options (e.g. ``productOptions`` for ``product``), but the -final Zyte API request doesn't include the corresponding data type, the -unused options are automatically removed. So, it's safe to use -``ZYTE_API_PROVIDER_PARAMS`` to set the default options for various extraction -types, e.g.:: - - ZYTE_API_PROVIDER_PARAMS = { - "productOptions": {"extractFrom": "httpResponseBody"}, - "productNavigationOptions": {"extractFrom": "httpResponseBody"}, - } - -Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a -priority of 2000, so when you have page objects producing -``zyte_common_items.Product`` items you should use higher values for -``ZyteApiProvider`` if you want these items to come from these page objects, -and lower values if you want them to come from Zyte API. - -Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``, -it may make more requests than is optimal: the normal Scrapy response will be -always requested even when using a ``DummyResponse`` annotation, and in some -dependency combinations two Zyte API requests will be made for the same page. -We are planning to solve these problems in the future releases of -``scrapy-poet`` and ``scrapy-zyte-api``. - -.. _scrapy-poet provider: https://scrapy-poet.readthedocs.io/en/stable/providers.html - - -Running behind a proxy -====================== - -If you require a proxy to access Zyte API (e.g. a corporate proxy), configure -the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and -set the ``ZYTE_API_USE_ENV_PROXY`` setting to ``True``. -======= * Documentation: https://scrapy-zyte-api.readthedocs.io/en/latest/ * License: BSD 3-clause ->>>>>>> scrapy-plugins/main From 76e7fd92d2f930c674e475f31319ee5df4bada8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 11:59:21 +0100 Subject: [PATCH 11/16] Improve cookie handling and messaging --- scrapy_zyte_api/_params.py | 304 +++++++++++++++------------ tests/test_api_requests.py | 418 ++++++++++++++++++++++++++++++------- 2 files changed, 514 insertions(+), 208 deletions(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 92c749c0..6e6d19c6 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -453,20 +453,105 @@ def _set_http_response_headers_from_request( api_params.pop("httpResponseHeaders") +def _handle_experimental_unnamespacing(api_params, request, experimental, field): + experimental_params = api_params.setdefault("experimental", {}) + if not experimental and field in experimental_params: + if field in api_params: + logger.warning( + f"Request {request!r} defines both {field} " + f"({api_params[field]}) and " + f"experimental.{field} " + f"({experimental_params[field]}). " + f"experimental.{field} will be ignored." + ) + del experimental_params[field] + else: + logger.warning( + f"Request {request!r} defines experimental.{field}. " + f"experimental.{field} will be removed, and its value " + f"will be set as {field}." + ) + api_params[field] = experimental_params.pop(field) + elif experimental and field in api_params: + if field in experimental_params: + logger.warning( + f"Request {request!r} defines both {field} " + f"({api_params[field]}) and " + f"experimental.{field} " + f"({experimental_params[field]}). Since the " + f"ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting is enabled, " + f"{field} will be removed, and its value will be set " + f"as experimental.{field}, overriding its current " + f"value." + ) + else: + logger.warning( + f"Request {request!r} defines {field}. Since the " + f"ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED setting is enabled, " + f"{field} will be removed, and its value will be set " + f"as experimental.{field}." + ) + experimental_params[field] = api_params.pop(field) + if not experimental_params: + del api_params["experimental"] + + def _set_http_response_cookies_from_request( *, api_params: Dict[str, Any], experimental: bool, + request: Request, ): + _handle_experimental_unnamespacing( + api_params, request, experimental, "responseCookies" + ) + + if "responseCookies" in api_params and api_params["responseCookies"] is False: + del api_params["responseCookies"] + return + experimental_params = api_params.get("experimental", {}) + if ( + "responseCookies" in experimental_params + and experimental_params["responseCookies"] is False + ): + del experimental_params["responseCookies"] + if not experimental_params: + del api_params["experimental"] + return if not experimental: api_params.setdefault("responseCookies", True) - if api_params["responseCookies"] is False: - del api_params["responseCookies"] else: api_params.setdefault("experimental", {}) api_params["experimental"].setdefault("responseCookies", True) - if api_params["experimental"]["responseCookies"] is False: - del api_params["experimental"]["responseCookies"] + + +def _get_output_cookies(request, cookie_jars, max_cookies, field): + output_cookies = [] + input_cookies = _get_all_cookies(request, cookie_jars) + input_cookie_count = len(input_cookies) + if input_cookie_count > max_cookies: + logger.warning( + f"Request {request!r} would get {input_cookie_count!r} cookies, " + f"but request cookie automatic mapping is limited to " + f"{max_cookies!r} cookies (see the ZYTE_API_MAX_COOKIES setting), " + f"so only {max_cookies!r} cookies have been added to this " + f"request. To silence this warning, set the request cookies " + f"manually through the {field} Zyte API parameter instead. " + f"Alternatively, if Zyte API starts supporting more than " + f"{max_cookies!r} request cookies, update the ZYTE_API_MAX_COOKIES " + f"setting accordingly." + ) + input_cookies = input_cookies[:max_cookies] + for input_cookie in input_cookies: + output_cookie = { + "name": input_cookie.name, + "value": input_cookie.value, + "domain": input_cookie.domain, + } + if input_cookie.path_specified: + output_cookie["path"] = input_cookie.path + output_cookies.append(output_cookie) + return output_cookies def _set_http_request_cookies_from_request( @@ -477,115 +562,69 @@ def _set_http_request_cookies_from_request( max_cookies: int, experimental: bool, ): - if not experimental: - if "requestCookies" in api_params: - request_cookies = api_params["requestCookies"] - if request_cookies is False: - del api_params["requestCookies"] - elif not request_cookies and isinstance(request_cookies, list): + _handle_experimental_unnamespacing( + api_params, request, experimental, "requestCookies" + ) + + if "requestCookies" in api_params: + request_cookies = api_params["requestCookies"] + if not request_cookies: + del api_params["requestCookies"] + # Note: We do not warn about setting requestCookies to False + # when there is no need (i.e. no input_cookies below) because + # input cookies can change at run time due to cookiejars, so + # False may make sense for some iterations of the code. + if isinstance(request_cookies, list): logger.warning( - ( - "Request %(request)r is overriding automatic request " - "cookie mapping by explicitly setting " - "requestCookies to []. If this was your intention, " - "please use False instead of []. Otherwise, stop " - "defining requestCookies in your request to let " - "automatic mapping work." - ), - { - "request": request, - }, + f"Request {request!r} is overriding automatic request " + f"cookie mapping by explicitly setting " + f"requestCookies to []. If this was your intention, " + f"please use False instead of []. Otherwise, stop " + f"defining requestCookies in your request to let " + f"automatic mapping work." ) - return - output_cookies = [] - input_cookies = _get_all_cookies(request, cookie_jars) - input_cookie_count = len(input_cookies) - if input_cookie_count > max_cookies: - logger.warning( - ( - "Request %(request)r would get %(count)r cookies, but " - "request cookie automatic mapping is limited to %(max)r " - "cookies (see the ZYTE_API_MAX_COOKIES setting), so only " - "%(max)r cookies have been added to this request. To " - "silence this warning, set the request cookies manually " - "through the requestCookies Zyte API parameter instead. " - "Alternatively, if Zyte API starts supporting more than " - "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " - "setting accordingly." - ), - { - "request": request, - "count": input_cookie_count, - "max": max_cookies, - }, - ) - input_cookies = input_cookies[:max_cookies] - for input_cookie in input_cookies: - output_cookie = { - "name": input_cookie.name, - "value": input_cookie.value, - "domain": input_cookie.domain, - } - if input_cookie.path_specified: - output_cookie["path"] = input_cookie.path - output_cookies.append(output_cookie) + return + + experimental_params = api_params.get("experimental", {}) + if "requestCookies" in experimental_params: + request_cookies = experimental_params["requestCookies"] + if not request_cookies: + del experimental_params["requestCookies"] + if not experimental_params: + del api_params["experimental"] + # Note: We do not warn about setting requestCookies to False + # when there is no need (i.e. no input_cookies below) because + # input cookies can change at run time due to cookiejars, so + # False may make sense for some iterations of the code. + if isinstance(request_cookies, list): + logger.warning( + f"Request {request!r} is overriding automatic request " + f"cookie mapping by explicitly setting " + f"experimental.requestCookies to []. If this was your " + f"intention, please use False instead of []. " + f"Otherwise, stop defining " + f"experimental.requestCookies in your request to let " + f"automatic mapping work." + ) + elif not experimental: + del experimental_params["requestCookies"] + if not experimental_params: + del api_params["experimental"] + api_params.setdefault("requestCookies", request_cookies) + return + + if not experimental: + output_cookies = _get_output_cookies( + request, cookie_jars, max_cookies, "requestCookies" + ) if output_cookies: api_params["requestCookies"] = output_cookies else: - api_params.setdefault("experimental", {}) - if "requestCookies" in api_params["experimental"]: - request_cookies = api_params["experimental"]["requestCookies"] - if request_cookies is False: - del api_params["experimental"]["requestCookies"] - elif not request_cookies and isinstance(request_cookies, list): - logger.warning( - ( - "Request %(request)r is overriding automatic request " - "cookie mapping by explicitly setting " - "experimental.requestCookies to []. If this was your " - "intention, please use False instead of []. Otherwise, " - "stop defining experimental.requestCookies in your " - "request to let automatic mapping work." - ), - { - "request": request, - }, - ) - return - output_cookies = [] - input_cookies = _get_all_cookies(request, cookie_jars) - input_cookie_count = len(input_cookies) - if input_cookie_count > max_cookies: - logger.warning( - ( - "Request %(request)r would get %(count)r cookies, but request " - "cookie automatic mapping is limited to %(max)r cookies " - "(see the ZYTE_API_MAX_COOKIES setting), so only %(max)r " - "cookies have been added to this request. To silence this " - "warning, set the request cookies manually through the " - "experimental.requestCookies Zyte API parameter instead. " - "Alternatively, if Zyte API starts supporting more than " - "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " - "setting accordingly." - ), - { - "request": request, - "count": input_cookie_count, - "max": max_cookies, - }, - ) - input_cookies = input_cookies[:max_cookies] - for input_cookie in input_cookies: - output_cookie = { - "name": input_cookie.name, - "value": input_cookie.value, - "domain": input_cookie.domain, - } - if input_cookie.path_specified: - output_cookie["path"] = input_cookie.path - output_cookies.append(output_cookie) + output_cookies = _get_output_cookies( + request, cookie_jars, max_cookies, "experimental.requestCookies" + ) if output_cookies: - api_params["experimental"]["requestCookies"] = output_cookies + api_params.setdefault("experimental", {})["requestCookies"] = output_cookies def _set_http_request_method_from_request( @@ -642,13 +681,14 @@ def _unset_unneeded_api_params( default_params: Dict[str, Any], request: Request, ): + experimental_had_content = bool(api_params.get("experimental", None)) for param, default_value in _DEFAULT_API_PARAMS.items(): value = api_params.get(param, _Undefined) - if value is _Undefined: + if value is _Undefined or value != default_value: continue - if value != default_value: - continue - if param not in default_params or default_params.get(param) == default_value: + if ( + param not in default_params or default_params.get(param) == default_value + ) and (param != "experimental" or not experimental_had_content): logger.warning( f"Request {request} unnecessarily defines the Zyte API {param!r} " f"parameter with its default value, {default_value!r}. It will " @@ -669,7 +709,23 @@ def _update_api_params_from_request( cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, experimental_cookies: bool, + unreported_deprecated_experimental_fields: Set[str], ): + if ( + api_params + and unreported_deprecated_experimental_fields + and "experimental" in api_params + ): + for field in list(unreported_deprecated_experimental_fields): + if field in api_params["experimental"]: + unreported_deprecated_experimental_fields.remove(field) + logger.warning( + f"Zyte API parameters for request {request} include " + f"experimental.{field}, which is deprecated. Please, " + f"replace it with {field}, both in request parameters " + f"and in any response parsing logic that might rely " + f"on the old parameter." + ) _set_http_response_body_from_request(api_params=api_params, request=request) _set_http_response_headers_from_request( api_params=api_params, @@ -687,7 +743,9 @@ def _update_api_params_from_request( if cookies_enabled: assert cookie_jars is not None # typing _set_http_response_cookies_from_request( - api_params=api_params, experimental=experimental_cookies + api_params=api_params, + experimental=experimental_cookies, + request=request, ) _set_http_request_cookies_from_request( api_params=api_params, @@ -696,8 +754,6 @@ def _update_api_params_from_request( max_cookies=max_cookies, experimental=experimental_cookies, ) - if experimental_cookies and not api_params["experimental"]: - del api_params["experimental"] _unset_unneeded_api_params( api_params=api_params, request=request, default_params=default_params ) @@ -803,6 +859,7 @@ def _get_automap_params( cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, experimental_cookies: bool, + unreported_deprecated_experimental_fields: Set[str], ): meta_params = request.meta.get("zyte_api_automap", default_enabled) if meta_params is False: @@ -833,6 +890,7 @@ def _get_automap_params( cookie_jars=cookie_jars, max_cookies=max_cookies, experimental_cookies=experimental_cookies, + unreported_deprecated_experimental_fields=unreported_deprecated_experimental_fields, ) return params @@ -851,6 +909,7 @@ def _get_api_params( cookie_jars: Optional[Dict[Any, CookieJar]], max_cookies: int, experimental_cookies: bool, + unreported_deprecated_experimental_fields: Set[str], ) -> Optional[dict]: """Returns a dictionary of API parameters that must be sent to Zyte API for the specified request, or None if the request should not be sent through @@ -867,6 +926,7 @@ def _get_api_params( cookie_jars=cookie_jars, max_cookies=max_cookies, experimental_cookies=experimental_cookies, + unreported_deprecated_experimental_fields=unreported_deprecated_experimental_fields, ) if api_params is None: return None @@ -984,20 +1044,6 @@ def parse(self, request): cookie_jars=self._cookie_jars, max_cookies=self._max_cookies, experimental_cookies=self._experimental_cookies, + unreported_deprecated_experimental_fields=self._unreported_deprecated_experimental_fields, ) - if ( - params - and self._unreported_deprecated_experimental_fields - and "experimental" in params - ): - for field in list(self._unreported_deprecated_experimental_fields): - if field in params["experimental"]: - self._unreported_deprecated_experimental_fields.remove(field) - logger.warning( - f"Zyte API parameters for request {request} include " - f"experimental.{field}, which is deprecated. Please, " - f"replace it with {field}, both in request parameters " - f"and in any response parsing logic that might rely " - f"on the old parameter." - ) return params diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 2934de16..cac14a04 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -682,8 +682,29 @@ def _test_automap( api_params.pop("url") assert expected == api_params if warnings: + seen_warnings = {record.getMessage(): False for record in caplog.records} for warning in warnings: - assert warning in caplog.text + matched = False + for seen_warning in list(seen_warnings): + if warning in seen_warning: + if seen_warnings[seen_warning] is True: + raise AssertionError( + f"Expected warning {warning!r} matches more than " + f"1 seen warning (all seen warnings: " + f"{list(seen_warnings)!r})" + ) + seen_warnings[seen_warning] = True + matched = True + break + if not matched: + raise AssertionError( + f"Expected warning {warning!r} not found in {list(seen_warnings)!r}" + ) + unexpected_warnings = [ + warning for warning, is_expected in seen_warnings.items() if not is_expected + ] + if unexpected_warnings: + raise AssertionError(f"Got unexpected warnings: {unexpected_warnings}") else: assert not caplog.records @@ -1841,13 +1862,11 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, } }, - { - "responseCookies": True, - "experimental": { - "responseCookies": False, - }, - }, - ["experimental.responseCookies, which is deprecated"], + {}, + [ + "include experimental.responseCookies, which is deprecated", + "experimental.responseCookies will be removed, and its value will be set as responseCookies", + ], ), ( { @@ -1857,13 +1876,11 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "responseCookies": False, }, - { - "responseCookies": False, - "experimental": { - "responseCookies": True, - }, - }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + {}, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "responseCookies will be removed, and its value will be set as experimental.responseCookies", + ], ), ( { @@ -1876,7 +1893,9 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca } }, {}, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], ), # No cookies, requestCookies disabled. ( @@ -1900,11 +1919,11 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca }, { "responseCookies": True, - "experimental": { - "requestCookies": False, - }, }, - ["experimental.requestCookies, which is deprecated"], + [ + "experimental.requestCookies, which is deprecated", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", + ], ), ( { @@ -1915,10 +1934,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, }, { - "requestCookies": False, "experimental": {"responseCookies": True}, }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + ], ), ( { @@ -1955,16 +1976,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, } }, - { - "responseCookies": True, - "experimental": { - "requestCookies": False, - "responseCookies": False, - }, - }, + {}, [ - "experimental.responseCookies, which is deprecated", - "experimental.requestCookies, which is deprecated", + "include experimental.requestCookies, which is deprecated", + "include experimental.responseCookies, which is deprecated", + "experimental.responseCookies will be removed, and its value will be set as responseCookies", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", ], ), ( @@ -1976,12 +1993,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, "responseCookies": False, }, - { - "requestCookies": False, - "responseCookies": False, - "experimental": {"responseCookies": True}, - }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + {}, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + "responseCookies will be removed, and its value will be set as experimental.responseCookies", + ], ), ( { @@ -1995,7 +2012,9 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca } }, {}, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], ), # Cookies, responseCookies disabled. ( @@ -2019,13 +2038,10 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca }, { "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - "responseCookies": True, - "experimental": { - "responseCookies": False, - }, }, [ - "experimental.responseCookies, which is deprecated", + "include experimental.responseCookies, which is deprecated", + "experimental.responseCookies will be removed, and its value will be set as responseCookies", ], ), ( @@ -2037,13 +2053,14 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, }, { - "responseCookies": False, "experimental": { "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - "responseCookies": True, }, }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "responseCookies will be removed, and its value will be set as experimental.responseCookies", + ], ), ( { @@ -2083,14 +2100,11 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca } }, { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, "responseCookies": True, - "experimental": { - "requestCookies": False, - }, }, [ "experimental.requestCookies, which is deprecated", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", ], ), ( @@ -2102,13 +2116,14 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, }, { - "requestCookies": False, "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, "responseCookies": True, }, }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + ], ), ( { @@ -2147,17 +2162,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "responseCookies": False, } }, - { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - "responseCookies": True, - "experimental": { - "requestCookies": False, - "responseCookies": False, - }, - }, + {}, [ - "experimental.responseCookies, which is deprecated", - "experimental.requestCookies, which is deprecated", + "include experimental.requestCookies, which is deprecated", + "include experimental.responseCookies, which is deprecated", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", + "experimental.responseCookies will be removed, and its value will be set as responseCookies", ], ), ( @@ -2169,15 +2179,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": False, "responseCookies": False, }, - { - "requestCookies": False, - "responseCookies": False, - "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, - "responseCookies": True, - }, - }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + {}, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + "responseCookies will be removed, and its value will be set as experimental.responseCookies", + ], ), ( { @@ -2215,6 +2222,92 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca [], ["is overriding automatic request cookie mapping"], ), + ) + for settings, input_params, output_params, warnings in ( + ( + {}, + { + "requestCookies": override_cookies, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, + override_warnings, + ), + ( + {}, + { + "experimental": { + "requestCookies": override_cookies, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, + [ + "experimental.requestCookies, which is deprecated", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", + *override_warnings, + ], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "experimental": { + "requestCookies": override_cookies, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + }, + }, + [ + *cast(List, override_warnings), + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + ], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + { + "requestCookies": override_cookies, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + }, + }, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + *override_warnings, + ], + ), + ) + ), + *( + ( + settings, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + input_params, + output_params, + warnings, + [], + ) + for override_cookies, override_warnings in ( ( REQUEST_OUTPUT_COOKIES_MAXIMAL, [], @@ -2244,14 +2337,12 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "httpResponseBody": True, "httpResponseHeaders": True, - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "requestCookies": override_cookies, "responseCookies": True, - "experimental": { - "requestCookies": override_cookies, - }, }, [ "experimental.requestCookies, which is deprecated", + "experimental.requestCookies will be removed, and its value will be set as requestCookies", ], ), ( @@ -2286,13 +2377,15 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca { "httpResponseBody": True, "httpResponseHeaders": True, - "requestCookies": override_cookies, "experimental": { - "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + "requestCookies": override_cookies, "responseCookies": True, }, }, - ["deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED"], + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + ], ), ) ), @@ -2444,6 +2537,173 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ), ) ), + # If (contradictory) values are set for requestCookies or + # responseCookies both outside and inside the experimental namespace, + # the non-experimental value takes priority. This is so even if + # ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, in which case the + # outside value is moved into the experimental namespace, overriding + # its value. + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "responseCookies": True, + "experimental": { + "responseCookies": False, + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "responseCookies": True, + }, + [ + "include experimental.responseCookies, which is deprecated", + "defines both responseCookies (True) and experimental.responseCookies (False)", + ], + [], + ), + ( + {}, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "responseCookies": False, + "experimental": { + "responseCookies": True, + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [ + "defines both responseCookies (False) and experimental.responseCookies (True)", + "include experimental.responseCookies, which is deprecated", + ], + [], + ), + *( + ( + {}, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "requestCookies": [ + {"name": regular_k, "value": regular_v}, + ], + "experimental": { + "requestCookies": [ + {"name": experimental_k, "value": experimental_v}, + ], + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestCookies": [ + {"name": regular_k, "value": regular_v}, + ], + "responseCookies": True, + }, + [ + "include experimental.requestCookies, which is deprecated", + "experimental.requestCookies will be ignored", + ], + [], + ) + for regular_k, regular_v, experimental_k, experimental_v in ( + ("b", "2", "c", "3"), + ("c", "3", "b", "2"), + ) + ), + # Now with ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED=True + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "responseCookies": True, + "experimental": { + "responseCookies": False, + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + }, + }, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "defines both responseCookies (True) and experimental.responseCookies (False)", + ], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "responseCookies": False, + "experimental": { + "responseCookies": True, + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "defines both responseCookies (False) and experimental.responseCookies (True)", + ], + [], + ), + *( + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "requestCookies": [ + {"name": regular_k, "value": regular_v}, + ], + "experimental": { + "requestCookies": [ + {"name": experimental_k, "value": experimental_v}, + ], + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "requestCookies": [ + {"name": regular_k, "value": regular_v}, + ], + "responseCookies": True, + }, + }, + [ + "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", + "requestCookies will be removed, and its value will be set as experimental.requestCookies", + ], + [], + ) + for regular_k, regular_v, experimental_k, experimental_v in ( + ("b", "2", "c", "3"), + ("c", "3", "b", "2"), + ) + ), ], ) def test_automap_cookies( From 30cf8595bec1d4a0ff40f1fa2fd2ea4f2269f6e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 12:03:59 +0100 Subject: [PATCH 12/16] Keep mypy happy --- tests/test_api_requests.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index cac14a04..3328b1c2 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -2219,7 +2219,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ) for override_cookies, override_warnings in ( ( - [], + cast(List[Dict[str, str]], []), ["is overriding automatic request cookie mapping"], ), ) @@ -2307,12 +2307,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca warnings, [], ) - for override_cookies, override_warnings in ( - ( - REQUEST_OUTPUT_COOKIES_MAXIMAL, - [], - ), - ) + for override_cookies in ((REQUEST_OUTPUT_COOKIES_MAXIMAL,),) for settings, input_params, output_params, warnings in ( ( {}, @@ -2325,7 +2320,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca "requestCookies": override_cookies, "responseCookies": True, }, - override_warnings, + [], ), ( {}, @@ -2363,7 +2358,6 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca }, }, [ - *cast(List, override_warnings), "deprecated ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED", ], ), From c5f700295d7fbb5d1f5440f34797b1f2a0b744b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 13:32:59 +0100 Subject: [PATCH 13/16] Improve tests --- scrapy_zyte_api/_params.py | 73 ++++++---- scrapy_zyte_api/_request_fingerprinter.py | 14 +- tests/test_api_requests.py | 166 ++++++++++++---------- tests/test_request_fingerprinter.py | 40 +++++- 4 files changed, 184 insertions(+), 109 deletions(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 6e6d19c6..47983f5e 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -502,10 +502,6 @@ def _set_http_response_cookies_from_request( experimental: bool, request: Request, ): - _handle_experimental_unnamespacing( - api_params, request, experimental, "responseCookies" - ) - if "responseCookies" in api_params and api_params["responseCookies"] is False: del api_params["responseCookies"] return @@ -562,10 +558,6 @@ def _set_http_request_cookies_from_request( max_cookies: int, experimental: bool, ): - _handle_experimental_unnamespacing( - api_params, request, experimental, "requestCookies" - ) - if "requestCookies" in api_params: request_cookies = api_params["requestCookies"] if not request_cookies: @@ -711,21 +703,22 @@ def _update_api_params_from_request( experimental_cookies: bool, unreported_deprecated_experimental_fields: Set[str], ): - if ( - api_params - and unreported_deprecated_experimental_fields - and "experimental" in api_params - ): - for field in list(unreported_deprecated_experimental_fields): - if field in api_params["experimental"]: - unreported_deprecated_experimental_fields.remove(field) - logger.warning( - f"Zyte API parameters for request {request} include " - f"experimental.{field}, which is deprecated. Please, " - f"replace it with {field}, both in request parameters " - f"and in any response parsing logic that might rely " - f"on the old parameter." - ) + for field in ("responseCookies", "requestCookies", "cookieManagement"): + if ( + field in unreported_deprecated_experimental_fields + and field in api_params.get("experimental", {}) + ): + unreported_deprecated_experimental_fields.remove(field) + logger.warning( + f"Zyte API parameters for request {request} include " + f"experimental.{field}, which is deprecated. Please, " + f"replace it with {field}, both in request parameters " + f"and in any response parsing logic that might rely " + f"on the old parameter." + ) + _handle_experimental_unnamespacing( + api_params, request, experimental_cookies, field + ) _set_http_response_body_from_request(api_params=api_params, request=request) _set_http_response_headers_from_request( api_params=api_params, @@ -741,19 +734,21 @@ def _update_api_params_from_request( ) _set_http_request_body_from_request(api_params=api_params, request=request) if cookies_enabled: - assert cookie_jars is not None # typing _set_http_response_cookies_from_request( api_params=api_params, experimental=experimental_cookies, request=request, ) - _set_http_request_cookies_from_request( - api_params=api_params, - request=request, - cookie_jars=cookie_jars, - max_cookies=max_cookies, - experimental=experimental_cookies, - ) + # cookie_jars can be None when the param parser is used for request + # fingerprinting, in which case request cookies are not relevant. + if cookie_jars is not None: + _set_http_request_cookies_from_request( + api_params=api_params, + request=request, + cookie_jars=cookie_jars, + max_cookies=max_cookies, + experimental=experimental_cookies, + ) _unset_unneeded_api_params( api_params=api_params, request=request, default_params=default_params ) @@ -935,6 +930,22 @@ def _get_api_params( f"Request {request} combines manually-defined parameters and " f"automatically-mapped parameters." ) + else: + if ( + api_params + and unreported_deprecated_experimental_fields + and "experimental" in api_params + ): + for field in list(unreported_deprecated_experimental_fields): + if field in api_params["experimental"]: + unreported_deprecated_experimental_fields.remove(field) + logger.warning( + f"Zyte API parameters for request {request} include " + f"experimental.{field}, which is deprecated. Please, " + f"replace it with {field}, both in request parameters " + f"and in any response parsing logic that might rely " + f"on the old parameter." + ) if job_id is not None: api_params["jobId"] = job_id diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py index 3a023968..7bd04d46 100644 --- a/scrapy_zyte_api/_request_fingerprinter.py +++ b/scrapy_zyte_api/_request_fingerprinter.py @@ -18,6 +18,8 @@ from ._params import _REQUEST_PARAMS, _ParamParser, _uses_browser + _Undefined = object() + class ScrapyZyteAPIRequestFingerprinter: @classmethod def from_crawler(cls, crawler): @@ -36,7 +38,7 @@ def __init__(self, crawler): crawler=crawler, ) self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary() - self._param_parser = _ParamParser(crawler, cookies_enabled=False) + self._param_parser = _ParamParser(crawler) def _normalize_params(self, api_params): api_params["url"] = canonicalize_url( @@ -49,9 +51,19 @@ def _normalize_params(self, api_params): api_params.pop("httpRequestText").encode() ).decode() + if ( + "responseCookies" not in api_params + and "responseCookies" in api_params.get("experimental", {}) + ): + api_params["responseCookies"] = api_params["experimental"].pop( + "responseCookies" + ) + for key, value in _REQUEST_PARAMS.items(): if value["changes_fingerprint"] is False: api_params.pop(key, None) + elif value["default"] == api_params.get(key, _Undefined): + api_params.pop(key) def fingerprint(self, request): if request in self._cache: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 3328b1c2..6c9e27b7 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -206,7 +206,7 @@ async def test_exceptions( req = Request("http://example.com", method="POST", meta=meta) with pytest.raises(exception_type): await handler.download_request(req, None) - assert exception_text in caplog.text + _assert_warnings(caplog, [exception_text]) @ensureDeferred @@ -520,8 +520,13 @@ async def test_default_params_none(mockserver, caplog): async with mockserver.make_handler(settings) as handler: assert handler._param_parser._automap_params == {"e": "f"} assert handler._param_parser._default_params == {"b": "c"} - assert "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None" in caplog.text - assert "Parameter 'd' in the ZYTE_API_AUTOMAP_PARAMS setting is None" in caplog.text + _assert_warnings( + caplog, + [ + "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None", + "Parameter 'd' in the ZYTE_API_AUTOMAP_PARAMS setting is None", + ], + ) @pytest.mark.parametrize( @@ -586,11 +591,7 @@ def test_default_params_merging( api_params.pop(key) api_params.pop("url") assert expected == api_params - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records + _assert_warnings(caplog, warnings) @pytest.mark.parametrize( @@ -638,6 +639,36 @@ def test_default_params_immutability(setting_key, meta_key, setting, meta): assert default_params == setting +def _assert_warnings(caplog, warnings): + if warnings: + seen_warnings = {record.getMessage(): False for record in caplog.records} + for warning in warnings: + matched = False + for seen_warning in list(seen_warnings): + if warning in seen_warning: + if seen_warnings[seen_warning] is True: + raise AssertionError( + f"Expected warning {warning!r} matches more than " + f"1 seen warning (all seen warnings: " + f"{list(seen_warnings)!r})" + ) + seen_warnings[seen_warning] = True + matched = True + break + if not matched: + raise AssertionError( + f"Expected warning {warning!r} not found in {list(seen_warnings)!r}" + ) + unexpected_warnings = [ + warning for warning, is_expected in seen_warnings.items() if not is_expected + ] + if unexpected_warnings: + raise AssertionError(f"Got unexpected warnings: {unexpected_warnings}") + else: + assert not caplog.records + caplog.clear() + + def _test_automap( settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None ): @@ -681,32 +712,7 @@ def _test_automap( api_params = param_parser.parse(request) api_params.pop("url") assert expected == api_params - if warnings: - seen_warnings = {record.getMessage(): False for record in caplog.records} - for warning in warnings: - matched = False - for seen_warning in list(seen_warnings): - if warning in seen_warning: - if seen_warnings[seen_warning] is True: - raise AssertionError( - f"Expected warning {warning!r} matches more than " - f"1 seen warning (all seen warnings: " - f"{list(seen_warnings)!r})" - ) - seen_warnings[seen_warning] = True - matched = True - break - if not matched: - raise AssertionError( - f"Expected warning {warning!r} not found in {list(seen_warnings)!r}" - ) - unexpected_warnings = [ - warning for warning, is_expected in seen_warnings.items() if not is_expected - ] - if unexpected_warnings: - raise AssertionError(f"Got unexpected warnings: {unexpected_warnings}") - else: - assert not caplog.records + _assert_warnings(caplog, warnings) @pytest.mark.parametrize( @@ -983,7 +989,10 @@ def test_automap_header_output(meta, expected, warnings, caplog): None, {"httpRequestMethod": "GET"}, DEFAULT_AUTOMAP_PARAMS, - ["Use Request.method"], + [ + "Use Request.method", + "unnecessarily defines the Zyte API 'httpRequestMethod' parameter with its default value", + ], ), ( "POST", @@ -1004,6 +1013,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): [ "Use Request.method", "does not match the Zyte API httpRequestMethod", + "unnecessarily defines the Zyte API 'httpRequestMethod' parameter with its default value", ], ), ( @@ -1053,7 +1063,10 @@ def test_automap_header_output(meta, expected, warnings, caplog): ], ) def test_automap_method(method, meta, expected, warnings, caplog): - _test_automap({}, {"method": method}, meta, expected, warnings, caplog) + request_kwargs = {} + if method is not None: + request_kwargs["method"] = method + _test_automap({}, request_kwargs, meta, expected, warnings, caplog) @pytest.mark.parametrize( @@ -2901,8 +2914,7 @@ def test_automap_cookie_limit(meta, caplog): assert api_params["requestCookies"] == [ {"name": "z", "value": "y", "domain": "example.com"} ] - assert not caplog.records - caplog.clear() + _assert_warnings(caplog, []) # Verify that requests with 2 cookies results in only 1 cookie set and a # warning. @@ -2919,9 +2931,12 @@ def test_automap_cookie_limit(meta, caplog): [{"name": "z", "value": "y", "domain": "example.com"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] - assert "would get 2 cookies" in caplog.text - assert "limited to 1 cookies" in caplog.text - caplog.clear() + _assert_warnings( + caplog, + [ + "would get 2 cookies, but request cookie automatic mapping is limited to 1 cookies" + ], + ) # Verify that 1 cookie in the cookie jar and 1 cookie in the request count # as 2 cookies, resulting in only 1 cookie set and a warning. @@ -2944,9 +2959,12 @@ def test_automap_cookie_limit(meta, caplog): [{"name": "z", "value": "y", "domain": "example.com"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] - assert "would get 2 cookies" in caplog.text - assert "limited to 1 cookies" in caplog.text - caplog.clear() + _assert_warnings( + caplog, + [ + "would get 2 cookies, but request cookie automatic mapping is limited to 1 cookies" + ], + ) # Vefify that unrelated-domain cookies count for the limit. pre_request = Request( @@ -2968,9 +2986,12 @@ def test_automap_cookie_limit(meta, caplog): [{"name": "z", "value": "y", "domain": "other.example"}], [{"name": "x", "value": "w", "domain": "example.com"}], ] - assert "would get 2 cookies" in caplog.text - assert "limited to 1 cookies" in caplog.text - caplog.clear() + _assert_warnings( + caplog, + [ + "would get 2 cookies, but request cookie automatic mapping is limited to 1 cookies" + ], + ) class CustomCookieJar(CookieJar): @@ -3208,11 +3229,7 @@ def test_default_params_automap(default_params, meta, expected, warnings, caplog api_params = param_parser.parse(request) api_params.pop("url") assert expected == api_params - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records + _assert_warnings(caplog, warnings) @pytest.mark.parametrize( @@ -3252,21 +3269,15 @@ def unflatten(dictionary): @pytest.mark.parametrize( - "old_field,new_field", + "field", [ - ( - f"experimental.{field}", - field, - ) - for field in ( - "responseCookies", - "requestCookies", - "cookieManagement", - ) + "responseCookies", + "requestCookies", + "cookieManagement", ], ) -def test_field_deprecation_warnings(old_field, new_field, caplog): - input_params = unflatten({old_field: "foo"}) +def test_field_deprecation_warnings(field, caplog): + input_params = {"experimental": {field: "foo"}} # Raw raw_request = Request( @@ -3280,13 +3291,11 @@ def test_field_deprecation_warnings(old_field, new_field, caplog): output_params = param_parser.parse(raw_request) output_params.pop("url") assert input_params == output_params - assert f"{old_field}, which is deprecated" in caplog.text - caplog.clear() + _assert_warnings(caplog, [f"experimental.{field}, which is deprecated"]) with caplog.at_level("WARNING"): # Only warn once per field. param_parser.parse(raw_request) - assert not caplog.text - caplog.clear() + _assert_warnings(caplog, []) # Automap raw_request = Request( @@ -3299,15 +3308,22 @@ def test_field_deprecation_warnings(old_field, new_field, caplog): with caplog.at_level("WARNING"): output_params = param_parser.parse(raw_request) output_params.pop("url") - for key, value in input_params.items(): + for key, value in input_params["experimental"].items(): assert output_params[key] == value - assert f"{old_field}, which is deprecated" in caplog.text - caplog.clear() + _assert_warnings( + caplog, + [ + f"experimental.{field}, which is deprecated", + f"experimental.{field} will be removed, and its value will be set as {field}", + ], + ) with caplog.at_level("WARNING"): # Only warn once per field. param_parser.parse(raw_request) - assert not caplog.text - caplog.clear() + _assert_warnings( + caplog, + [f"experimental.{field} will be removed, and its value will be set as {field}"], + ) def test_field_deprecation_warnings_false_positives(caplog): @@ -3329,7 +3345,7 @@ def test_field_deprecation_warnings_false_positives(caplog): output_params = param_parser.parse(raw_request) output_params.pop("url") assert input_params == output_params - assert not caplog.text + _assert_warnings(caplog, []) # Automap raw_request = Request( @@ -3344,4 +3360,4 @@ def test_field_deprecation_warnings_false_positives(caplog): output_params.pop("url") for key, value in input_params.items(): assert output_params[key] == value - assert not caplog.text + _assert_warnings(caplog, []) diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index ca4da582..6e058a93 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -87,10 +87,41 @@ def test_cookies(): ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler ) request1 = Request( + "https://example.com", + meta={ + "zyte_api": { + "responseCookies": False, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + "experimental": { + "responseCookies": False, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + }, + } + }, + ) + # Same with responseCookies set to `True`. + request2 = Request( "https://example.com", meta={ "zyte_api": { "responseCookies": True, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + "experimental": { + "responseCookies": False, + "requestCookies": [{"name": "foo", "value": "bar"}], + "cookieManagement": False, + }, + } + }, + ) + # Same with experimental.responseCookies set to `True`. + request3 = Request( + "https://example.com", + meta={ + "zyte_api": { "requestCookies": [{"name": "foo", "value": "bar"}], "cookieManagement": False, "experimental": { @@ -101,10 +132,15 @@ def test_cookies(): } }, ) - request2 = Request("https://example.com", meta={"zyte_api": True}) + request4 = Request("https://example.com", meta={"zyte_api": True}) fingerprint1 = fingerprinter.fingerprint(request1) fingerprint2 = fingerprinter.fingerprint(request2) - assert fingerprint1 == fingerprint2 + fingerprint3 = fingerprinter.fingerprint(request3) + fingerprint4 = fingerprinter.fingerprint(request4) + assert fingerprint1 != fingerprint2 + assert fingerprint1 != fingerprint3 + assert fingerprint1 == fingerprint4 + assert fingerprint2 == fingerprint3 @pytest.mark.parametrize( From e21655aa3ce86a9782b0541f4fae8207d7acf022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 13:37:33 +0100 Subject: [PATCH 14/16] Fix a merge issue --- CHANGES.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2d1df1cc..8e6a785d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -87,8 +87,6 @@ To be released * Zyte API Request IDs are now included in the error logs. -* Bump the zyte-api dependency: 0.4.7 → 0.4.8. - * Split README.rst into multiple documentation files and publish them on ReadTheDocs. From e7125bf21f1319e3576a8f56692c4a20b812c260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 13:40:09 +0100 Subject: [PATCH 15/16] Update the fingerprint docs now that responseCookies affects fingerprinting --- docs/reference/fingerprint-params.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/reference/fingerprint-params.rst b/docs/reference/fingerprint-params.rst index c5179d08..c94d93cc 100644 --- a/docs/reference/fingerprint-params.rst +++ b/docs/reference/fingerprint-params.rst @@ -24,10 +24,10 @@ fingerprints for Zyte API requests based on the following Zyte API parameters: :http:`request:httpRequestText` values generate the same signature. - Output parameters (:http:`request:browserHtml`, - :http:`request:httpResponseBody`, :http:`request:httpResponseHeaders`, - :http:`request:responseCookies`, :http:`request:screenshot`, and - :ref:`automatic extraction outputs ` like - :http:`request:product`) + :http:`request:httpResponseBody`, :http:`request:responseCookies`, + :http:`request:httpResponseHeaders`, :http:`request:responseCookies`, + :http:`request:screenshot`, and :ref:`automatic extraction outputs + ` like :http:`request:product`) - Rendering option parameters (:http:`request:actions`, :http:`request:device`, :http:`request:javascript`, @@ -44,8 +44,7 @@ fingerprinting: - Request header parameters (:http:`request:customHttpRequestHeaders`, :http:`request:requestHeaders`, :http:`request:requestCookies`) -- Request cookie parameters (:http:`request:cookieManagement`, - :http:`request:requestCookies`) +- :http:`request:cookieManagement` - Session handling parameters (:http:`request:sessionContext`, :http:`request:sessionContextParameters`) From 888ac92fe12471dd30b5518580d5050b9ad6fdd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 23 Jan 2024 13:50:26 +0100 Subject: [PATCH 16/16] Remove an unused function --- tests/test_api_requests.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 6c9e27b7..4d38dbdd 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -3254,20 +3254,6 @@ def test_default_params_false(default_params): assert api_params is None -# https://stackoverflow.com/a/6037657 -def unflatten(dictionary): - resultDict: Dict[Any, Any] = dict() - for key, value in dictionary.items(): - parts = key.split(".") - d = resultDict - for part in parts[:-1]: - if part not in d: - d[part] = dict() - d = d[part] - d[parts[-1]] = value - return resultDict - - @pytest.mark.parametrize( "field", [