SERP (MVP) and 0.9.0 release notes (#62)
Gallaecio authored Sep 17, 2024
1 parent 84a37a1 commit 426c0c7
Showing 23 changed files with 951 additions and 192 deletions.
25 changes: 25 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,31 @@
Changes
=======

0.9.0 (2024-09-NN)
------------------

* Now requires ``zyte-common-items >= 0.22.0``.

* New :ref:`Google Search spider template <google-search>`, built on top of
  Zyte API’s :http:`request:serp`.

* The heuristics of the :ref:`e-commerce spider template <e-commerce>` that
  ignore certain URLs when following category links now also handle
  subdomains. For example, where https://example.com/blog was already
  ignored, https://blog.example.com is now ignored as well (see the sketch
  below, after this list).

* In the :ref:`spider parameters JSON schema <params-schema>`, the
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy`
  parameter of the :ref:`e-commerce spider template <e-commerce>` moves from
  the last position to between
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file`
  and
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`.

* Removed the ``valid_page_types`` attribute of
  :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`.

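A minimal sketch of the subdomain-aware check described above (illustrative
only; ``IGNORED_TOKENS`` and ``should_ignore`` are hypothetical names, not
the template's actual implementation):

.. code-block:: python

    from urllib.parse import urlsplit

    # Hypothetical token list; the real heuristics define their own.
    IGNORED_TOKENS = {"blog"}

    def should_ignore(url: str) -> bool:
        parts = urlsplit(url)
        subdomain = parts.netloc.split(".")[0].lower()
        path = parts.path.strip("/")
        first_segment = path.split("/")[0].lower() if path else ""
        # Ignore the URL if the token is the first path segment
        # (https://example.com/blog) or the subdomain
        # (https://blog.example.com).
        return subdomain in IGNORED_TOKENS or first_segment in IGNORED_TOKENS

    assert should_ignore("https://example.com/blog")
    assert should_ignore("https://blog.example.com")
    assert not should_ignore("https://example.com/products")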

0.8.0 (2024-08-21)
------------------

41 changes: 41 additions & 0 deletions docs/_ext/__init__.py
@@ -1,4 +1,45 @@
import re

from docutils import nodes
from docutils.parsers.rst.roles import set_classes


def http_api_reference_role(
    name, rawtext, text, lineno, inliner, options={}, content=[]
):
    match = re.search(
        r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text
    )
    if match:
        display_text = match[1]
        reference = match[2]
    else:
        display_text = None
        reference = text
    if reference.startswith("request:"):
        request_or_response = "request"
    elif reference.startswith("response:"):
        request_or_response = "response/200"
    else:
        raise ValueError(
            f":http: directive reference must start with request: or "
            f"response:, got {reference} from {text!r}."
        )

    field = reference.split(":", maxsplit=1)[1]
    if not display_text:
        display_text = field
    refuri = (
        f"https://docs.zyte.com/zyte-api/usage/reference.html"
        f"#operation/extract/{request_or_response}/{field}"
    )
    set_classes(options)
    node = nodes.reference(rawtext, display_text, refuri=refuri, **options)
    return [node], []


def setup(app):
    app.add_role("http", http_api_reference_role)
    # https://stackoverflow.com/a/13663325
    #
    # Scrapy’s
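For illustration, the URL mapping the role performs can be summarized as
follows (a standalone sketch mirroring the logic above, not part of the
commit):

    def zyte_api_refuri(reference: str) -> str:
        # Mirrors http_api_reference_role: "request:" fields link to the
        # request section of the Zyte API reference, "response:" fields to
        # the 200-response section.
        kind, field = reference.split(":", maxsplit=1)
        path = "request" if kind == "request" else "response/200"
        return (
            "https://docs.zyte.com/zyte-api/usage/reference.html"
            f"#operation/extract/{path}/{field}"
        )

    assert zyte_api_refuri("request:serp") == (
        "https://docs.zyte.com/zyte-api/usage/reference.html"
        "#operation/extract/request/serp"
    )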
7 changes: 7 additions & 0 deletions docs/conf.py
@@ -34,6 +34,10 @@
"https://scrapy-poet.readthedocs.io/en/stable",
None,
),
"scrapy-spider-metadata": (
"https://scrapy-spider-metadata.readthedocs.io/en/latest",
None,
),
"scrapy-zyte-api": (
"https://scrapy-zyte-api.readthedocs.io/en/stable",
None,
@@ -48,8 +52,11 @@
    ),
}

autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_validator_summary = False

# sphinx-reredirects
redirects = {
1 change: 1 addition & 0 deletions docs/index.rst
@@ -18,6 +18,7 @@ zyte-spider-templates documentation

    templates/index
    E-commerce <templates/e-commerce>
    Google search <templates/google-search>

.. toctree::
    :caption: Customization
5 changes: 5 additions & 0 deletions docs/reference/index.rst
@@ -9,6 +9,8 @@ Spiders

.. autoclass:: zyte_spider_templates.EcommerceSpider

.. autoclass:: zyte_spider_templates.GoogleSearchSpider


Pages
=====
@@ -41,3 +43,6 @@ Parameter mixins
    :exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
    :exclude-members: model_computed_fields
19 changes: 19 additions & 0 deletions docs/templates/google-search.rst
@@ -0,0 +1,19 @@
.. _google-search:

=================================================
Google search spider template (``google_search``)
=================================================

Basic use
=========

.. code-block:: shell

    scrapy crawl google_search -a search_queries="foo bar"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams
    :inherited-members: BaseModel
    :exclude-members: model_computed_fields
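For reference, the programmatic equivalent with Scrapy's standard API (a
sketch; it assumes the project settings already configure Zyte API, and that
``max_pages`` is the field exposed by ``SerpMaxPagesParam`` above):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    # search_queries is the documented parameter; max_pages is assumed
    # from SerpMaxPagesParam.
    process.crawl("google_search", search_queries="foo bar", max_pages=2)
    process.start()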
3 changes: 3 additions & 0 deletions docs/templates/index.rst
@@ -29,3 +29,6 @@ Spider template list

:ref:`E-commerce <e-commerce>`
    Get products from an e-commerce website.

:ref:`Google Search <google-search>`
    Get Google search results.
2 changes: 1 addition & 1 deletion setup.py
@@ -18,7 +18,7 @@
"scrapy-poet>=0.21.0",
"scrapy-spider-metadata>=0.1.2",
"scrapy-zyte-api[provider]>=0.16.0",
"zyte-common-items>=0.13.0",
"zyte-common-items>=0.22.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
4 changes: 4 additions & 0 deletions tests/__init__.py
@@ -1,7 +1,11 @@
from typing import Any, Dict, Optional

import pytest
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
2 changes: 1 addition & 1 deletion tests/test_base.py
@@ -5,4 +5,4 @@

def test_deprecation():
    with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"):
        BaseSpiderParams(url="https://example.com")
        BaseSpiderParams(url="https://example.com")  # type: ignore[call-arg]
73 changes: 5 additions & 68 deletions tests/test_ecommerce.py
@@ -1,6 +1,4 @@
import json
import logging
import re
from unittest.mock import MagicMock, call, patch

import pytest
@@ -11,7 +9,6 @@
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request

from zyte_spider_templates import BaseSpiderParams
from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS,
    GEOLOCATION_OPTIONS_WITH_CODE,
@@ -24,6 +21,7 @@

from . import get_crawler
from .test_utils import URL_TO_DOMAIN
from .utils import assertEqualSpiderMetadata


def test_parameters():
@@ -362,21 +360,6 @@ def test_arguments():
    assert spider.allowed_domains == ["example.com"]


def assertEqualJson(actual, expected):
    """Compare the JSON representation of 2 Python objects.

    This allows taking into account things like the order of key-value pairs
    in dictionaries, which would not be taken into account when comparing
    dictionaries directly.

    It also generates a better diff in pytest output when enums are involved,
    e.g. geolocation values.
    """
    actual_json = json.dumps(actual, indent=2)
    expected_json = json.dumps(expected, indent=2)
    assert actual_json == expected_json

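The replacement helper, assertEqualSpiderMetadata, lives in tests/utils and
is not shown in this diff. A plausible minimal sketch, reusing the removed
helper's JSON-based comparison (the actual helper may do more):

    import json

    def assertEqualSpiderMetadata(actual, expected):
        # Compare JSON dumps so key order matters and pytest shows a
        # readable diff, as assertEqualJson did.
        assert json.dumps(actual, indent=2) == json.dumps(expected, indent=2)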

def test_metadata():
    actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
    expected_metadata = {
@@ -428,7 +411,7 @@ def test_metadata():
"description": (
"URL that point to a plain-text file with a list of "
"URLs to crawl, e.g. "
"https://example.com/url-list.txt. The linked list "
"https://example.com/url-list.txt. The linked file "
"must contain 1 URL per line."
),
"exclusiveRequired": True,
@@ -480,7 +463,7 @@ def test_metadata():
"title": "Pagination Only",
},
},
"title": "Crawl strategy",
"title": "Crawl Strategy",
"enum": [
"automatic",
"full",
@@ -550,60 +533,14 @@ def test_metadata():
"type": "object",
},
}
assertEqualJson(actual_metadata, expected_metadata)
assertEqualSpiderMetadata(actual_metadata, expected_metadata)

    geolocation = actual_metadata["param_schema"]["properties"]["geolocation"]
    assert geolocation["enum"][0] == "AF"
    assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"}
    assert set(geolocation["enum"]) == set(geolocation["enumMeta"])


@pytest.mark.parametrize(
    "valid,url",
    [
        (False, ""),
        (False, "http://"),
        (False, "http:/example.com"),
        (False, "ftp://example.com"),
        (False, "example.com"),
        (False, "//example.com"),
        (False, "http://foo:bar@example.com"),
        (False, " http://example.com"),
        (False, "http://example.com "),
        (False, "http://examp le.com"),
        (False, "https://example.com:232323"),
        (True, "http://example.com"),
        (True, "http://bücher.example"),
        (True, "http://xn--bcher-kva.example"),
        (True, "https://i❤.ws"),
        (True, "https://example.com"),
        (True, "https://example.com/"),
        (True, "https://example.com:2323"),
        (True, "https://example.com:2323/"),
        (True, "https://example.com:2323/foo"),
        (True, "https://example.com/f"),
        (True, "https://example.com/foo"),
        (True, "https://example.com/foo/"),
        (True, "https://example.com/foo/bar"),
        (True, "https://example.com/foo/bar/"),
        (True, "https://example.com/foo/bar?baz"),
        (True, "https://example.com/foo/bar/?baz"),
        (True, "https://example.com?foo"),
        (True, "https://example.com?foo=bar"),
        (True, "https://example.com/?foo=bar&baz"),
        (True, "https://example.com/?foo=bar&baz#"),
        (True, "https://example.com/?foo=bar&baz#frag"),
        (True, "https://example.com#"),
        (True, "https://example.com/#"),
        (True, "https://example.com/&"),
        (True, "https://example.com/&#"),
    ],
)
def test_validation_url(url, valid):
    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
    assert bool(re.match(url_re, url)) == valid


def test_get_parse_product_request():
    base_kwargs = {
        "url": "https://example.com",
@@ -818,7 +755,7 @@ def test_urls_file():
    crawler = get_crawler()
    url = "https://example.com"

    with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get:
    with patch("zyte_spider_templates.params.requests.get") as mock_get:
        response = requests.Response()
        response._content = (
            b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n"