SERP (MVP) and 0.9.0 release notes (#62)
Gallaecio authored Sep 17, 2024
1 parent 84a37a1 commit 426c0c7
Showing 23 changed files with 951 additions and 192 deletions.
25 changes: 25 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,31 @@
Changes
=======

0.9.0 (2024-09-NN)
------------------

* Now requires ``zyte-common-items >= 0.22.0``.

* New :ref:`Google Search spider template <google-search>`, built on top of
  Zyte API’s :http:`request:serp`.

* The heuristics of the :ref:`e-commerce spider template <e-commerce>` that
  ignore certain URLs when following category links now also handle
  subdomains. For example, where https://example.com/blog was already
  ignored, https://blog.example.com is now ignored as well (see the sketch
  below, after this list).

* In the :ref:`spider parameters JSON schema <params-schema>`, the
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy`
  parameter of the :ref:`e-commerce spider template <e-commerce>` moves from
  the last position to between
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file`
  and
  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`.

* Removed the ``valid_page_types`` attribute of
  :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`.

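A minimal sketch of the subdomain-aware check described above (illustrative
only; ``IGNORED_TOKENS`` and ``should_ignore`` are hypothetical names, not
the template's actual implementation):

.. code-block:: python

    from urllib.parse import urlsplit

    # Hypothetical token list; the real heuristics define their own.
    IGNORED_TOKENS = {"blog"}

    def should_ignore(url: str) -> bool:
        parts = urlsplit(url)
        subdomain = parts.netloc.split(".")[0].lower()
        path = parts.path.strip("/")
        first_segment = path.split("/")[0].lower() if path else ""
        # Ignore the URL if the token is the first path segment
        # (https://example.com/blog) or the subdomain
        # (https://blog.example.com).
        return subdomain in IGNORED_TOKENS or first_segment in IGNORED_TOKENS

    assert should_ignore("https://example.com/blog")
    assert should_ignore("https://blog.example.com")
    assert not should_ignore("https://example.com/products")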

0.8.0 (2024-08-21)
------------------

41 changes: 41 additions & 0 deletions docs/_ext/__init__.py
@@ -1,4 +1,45 @@
import re

from docutils import nodes
from docutils.parsers.rst.roles import set_classes


def http_api_reference_role(
    name, rawtext, text, lineno, inliner, options={}, content=[]
):
    match = re.search(
        r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text
    )
    if match:
        display_text = match[1]
        reference = match[2]
    else:
        display_text = None
        reference = text
    if reference.startswith("request:"):
        request_or_response = "request"
    elif reference.startswith("response:"):
        request_or_response = "response/200"
    else:
        raise ValueError(
            f":http: directive reference must start with request: or "
            f"response:, got {reference} from {text!r}."
        )

    field = reference.split(":", maxsplit=1)[1]
    if not display_text:
        display_text = field
    refuri = (
        f"https://docs.zyte.com/zyte-api/usage/reference.html"
        f"#operation/extract/{request_or_response}/{field}"
    )
    set_classes(options)
    node = nodes.reference(rawtext, display_text, refuri=refuri, **options)
    return [node], []


def setup(app):
    app.add_role("http", http_api_reference_role)
    # https://stackoverflow.com/a/13663325
    #
    # Scrapy’s
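For illustration, the URL mapping the role performs can be summarized as
follows (a standalone sketch mirroring the logic above, not part of the
commit):

    def zyte_api_refuri(reference: str) -> str:
        # Mirrors http_api_reference_role: "request:" fields link to the
        # request section of the Zyte API reference, "response:" fields to
        # the 200-response section.
        kind, field = reference.split(":", maxsplit=1)
        path = "request" if kind == "request" else "response/200"
        return (
            "https://docs.zyte.com/zyte-api/usage/reference.html"
            f"#operation/extract/{path}/{field}"
        )

    assert zyte_api_refuri("request:serp") == (
        "https://docs.zyte.com/zyte-api/usage/reference.html"
        "#operation/extract/request/serp"
    )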
7 changes: 7 additions & 0 deletions docs/conf.py
@@ -34,6 +34,10 @@
"https://scrapy-poet.readthedocs.io/en/stable",
None,
),
"scrapy-spider-metadata": (
"https://scrapy-spider-metadata.readthedocs.io/en/latest",
None,
),
"scrapy-zyte-api": (
"https://scrapy-zyte-api.readthedocs.io/en/stable",
None,
@@ -48,8 +52,11 @@
    ),
}

autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_validator_summary = False

# sphinx-reredirects
redirects = {
1 change: 1 addition & 0 deletions docs/index.rst
@@ -18,6 +18,7 @@ zyte-spider-templates documentation

    templates/index
    E-commerce <templates/e-commerce>
    Google search <templates/google-search>

.. toctree::
    :caption: Customization
5 changes: 5 additions & 0 deletions docs/reference/index.rst
@@ -9,6 +9,8 @@ Spiders

.. autoclass:: zyte_spider_templates.EcommerceSpider

.. autoclass:: zyte_spider_templates.GoogleSearchSpider


Pages
=====
@@ -41,3 +43,6 @@ Parameter mixins
    :exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
    :exclude-members: model_computed_fields
19 changes: 19 additions & 0 deletions docs/templates/google-search.rst
@@ -0,0 +1,19 @@
.. _google-search:

=================================================
Google search spider template (``google_search``)
=================================================

Basic use
=========

.. code-block:: shell

    scrapy crawl google_search -a search_queries="foo bar"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams
    :inherited-members: BaseModel
    :exclude-members: model_computed_fields
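For reference, the programmatic equivalent with Scrapy's standard API (a
sketch; it assumes the project settings already configure Zyte API, and that
``max_pages`` is the field exposed by ``SerpMaxPagesParam`` above):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    # search_queries is the documented parameter; max_pages is assumed
    # from SerpMaxPagesParam.
    process.crawl("google_search", search_queries="foo bar", max_pages=2)
    process.start()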
3 changes: 3 additions & 0 deletions docs/templates/index.rst
@@ -29,3 +29,6 @@ Spider template list

:ref:`E-commerce <e-commerce>`
    Get products from an e-commerce website.

:ref:`Google Search <google-search>`
    Get Google search results.
2 changes: 1 addition & 1 deletion setup.py
@@ -18,7 +18,7 @@
"scrapy-poet>=0.21.0",
"scrapy-spider-metadata>=0.1.2",
"scrapy-zyte-api[provider]>=0.16.0",
"zyte-common-items>=0.13.0",
"zyte-common-items>=0.22.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
4 changes: 4 additions & 0 deletions tests/__init__.py
@@ -1,7 +1,11 @@
from typing import Any, Dict, Optional

import pytest
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
2 changes: 1 addition & 1 deletion tests/test_base.py
@@ -5,4 +5,4 @@

def test_deprecation():
    with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"):
        BaseSpiderParams(url="https://example.com")
        BaseSpiderParams(url="https://example.com")  # type: ignore[call-arg]
73 changes: 5 additions & 68 deletions tests/test_ecommerce.py
@@ -1,6 +1,4 @@
import json
import logging
import re
from unittest.mock import MagicMock, call, patch

import pytest
@@ -11,7 +9,6 @@
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request

from zyte_spider_templates import BaseSpiderParams
from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS,
    GEOLOCATION_OPTIONS_WITH_CODE,
@@ -24,6 +21,7 @@

from . import get_crawler
from .test_utils import URL_TO_DOMAIN
from .utils import assertEqualSpiderMetadata


def test_parameters():
@@ -362,21 +360,6 @@ def test_arguments():
    assert spider.allowed_domains == ["example.com"]


def assertEqualJson(actual, expected):
    """Compare the JSON representation of 2 Python objects.

    This allows taking into account things like the order of key-value pairs
    in dictionaries, which would not be taken into account when comparing
    dictionaries directly.

    It also generates a better diff in pytest output when enums are involved,
    e.g. geolocation values.
    """
    actual_json = json.dumps(actual, indent=2)
    expected_json = json.dumps(expected, indent=2)
    assert actual_json == expected_json

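The replacement helper, assertEqualSpiderMetadata, lives in tests/utils and
is not shown in this diff. A plausible minimal sketch, reusing the removed
helper's JSON-based comparison (the actual helper may do more):

    import json

    def assertEqualSpiderMetadata(actual, expected):
        # Compare JSON dumps so key order matters and pytest shows a
        # readable diff, as assertEqualJson did.
        assert json.dumps(actual, indent=2) == json.dumps(expected, indent=2)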

def test_metadata():
    actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
    expected_metadata = {
@@ -428,7 +411,7 @@ def test_metadata():
"description": (
"URL that point to a plain-text file with a list of "
"URLs to crawl, e.g. "
"https://example.com/url-list.txt. The linked list "
"https://example.com/url-list.txt. The linked file "
"must contain 1 URL per line."
),
"exclusiveRequired": True,
@@ -480,7 +463,7 @@ def test_metadata():
"title": "Pagination Only",
},
},
"title": "Crawl strategy",
"title": "Crawl Strategy",
"enum": [
"automatic",
"full",
@@ -550,60 +533,14 @@ def test_metadata():
"type": "object",
},
}
assertEqualJson(actual_metadata, expected_metadata)
assertEqualSpiderMetadata(actual_metadata, expected_metadata)

    geolocation = actual_metadata["param_schema"]["properties"]["geolocation"]
    assert geolocation["enum"][0] == "AF"
    assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"}
    assert set(geolocation["enum"]) == set(geolocation["enumMeta"])


@pytest.mark.parametrize(
    "valid,url",
    [
        (False, ""),
        (False, "http://"),
        (False, "http:/example.com"),
        (False, "ftp://example.com"),
        (False, "example.com"),
        (False, "//example.com"),
        (False, "http://foo:bar@example.com"),
        (False, " http://example.com"),
        (False, "http://example.com "),
        (False, "http://examp le.com"),
        (False, "https://example.com:232323"),
        (True, "http://example.com"),
        (True, "http://bücher.example"),
        (True, "http://xn--bcher-kva.example"),
        (True, "https://i❤.ws"),
        (True, "https://example.com"),
        (True, "https://example.com/"),
        (True, "https://example.com:2323"),
        (True, "https://example.com:2323/"),
        (True, "https://example.com:2323/foo"),
        (True, "https://example.com/f"),
        (True, "https://example.com/foo"),
        (True, "https://example.com/foo/"),
        (True, "https://example.com/foo/bar"),
        (True, "https://example.com/foo/bar/"),
        (True, "https://example.com/foo/bar?baz"),
        (True, "https://example.com/foo/bar/?baz"),
        (True, "https://example.com?foo"),
        (True, "https://example.com?foo=bar"),
        (True, "https://example.com/?foo=bar&baz"),
        (True, "https://example.com/?foo=bar&baz#"),
        (True, "https://example.com/?foo=bar&baz#frag"),
        (True, "https://example.com#"),
        (True, "https://example.com/#"),
        (True, "https://example.com/&"),
        (True, "https://example.com/&#"),
    ],
)
def test_validation_url(url, valid):
    url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
    assert bool(re.match(url_re, url)) == valid


def test_get_parse_product_request():
    base_kwargs = {
        "url": "https://example.com",
@@ -818,7 +755,7 @@ def test_urls_file():
    crawler = get_crawler()
    url = "https://example.com"

    with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get:
    with patch("zyte_spider_templates.params.requests.get") as mock_get:
        response = requests.Response()
        response._content = (
            b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n"