diff --git a/CHANGES.rst b/CHANGES.rst index abce60b..b64a334 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,31 @@ Changes ======= +0.9.0 (2024-09-NN) +------------------ + +* Now requires ``zyte-common-items >= 0.22.0``. + +* New :ref:`Google Search spider template `, built on top of + Zyte API’s :http:`request:serp`. + +* The heuristics of the :ref:`e-commerce spider template ` to + ignore certain URLs when following category links now also handles + subdomains. For example, before https://example.com/blog was ignored, now + https://blog.example.com is also ignored. + +* In the :ref:`spider parameters JSON schema `, the + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.crawl_strategy` + parameter of the :ref:`e-commerce spider template ` switches + position, from being the last parameter to being between + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls_file` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.geolocation`. + +* Removed the ``valid_page_types`` attribute of + :class:`zyte_spider_templates.middlewares.CrawlingLogsMiddleware`. + + 0.8.0 (2024-08-21) ------------------ diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py index 5a3839e..4181427 100644 --- a/docs/_ext/__init__.py +++ b/docs/_ext/__init__.py @@ -1,4 +1,45 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." + ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + def setup(app): + app.add_role("http", http_api_reference_role) # https://stackoverflow.com/a/13663325 # # Scrapy’s diff --git a/docs/conf.py b/docs/conf.py index 5a610e3..ff0ef7f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,6 +34,10 @@ "https://scrapy-poet.readthedocs.io/en/stable", None, ), + "scrapy-spider-metadata": ( + "https://scrapy-spider-metadata.readthedocs.io/en/latest", + None, + ), "scrapy-zyte-api": ( "https://scrapy-zyte-api.readthedocs.io/en/stable", None, @@ -48,8 +52,11 @@ ), } +autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_field_summary = False autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_show_validator_summary = False # sphinx-reredirects redirects = { diff --git a/docs/index.rst b/docs/index.rst index d344faa..1083299 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce + Google search .. 
toctree:: :caption: Customization diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 81826cb..dd368dd 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -9,6 +9,8 @@ Spiders .. autoclass:: zyte_spider_templates.EcommerceSpider +.. autoclass:: zyte_spider_templates.GoogleSearchSpider + Pages ===== @@ -41,3 +43,6 @@ Parameter mixins :exclude-members: model_computed_fields .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam + :exclude-members: model_computed_fields diff --git a/docs/templates/google-search.rst b/docs/templates/google-search.rst new file mode 100644 index 0000000..a8ba77c --- /dev/null +++ b/docs/templates/google-search.rst @@ -0,0 +1,19 @@ +.. _google-search: + +================================================= +Google search spider template (``google_search``) +================================================= + +Basic use +========= + +.. code-block:: shell + + scrapy crawl google_search -a search_queries="foo bar" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields diff --git a/docs/templates/index.rst b/docs/templates/index.rst index c70a7de..ea86c6d 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -29,3 +29,6 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. + +:ref:`Google Search ` + Get Google search results. diff --git a/setup.py b/setup.py index 3871341..e5f8e9b 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "scrapy-poet>=0.21.0", "scrapy-spider-metadata>=0.1.2", "scrapy-zyte-api[provider]>=0.16.0", - "zyte-common-items>=0.13.0", + "zyte-common-items>=0.22.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/__init__.py b/tests/__init__.py index 5e99e9c..2aa5953 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,11 @@ from typing import Any, Dict, Optional +import pytest from scrapy.utils.test import TestSpider +# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting +pytest.register_assert_rewrite("tests.utils") + # scrapy.utils.test.get_crawler alternative that does not freeze settings. def get_crawler(*, settings: Optional[Dict[str, Any]] = None): diff --git a/tests/test_base.py b/tests/test_base.py index 34fb730..b5f5c1c 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -5,4 +5,4 @@ def test_deprecation(): with pytest.deprecated_call(match="^BaseSpiderParams is deprecated.*"): - BaseSpiderParams(url="https://example.com") + BaseSpiderParams(url="https://example.com") # type: ignore[call-arg] diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 4e794fb..ae77049 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -1,6 +1,4 @@ -import json import logging -import re from unittest.mock import MagicMock, call, patch import pytest @@ -11,7 +9,6 @@ from scrapy_spider_metadata import get_spider_metadata from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request -from zyte_spider_templates import BaseSpiderParams from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, GEOLOCATION_OPTIONS_WITH_CODE, @@ -24,6 +21,7 @@ from . 
import get_crawler from .test_utils import URL_TO_DOMAIN +from .utils import assertEqualSpiderMetadata def test_parameters(): @@ -362,21 +360,6 @@ def test_arguments(): assert spider.allowed_domains == ["example.com"] -def assertEqualJson(actual, expected): - """Compare the JSON representation of 2 Python objects. - - This allows to take into account things like the order of key-value pairs - in dictionaries, which would not be taken into account when comparing - dictionaries directly. - - It also generates a better diff in pytest output when enums are involved, - e.g. geolocation values. - """ - actual_json = json.dumps(actual, indent=2) - expected_json = json.dumps(expected, indent=2) - assert actual_json == expected_json - - def test_metadata(): actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True) expected_metadata = { @@ -428,7 +411,7 @@ def test_metadata(): "description": ( "URL that point to a plain-text file with a list of " "URLs to crawl, e.g. " - "https://example.com/url-list.txt. The linked list " + "https://example.com/url-list.txt. The linked file " "must contain 1 URL per line." ), "exclusiveRequired": True, @@ -480,7 +463,7 @@ def test_metadata(): "title": "Pagination Only", }, }, - "title": "Crawl strategy", + "title": "Crawl Strategy", "enum": [ "automatic", "full", @@ -550,7 +533,7 @@ def test_metadata(): "type": "object", }, } - assertEqualJson(actual_metadata, expected_metadata) + assertEqualSpiderMetadata(actual_metadata, expected_metadata) geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] assert geolocation["enum"][0] == "AF" @@ -558,52 +541,6 @@ def test_metadata(): assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) -@pytest.mark.parametrize( - "valid,url", - [ - (False, ""), - (False, "http://"), - (False, "http:/example.com"), - (False, "ftp://example.com"), - (False, "example.com"), - (False, "//example.com"), - (False, "http://foo:bar@example.com"), - (False, " http://example.com"), - (False, "http://example.com "), - (False, "http://examp le.com"), - (False, "https://example.com:232323"), - (True, "http://example.com"), - (True, "http://bücher.example"), - (True, "http://xn--bcher-kva.example"), - (True, "https://i❤.ws"), - (True, "https://example.com"), - (True, "https://example.com/"), - (True, "https://example.com:2323"), - (True, "https://example.com:2323/"), - (True, "https://example.com:2323/foo"), - (True, "https://example.com/f"), - (True, "https://example.com/foo"), - (True, "https://example.com/foo/"), - (True, "https://example.com/foo/bar"), - (True, "https://example.com/foo/bar/"), - (True, "https://example.com/foo/bar?baz"), - (True, "https://example.com/foo/bar/?baz"), - (True, "https://example.com?foo"), - (True, "https://example.com?foo=bar"), - (True, "https://example.com/?foo=bar&baz"), - (True, "https://example.com/?foo=bar&baz#"), - (True, "https://example.com/?foo=bar&baz#frag"), - (True, "https://example.com#"), - (True, "https://example.com/#"), - (True, "https://example.com/&"), - (True, "https://example.com/&#"), - ], -) -def test_validation_url(url, valid): - url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern - assert bool(re.match(url_re, url)) == valid - - def test_get_parse_product_request(): base_kwargs = { "url": "https://example.com", @@ -818,7 +755,7 @@ def test_urls_file(): crawler = get_crawler() url = "https://example.com" - with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get: + with 
patch("zyte_spider_templates.params.requests.get") as mock_get: response = requests.Response() response._content = ( b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 6fc03ea..9b808bb 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -36,13 +36,7 @@ def results_gen(): crawl_logs = middleware.crawl_logs(response, results_gen()) assert crawl_logs == ( "Crawling Logs for https://example.com (parsed as: None):\n" - "Number of Requests per page type:\n" - "- product: 0\n" - "- nextPage: 0\n" - "- subCategories: 0\n" - "- productNavigation: 0\n" - "- productNavigation-heuristics: 0\n" - "- unknown: 0\n" + "Nothing to crawl.\n" "Structured Logs:\n" "{\n" ' "time": "2023-10-10 20:09:29",\n' @@ -53,14 +47,7 @@ def results_gen(): ' "page_type": null,\n' ' "probability": null\n' " },\n" - ' "to_crawl": {\n' - ' "product": [],\n' - ' "nextPage": [],\n' - ' "subCategories": [],\n' - ' "productNavigation": [],\n' - ' "productNavigation-heuristics": [],\n' - ' "unknown": []\n' - " }\n" + ' "to_crawl": {}\n' "}" ) @@ -131,15 +118,19 @@ def test_crawling_logs_middleware(): }, }, ) - unknown_request = Request( - "https://example.com/other-unknown", + custom_request = Request( + "https://example.com/custom-page-type", meta={ "crawling_logs": { - "name": "Unknown Page", + "name": "Custom Page", "page_type": "some other page_type", + "foo": "bar", }, }, ) + unknown_request = Request( + "https://example.com/other-unknown", + ) request_fingerprint = get_fingerprinter(crawler) fingerprint = request_fingerprint(request) @@ -150,6 +141,7 @@ def test_crawling_logs_middleware(): product_navigation_heuristics_request_fp = request_fingerprint( product_navigation_heuristics_request ) + custom_request_fp = request_fingerprint(custom_request) unknown_request_fp = request_fingerprint(unknown_request) def results_gen(): @@ -158,6 +150,7 @@ def results_gen(): yield subcategory_request yield product_navigation_request yield product_navigation_heuristics_request + yield custom_request yield unknown_request crawl_logs = middleware.crawl_logs(response, results_gen()) @@ -169,6 +162,7 @@ def results_gen(): "- subCategories: 1\n" "- productNavigation: 1\n" "- productNavigation-heuristics: 1\n" + "- some other page_type: 1\n" "- unknown: 1\n" "Structured Logs:\n" "{\n" @@ -231,10 +225,18 @@ def results_gen(): f' "request_fingerprint": "{product_navigation_heuristics_request_fp}"\n' " }\n" " ],\n" - ' "unknown": [\n' + ' "some other page_type": [\n' " {\n" - ' "name": "Unknown Page",\n' + ' "name": "Custom Page",\n' ' "page_type": "some other page_type",\n' + ' "foo": "bar",\n' + ' "request_url": "https://example.com/custom-page-type",\n' + ' "request_priority": 0,\n' + f' "request_fingerprint": "{custom_request_fp}"\n' + " }\n" + " ],\n" + ' "unknown": [\n' + " {\n" ' "request_url": "https://example.com/other-unknown",\n' ' "request_priority": 0,\n' f' "request_fingerprint": "{unknown_request_fp}"\n' diff --git a/tests/test_params.py b/tests/test_params.py new file mode 100644 index 0000000..df08a19 --- /dev/null +++ b/tests/test_params.py @@ -0,0 +1,51 @@ +import re + +import pytest + +from zyte_spider_templates.params import URL_FIELD_KWARGS + + +@pytest.mark.parametrize( + "valid,url", + [ + (False, ""), + (False, "http://"), + (False, "http:/example.com"), + (False, "ftp://example.com"), + (False, "example.com"), + (False, "//example.com"), + (False, "http://foo:bar@example.com"), + (False, " http://example.com"), + 
(False, "http://example.com "), + (False, "http://examp le.com"), + (False, "https://example.com:232323"), + (True, "http://example.com"), + (True, "http://bücher.example"), + (True, "http://xn--bcher-kva.example"), + (True, "https://i❤.ws"), + (True, "https://example.com"), + (True, "https://example.com/"), + (True, "https://example.com:2323"), + (True, "https://example.com:2323/"), + (True, "https://example.com:2323/foo"), + (True, "https://example.com/f"), + (True, "https://example.com/foo"), + (True, "https://example.com/foo/"), + (True, "https://example.com/foo/bar"), + (True, "https://example.com/foo/bar/"), + (True, "https://example.com/foo/bar?baz"), + (True, "https://example.com/foo/bar/?baz"), + (True, "https://example.com?foo"), + (True, "https://example.com?foo=bar"), + (True, "https://example.com/?foo=bar&baz"), + (True, "https://example.com/?foo=bar&baz#"), + (True, "https://example.com/?foo=bar&baz#frag"), + (True, "https://example.com#"), + (True, "https://example.com/#"), + (True, "https://example.com/&"), + (True, "https://example.com/&#"), + ], +) +def test_url_pattern(url, valid): + assert isinstance(URL_FIELD_KWARGS["pattern"], str) + assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid diff --git a/tests/test_serp.py b/tests/test_serp.py new file mode 100644 index 0000000..e8ec9fe --- /dev/null +++ b/tests/test_serp.py @@ -0,0 +1,313 @@ +import pytest +from pydantic import ValidationError +from scrapy_spider_metadata import get_spider_metadata + +from zyte_spider_templates.spiders.serp import GoogleSearchSpider + +from . import get_crawler +from .utils import assertEqualSpiderMetadata + + +def test_parameters(): + with pytest.raises(ValidationError): + GoogleSearchSpider() + + with pytest.raises(ValidationError): + GoogleSearchSpider(domain="google.com") + + GoogleSearchSpider(search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar") + GoogleSearchSpider(domain="google.cat", search_queries="foo bar", max_pages=10) + + with pytest.raises(ValidationError): + GoogleSearchSpider(domain="google.foo", search_queries="foo bar") + + with pytest.raises(ValidationError): + GoogleSearchSpider(search_queries="foo bar", max_pages="all") + + +def test_start_requests(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo+bar" + assert requests[0].callback == spider.parse_serp + + +def test_metadata(): + actual_metadata = get_spider_metadata(GoogleSearchSpider, normalize=True) + expected_metadata = { + "template": True, + "title": "Google Search Results", + "description": "Template for spiders that extract Google search results.", + "param_schema": { + "properties": { + "domain": { + "default": "google.com", + "description": "Target Google domain.", + "title": "Domain", + "enum": [ + "google.com", + "google.ad", + "google.ae", + "google.al", + "google.am", + "google.as", + "google.at", + "google.az", + "google.ba", + "google.be", + "google.bf", + "google.bg", + "google.bi", + "google.bj", + "google.bs", + "google.bt", + "google.by", + "google.ca", + "google.cat", + "google.cd", + "google.cf", + "google.cg", + "google.ch", + "google.ci", + "google.cl", + "google.cm", + "google.cn", + "google.co.ao", + "google.co.bw", + "google.co.ck", + "google.co.cr", + "google.co.id", + "google.co.il", + "google.co.in", + "google.co.jp", + "google.co.ke", + 
"google.co.kr", + "google.co.ls", + "google.co.ma", + "google.co.mz", + "google.co.nz", + "google.co.th", + "google.co.tz", + "google.co.ug", + "google.co.uk", + "google.co.uz", + "google.co.ve", + "google.co.vi", + "google.co.za", + "google.co.zm", + "google.co.zw", + "google.com.af", + "google.com.ag", + "google.com.ar", + "google.com.au", + "google.com.bd", + "google.com.bh", + "google.com.bn", + "google.com.bo", + "google.com.br", + "google.com.bz", + "google.com.co", + "google.com.cu", + "google.com.cy", + "google.com.do", + "google.com.ec", + "google.com.eg", + "google.com.et", + "google.com.fj", + "google.com.gh", + "google.com.gi", + "google.com.gt", + "google.com.hk", + "google.com.jm", + "google.com.kh", + "google.com.kw", + "google.com.lb", + "google.com.ly", + "google.com.mm", + "google.com.mt", + "google.com.mx", + "google.com.my", + "google.com.na", + "google.com.ng", + "google.com.ni", + "google.com.np", + "google.com.om", + "google.com.pa", + "google.com.pe", + "google.com.pg", + "google.com.ph", + "google.com.pk", + "google.com.pr", + "google.com.py", + "google.com.qa", + "google.com.sa", + "google.com.sb", + "google.com.sg", + "google.com.sl", + "google.com.sv", + "google.com.tj", + "google.com.tr", + "google.com.tw", + "google.com.ua", + "google.com.uy", + "google.com.vc", + "google.com.vn", + "google.cv", + "google.cz", + "google.de", + "google.dj", + "google.dk", + "google.dm", + "google.dz", + "google.ee", + "google.es", + "google.fi", + "google.fm", + "google.fr", + "google.ga", + "google.ge", + "google.gg", + "google.gl", + "google.gm", + "google.gr", + "google.gy", + "google.hn", + "google.hr", + "google.ht", + "google.hu", + "google.ie", + "google.im", + "google.iq", + "google.is", + "google.it", + "google.je", + "google.jo", + "google.kg", + "google.ki", + "google.kz", + "google.la", + "google.li", + "google.lk", + "google.lt", + "google.lu", + "google.lv", + "google.md", + "google.me", + "google.mg", + "google.mk", + "google.ml", + "google.mn", + "google.mu", + "google.mv", + "google.mw", + "google.ne", + "google.nl", + "google.no", + "google.nr", + "google.nu", + "google.pl", + "google.pn", + "google.ps", + "google.pt", + "google.ro", + "google.rs", + "google.ru", + "google.rw", + "google.sc", + "google.se", + "google.sh", + "google.si", + "google.sk", + "google.sm", + "google.sn", + "google.so", + "google.sr", + "google.st", + "google.td", + "google.tg", + "google.tl", + "google.tm", + "google.tn", + "google.to", + "google.tt", + "google.vu", + "google.ws", + ], + "type": "string", + }, + "search_queries": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "description": "Input 1 search query per line (e.g. foo bar).", + "title": "Search Queries", + "widget": "textarea", + }, + "max_pages": { + "default": 1, + "description": ( + "Maximum number of result pages to visit per search query." + ), + "title": "Max Pages", + "type": "integer", + }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." 
+ ), + "title": "Max Requests", + "widget": "request-limit", + }, + }, + "required": ["search_queries"], + "title": "GoogleSearchSpiderParams", + "type": "object", + }, + } + assertEqualSpiderMetadata(actual_metadata, expected_metadata) + + +def test_input_none(): + crawler = get_crawler() + with pytest.raises(ValueError): + GoogleSearchSpider.from_crawler(crawler) + + +@pytest.mark.parametrize( + ("input_domain", "expected_domain"), + ( + (None, "google.com"), + ("google.com", "google.com"), + ("google.cat", "google.cat"), + ), +) +def test_domain(input_domain, expected_domain): + crawler = get_crawler() + kwargs = {} + if input_domain: + kwargs["domain"] = input_domain + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", **kwargs + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == f"https://www.{expected_domain}/search?q=foo+bar" + + +def test_search_queries(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar\nbaz") + requests = list(spider.start_requests()) + assert len(requests) == 2 + assert requests[0].url == "https://www.google.com/search?q=foo+bar" + assert requests[1].url == "https://www.google.com/search?q=baz" diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..c18cb9b --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,19 @@ +import json + + +def assertEqualSpiderMetadata(actual, expected): + """Compare 2 JSON schemas of spider metadata. + + The parameter order in the parameter schema is taken into account, given + how it affects the UI, while the order of other object keys may be + different. + + It also generates a better diff in pytest output when enums are involved, + e.g. geolocation values. + """ + assert tuple(actual["param_schema"]["properties"]) == tuple( + expected["param_schema"]["properties"] + ) + actual_json = json.dumps(actual, indent=2, sort_keys=True) + expected_json = json.dumps(expected, indent=2, sort_keys=True) + assert actual_json == expected_json diff --git a/tox.ini b/tox.ini index ce4287d..a88f936 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ deps = scrapy-poet==0.21.0 scrapy-spider-metadata==0.1.2 scrapy-zyte-api[provider]==0.16.0 - zyte-common-items==0.13.0 + zyte-common-items==0.22.0 [testenv:mypy] deps = diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index e3de8c9..75bfbde 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,2 +1,3 @@ from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider +from .spiders.serp import GoogleSearchSpider diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 5a40872..2cd8019 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -1,6 +1,7 @@ import json import logging import warnings +from collections import defaultdict from datetime import datetime from typing import Any, Dict from warnings import warn @@ -28,13 +29,6 @@ class CrawlingLogsMiddleware: the fingerprints logged in Scrapy Cloud's request data. 
""" - valid_page_types = [ - "product", - "nextPage", - "subCategories", - "productNavigation", - "productNavigation-heuristics", - ] unknown_page_type = "unknown" @classmethod @@ -82,12 +76,9 @@ def crawl_logs(self, response, result): "probability" ), }, - "to_crawl": {}, + "to_crawl": defaultdict(list), } - for page_type in self.valid_page_types + [self.unknown_page_type]: - data["to_crawl"][page_type] = [] - if result: for entry in result: if not isinstance(entry, Request): @@ -104,14 +95,17 @@ def crawl_logs(self, response, result): ) page_type = crawling_logs.get("page_type") - if page_type not in self.valid_page_types: + if not page_type: page_type = self.unknown_page_type data["to_crawl"][page_type].append(crawling_logs) - summary = ["Number of Requests per page type:"] - for page_type, requests in data["to_crawl"].items(): - summary.append(f"- {page_type}: {len(requests)}") + if data["to_crawl"]: + summary = ["Number of Requests per page type:"] + for page_type, requests in data["to_crawl"].items(): + summary.append(f"- {page_type}: {len(requests)}") + else: + summary = ["Nothing to crawl."] report = [ f"Crawling Logs for {response.url} (parsed as: {current_page_type}):", diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index ca1158b..f3190ab 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -4,6 +4,7 @@ from logging import getLogger from typing import Dict, List, Optional, Union +import requests from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator try: @@ -18,7 +19,7 @@ ) from zyte_spider_templates.documentation import document_enum -from .utils import _URL_PATTERN +from .utils import _URL_PATTERN, get_domain, load_url_list logger = getLogger(__name__) @@ -119,39 +120,108 @@ def validate_input_group(model): return model +URLS_FILE_FIELD_KWARGS = { + "title": "URLs file", + "description": ( + "URL that point to a plain-text file with a list of URLs to " + "crawl, e.g. https://example.com/url-list.txt. The linked file " + "must contain 1 URL per line." + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlsFileParam(BaseModel): - urls_file: str = Field( - title="URLs file", - description=( - "URL that point to a plain-text file with a list of URLs to " - "crawl, e.g. https://example.com/url-list.txt. The linked list " - "must contain 1 URL per line." - ), - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] @model_validator(mode="after") def input_group(self): return validate_input_group(self) +def parse_input_params(spider): + urls_file = spider.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + spider.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + spider.start_urls = urls + elif spider.args.urls: + spider.start_urls = spider.args.urls + else: + spider.start_urls = [spider.args.url] + spider.allowed_domains = list(set(get_domain(url) for url in spider.start_urls)) + + +URL_FIELD_KWARGS = { + "title": "URL", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. 
Example: " + "https://toscrape.com/" + ), + "pattern": _URL_PATTERN, + "default": "", + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + }, +} + + class UrlParam(BaseModel): - url: str = Field( - title="URL", - description="Initial URL for the crawl. Enter the full URL including http(s), " - "you can copy and paste it from your browser. Example: https://toscrape.com/", - pattern=_URL_PATTERN, - default="", - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - }, - ) + url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + + +URLS_FIELD_KWARGS = { + "title": "URLs", + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://toscrape.com/" + ), + "default": None, + "json_schema_extra": { + "group": "inputs", + "exclusiveRequired": True, + "widget": "textarea", + }, +} + + +def validate_url_list(value: Union[List[str], str]) -> List[str]: + """Validate a list of URLs. + + If a string is received as input, it is split into multiple strings + on new lines. + + List items that do not match a URL pattern trigger a warning and are + removed from the list. If all URLs are invalid, validation fails. + """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + v = v.strip() + if not v: + continue + if not re.search(_URL_PATTERN, v): + logger.warning( + f"{v!r}, from the 'urls' spider argument, is not a " + f"valid URL and will be ignored." + ) + continue + result.append(v) + if not result: + raise ValueError(f"No valid URL found in {value!r}") + return result @model_validator(mode="after") def input_group(self): @@ -159,20 +229,7 @@ def input_group(self): class UrlsParam(BaseModel): - urls: Optional[List[str]] = Field( - title="URLs", - description=( - "Initial URLs for the crawl, separated by new lines. Enter the " - "full URL including http(s), you can copy and paste it from your " - "browser. Example: https://toscrape.com/" - ), - default=None, - json_schema_extra={ - "group": "inputs", - "exclusiveRequired": True, - "widget": "textarea", - }, - ) + urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] @model_validator(mode="after") def input_group(self): @@ -181,33 +238,7 @@ def input_group(self): @field_validator("urls", mode="before") @classmethod def validate_url_list(cls, value: Union[List[str], str]) -> List[str]: - """Validate a list of URLs. - - If a string is received as input, it is split into multiple strings - on new lines. - - List items that do not match a URL pattern trigger a warning and are - removed from the list. If all URLs are invalid, validation fails. - """ - if isinstance(value, str): - value = value.split("\n") - if not value: - return value - result = [] - for v in value: - v = v.strip() - if not v: - continue - if not re.search(_URL_PATTERN, v): - logger.warning( - f"{v!r}, from the 'urls' spider argument, is not a " - f"valid URL and will be ignored." 
- ) - continue - result.append(v) - if not result: - raise ValueError(f"No valid URL found in {value!r}") - return result + return validate_url_list(value) class PostalAddress(BaseModel): diff --git a/zyte_spider_templates/spiders/_google_domains.py b/zyte_spider_templates/spiders/_google_domains.py new file mode 100644 index 0000000..b38d582 --- /dev/null +++ b/zyte_spider_templates/spiders/_google_domains.py @@ -0,0 +1,193 @@ +from enum import Enum + + +# https://www.google.com/supported_domains +# Sorted alphabetically, except for keeping the main domain first. +class GoogleDomain(str, Enum): + google_com: str = "google.com" + google_ad: str = "google.ad" + google_ae: str = "google.ae" + google_al: str = "google.al" + google_am: str = "google.am" + google_as: str = "google.as" + google_at: str = "google.at" + google_az: str = "google.az" + google_ba: str = "google.ba" + google_be: str = "google.be" + google_bf: str = "google.bf" + google_bg: str = "google.bg" + google_bi: str = "google.bi" + google_bj: str = "google.bj" + google_bs: str = "google.bs" + google_bt: str = "google.bt" + google_by: str = "google.by" + google_ca: str = "google.ca" + google_cat: str = "google.cat" + google_cd: str = "google.cd" + google_cf: str = "google.cf" + google_cg: str = "google.cg" + google_ch: str = "google.ch" + google_ci: str = "google.ci" + google_cl: str = "google.cl" + google_cm: str = "google.cm" + google_cn: str = "google.cn" + google_co_ao: str = "google.co.ao" + google_co_bw: str = "google.co.bw" + google_co_ck: str = "google.co.ck" + google_co_cr: str = "google.co.cr" + google_co_id: str = "google.co.id" + google_co_il: str = "google.co.il" + google_co_in: str = "google.co.in" + google_co_jp: str = "google.co.jp" + google_co_ke: str = "google.co.ke" + google_co_kr: str = "google.co.kr" + google_co_ls: str = "google.co.ls" + google_co_ma: str = "google.co.ma" + google_co_mz: str = "google.co.mz" + google_co_nz: str = "google.co.nz" + google_co_th: str = "google.co.th" + google_co_tz: str = "google.co.tz" + google_co_ug: str = "google.co.ug" + google_co_uk: str = "google.co.uk" + google_co_uz: str = "google.co.uz" + google_co_ve: str = "google.co.ve" + google_co_vi: str = "google.co.vi" + google_co_za: str = "google.co.za" + google_co_zm: str = "google.co.zm" + google_co_zw: str = "google.co.zw" + google_com_af: str = "google.com.af" + google_com_ag: str = "google.com.ag" + google_com_ar: str = "google.com.ar" + google_com_au: str = "google.com.au" + google_com_bd: str = "google.com.bd" + google_com_bh: str = "google.com.bh" + google_com_bn: str = "google.com.bn" + google_com_bo: str = "google.com.bo" + google_com_br: str = "google.com.br" + google_com_bz: str = "google.com.bz" + google_com_co: str = "google.com.co" + google_com_cu: str = "google.com.cu" + google_com_cy: str = "google.com.cy" + google_com_do: str = "google.com.do" + google_com_ec: str = "google.com.ec" + google_com_eg: str = "google.com.eg" + google_com_et: str = "google.com.et" + google_com_fj: str = "google.com.fj" + google_com_gh: str = "google.com.gh" + google_com_gi: str = "google.com.gi" + google_com_gt: str = "google.com.gt" + google_com_hk: str = "google.com.hk" + google_com_jm: str = "google.com.jm" + google_com_kh: str = "google.com.kh" + google_com_kw: str = "google.com.kw" + google_com_lb: str = "google.com.lb" + google_com_ly: str = "google.com.ly" + google_com_mm: str = "google.com.mm" + google_com_mt: str = "google.com.mt" + google_com_mx: str = "google.com.mx" + google_com_my: str = "google.com.my" + 
google_com_na: str = "google.com.na" + google_com_ng: str = "google.com.ng" + google_com_ni: str = "google.com.ni" + google_com_np: str = "google.com.np" + google_com_om: str = "google.com.om" + google_com_pa: str = "google.com.pa" + google_com_pe: str = "google.com.pe" + google_com_pg: str = "google.com.pg" + google_com_ph: str = "google.com.ph" + google_com_pk: str = "google.com.pk" + google_com_pr: str = "google.com.pr" + google_com_py: str = "google.com.py" + google_com_qa: str = "google.com.qa" + google_com_sa: str = "google.com.sa" + google_com_sb: str = "google.com.sb" + google_com_sg: str = "google.com.sg" + google_com_sl: str = "google.com.sl" + google_com_sv: str = "google.com.sv" + google_com_tj: str = "google.com.tj" + google_com_tr: str = "google.com.tr" + google_com_tw: str = "google.com.tw" + google_com_ua: str = "google.com.ua" + google_com_uy: str = "google.com.uy" + google_com_vc: str = "google.com.vc" + google_com_vn: str = "google.com.vn" + google_cv: str = "google.cv" + google_cz: str = "google.cz" + google_de: str = "google.de" + google_dj: str = "google.dj" + google_dk: str = "google.dk" + google_dm: str = "google.dm" + google_dz: str = "google.dz" + google_ee: str = "google.ee" + google_es: str = "google.es" + google_fi: str = "google.fi" + google_fm: str = "google.fm" + google_fr: str = "google.fr" + google_ga: str = "google.ga" + google_ge: str = "google.ge" + google_gg: str = "google.gg" + google_gl: str = "google.gl" + google_gm: str = "google.gm" + google_gr: str = "google.gr" + google_gy: str = "google.gy" + google_hn: str = "google.hn" + google_hr: str = "google.hr" + google_ht: str = "google.ht" + google_hu: str = "google.hu" + google_ie: str = "google.ie" + google_im: str = "google.im" + google_iq: str = "google.iq" + google_is: str = "google.is" + google_it: str = "google.it" + google_je: str = "google.je" + google_jo: str = "google.jo" + google_kg: str = "google.kg" + google_ki: str = "google.ki" + google_kz: str = "google.kz" + google_la: str = "google.la" + google_li: str = "google.li" + google_lk: str = "google.lk" + google_lt: str = "google.lt" + google_lu: str = "google.lu" + google_lv: str = "google.lv" + google_md: str = "google.md" + google_me: str = "google.me" + google_mg: str = "google.mg" + google_mk: str = "google.mk" + google_ml: str = "google.ml" + google_mn: str = "google.mn" + google_mu: str = "google.mu" + google_mv: str = "google.mv" + google_mw: str = "google.mw" + google_ne: str = "google.ne" + google_nl: str = "google.nl" + google_no: str = "google.no" + google_nr: str = "google.nr" + google_nu: str = "google.nu" + google_pl: str = "google.pl" + google_pn: str = "google.pn" + google_ps: str = "google.ps" + google_pt: str = "google.pt" + google_ro: str = "google.ro" + google_rs: str = "google.rs" + google_ru: str = "google.ru" + google_rw: str = "google.rw" + google_sc: str = "google.sc" + google_se: str = "google.se" + google_sh: str = "google.sh" + google_si: str = "google.si" + google_sk: str = "google.sk" + google_sm: str = "google.sm" + google_sn: str = "google.sn" + google_so: str = "google.so" + google_sr: str = "google.sr" + google_st: str = "google.st" + google_td: str = "google.td" + google_tg: str = "google.tg" + google_tl: str = "google.tl" + google_tm: str = "google.tm" + google_tn: str = "google.tn" + google_to: str = "google.to" + google_tt: str = "google.tt" + google_vu: str = "google.vu" + google_ws: str = "google.ws" diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index 
bd5eae7..deb00ee 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -67,13 +67,13 @@ class BaseSpider(scrapy.Spider): def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super().from_crawler(crawler, *args, **kwargs) - if spider.args.geolocation: + if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object # additional requests. for component in ("AUTOMAP", "PROVIDER"): default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") - default_params["geolocation"] = spider.args.geolocation + default_params["geolocation"] = geolocation spider.settings.set( f"ZYTE_API_{component}_PARAMS", default_params, diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index eefb43e..3868649 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -1,7 +1,6 @@ from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Union -import requests import scrapy from pydantic import BaseModel, ConfigDict, Field from scrapy import Request @@ -11,6 +10,7 @@ from zyte_common_items import ProbabilityRequest, Product, ProductNavigation from zyte_spider_templates.heuristics import is_homepage +from zyte_spider_templates.params import parse_input_params from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, INPUT_GROUP, @@ -27,7 +27,6 @@ UrlsFileParam, UrlsParam, ) -from ..utils import load_url_list @document_enum @@ -62,7 +61,7 @@ class EcommerceCrawlStrategy(str, Enum): class EcommerceCrawlStrategyParam(BaseModel): crawl_strategy: EcommerceCrawlStrategy = Field( - title="Crawl strategy", + title="Crawl Strategy", description="Determines how the start URL and follow-up URLs are crawled.", default=EcommerceCrawlStrategy.automatic, json_schema_extra={ @@ -149,23 +148,10 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs) - spider._init_input() + parse_input_params(spider) spider._init_extract_from() return spider - def _init_input(self): - urls_file = self.args.urls_file - if urls_file: - response = requests.get(urls_file) - urls = load_url_list(response.text) - self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") - self.start_urls = urls - elif self.args.urls: - self.start_urls = self.args.urls - else: - self.start_urls = [self.args.url] - self.allowed_domains = list(set(get_domain(url) for url in self.start_urls)) - def _init_extract_from(self): if self.args.extract_from is not None: self.settings.set( diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py new file mode 100644 index 0000000..cbf9554 --- /dev/null +++ b/zyte_spider_templates/spiders/serp.py @@ -0,0 +1,127 @@ +from typing import Any, Dict, Iterable, List, Optional, Union + +from pydantic import BaseModel, Field, field_validator +from scrapy import Request +from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings +from scrapy_spider_metadata import Args +from w3lib.url import add_or_replace_parameter +from zyte_common_items import Serp + +from ..params import MaxRequestsParam +from ._google_domains import GoogleDomain +from .base import BaseSpider + + +class 
SearchQueriesParam(BaseModel): + search_queries: Optional[List[str]] = Field( + title="Search Queries", + description="Input 1 search query per line (e.g. foo bar).", + json_schema_extra={ + "widget": "textarea", + }, + ) + + @field_validator("search_queries", mode="before") + @classmethod + def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search queries. + If a string is received as input, it is split into multiple strings + on new lines. + """ + if isinstance(value, str): + value = value.split("\n") + result = [] + for v in value: + if v := v.strip(): + result.append(v) + if not result: + raise ValueError("The search_queries parameter value is missing or empty.") + return result + + +class SerpMaxPagesParam(BaseModel): + max_pages: int = Field( + title="Max Pages", + description="Maximum number of result pages to visit per search query.", + default=1, + ) + + +class GoogleDomainParam(BaseModel): + domain: GoogleDomain = Field( + title="Domain", + description="Target Google domain.", + default=GoogleDomain.google_com, + ) + + +class GoogleSearchSpiderParams( + MaxRequestsParam, + SerpMaxPagesParam, + SearchQueriesParam, + GoogleDomainParam, + BaseModel, +): + pass + + +class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): + """Yield results from Google searches. + + See :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams` + for supported parameters. + + .. seealso:: :ref:`google-search`. + """ + + name = "google_search" + + metadata: Dict[str, Any] = { + **BaseSpider.metadata, + "title": "Google Search Results", + "description": "Template for spiders that extract Google search results.", + } + + @classmethod + def update_settings(cls, settings: BaseSettings) -> None: + super().update_settings(settings) + retry_policy_setting_priority = settings.getpriority("ZYTE_API_RETRY_POLICY") + if ( + retry_policy_setting_priority is None + or retry_policy_setting_priority < SETTINGS_PRIORITIES["spider"] + ): + settings.set( + "ZYTE_API_RETRY_POLICY", + "zyte_api.aggressive_retrying", + priority="spider", + ) + + def get_start_request(self, url): + return Request( + url=url, + callback=self.parse_serp, + meta={ + "crawling_logs": {"page_type": "serp"}, + "zyte_api": { + "serp": True, + }, + }, + ) + + def start_requests(self) -> Iterable[Request]: + search_queries = self.args.search_queries + if not search_queries: + raise ValueError("No search queries specified.") + + url = f"https://www.{self.args.domain.value}/search" + for search_query in search_queries: + search_url = add_or_replace_parameter(url, "q", search_query) + for start in range(0, self.args.max_pages * 10, 10): + if start: + search_url = add_or_replace_parameter( + search_url, "start", str(start) + ) + yield self.get_start_request(search_url) + + def parse_serp(self, response) -> Iterable[Serp]: + yield Serp.from_dict(response.raw_api_response["serp"])
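Illustrative sketch (not part of the patch): the pagination logic of ``GoogleSearchSpider.start_requests`` shown standalone. The ``w3lib`` helper and the loop come from ``zyte_spider_templates/spiders/serp.py`` above; the concrete values (``search_queries="foo bar"``, ``max_pages=3``) are example inputs, and the first URL matches the expectation asserted in ``tests/test_serp.py``::

    from w3lib.url import add_or_replace_parameter

    # Mirrors GoogleSearchSpider.start_requests(): one request per result page,
    # with Google's "start" query parameter selecting the page offset.
    url = add_or_replace_parameter("https://www.google.com/search", "q", "foo bar")
    for start in range(0, 3 * 10, 10):  # max_pages=3
        if start:
            url = add_or_replace_parameter(url, "start", str(start))
        print(url)
    # https://www.google.com/search?q=foo+bar
    # https://www.google.com/search?q=foo+bar&start=10
    # https://www.google.com/search?q=foo+bar&start=20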