-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SERP (MVP) and 0.9.0 release notes (#62)
- Loading branch information
Showing
23 changed files
with
951 additions
and
192 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
.. _google-search: | ||
|
||
================================================= | ||
Google search spider template (``google_search``) | ||
================================================= | ||
|
||
Basic use | ||
========= | ||
|
||
.. code-block:: shell | ||
scrapy crawl google_search -a search_queries="foo bar" | ||
Parameters | ||
========== | ||
|
||
.. autopydantic_model:: zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams | ||
:inherited-members: BaseModel | ||
:exclude-members: model_computed_fields |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,4 @@ | ||
import json | ||
import logging | ||
import re | ||
from unittest.mock import MagicMock, call, patch | ||
|
||
import pytest | ||
|
@@ -11,7 +9,6 @@ | |
from scrapy_spider_metadata import get_spider_metadata | ||
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request | ||
|
||
from zyte_spider_templates import BaseSpiderParams | ||
from zyte_spider_templates._geolocations import ( | ||
GEOLOCATION_OPTIONS, | ||
GEOLOCATION_OPTIONS_WITH_CODE, | ||
|
@@ -24,6 +21,7 @@ | |
|
||
from . import get_crawler | ||
from .test_utils import URL_TO_DOMAIN | ||
from .utils import assertEqualSpiderMetadata | ||
|
||
|
||
def test_parameters(): | ||
|
@@ -362,21 +360,6 @@ def test_arguments(): | |
assert spider.allowed_domains == ["example.com"] | ||
|
||
|
||
def assertEqualJson(actual, expected): | ||
"""Compare the JSON representation of 2 Python objects. | ||
This allows to take into account things like the order of key-value pairs | ||
in dictionaries, which would not be taken into account when comparing | ||
dictionaries directly. | ||
It also generates a better diff in pytest output when enums are involved, | ||
e.g. geolocation values. | ||
""" | ||
actual_json = json.dumps(actual, indent=2) | ||
expected_json = json.dumps(expected, indent=2) | ||
assert actual_json == expected_json | ||
|
||
|
||
def test_metadata(): | ||
actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True) | ||
expected_metadata = { | ||
|
@@ -428,7 +411,7 @@ def test_metadata(): | |
"description": ( | ||
"URL that point to a plain-text file with a list of " | ||
"URLs to crawl, e.g. " | ||
"https://example.com/url-list.txt. The linked list " | ||
"https://example.com/url-list.txt. The linked file " | ||
"must contain 1 URL per line." | ||
), | ||
"exclusiveRequired": True, | ||
|
@@ -480,7 +463,7 @@ def test_metadata(): | |
"title": "Pagination Only", | ||
}, | ||
}, | ||
"title": "Crawl strategy", | ||
"title": "Crawl Strategy", | ||
"enum": [ | ||
"automatic", | ||
"full", | ||
|
@@ -550,60 +533,14 @@ def test_metadata(): | |
"type": "object", | ||
}, | ||
} | ||
assertEqualJson(actual_metadata, expected_metadata) | ||
assertEqualSpiderMetadata(actual_metadata, expected_metadata) | ||
|
||
geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] | ||
assert geolocation["enum"][0] == "AF" | ||
assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"} | ||
assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"valid,url", | ||
[ | ||
(False, ""), | ||
(False, "http://"), | ||
(False, "http:/example.com"), | ||
(False, "ftp://example.com"), | ||
(False, "example.com"), | ||
(False, "//example.com"), | ||
(False, "http://foo:[email protected]"), | ||
(False, " http://example.com"), | ||
(False, "http://example.com "), | ||
(False, "http://examp le.com"), | ||
(False, "https://example.com:232323"), | ||
(True, "http://example.com"), | ||
(True, "http://bücher.example"), | ||
(True, "http://xn--bcher-kva.example"), | ||
(True, "https://i❤.ws"), | ||
(True, "https://example.com"), | ||
(True, "https://example.com/"), | ||
(True, "https://example.com:2323"), | ||
(True, "https://example.com:2323/"), | ||
(True, "https://example.com:2323/foo"), | ||
(True, "https://example.com/f"), | ||
(True, "https://example.com/foo"), | ||
(True, "https://example.com/foo/"), | ||
(True, "https://example.com/foo/bar"), | ||
(True, "https://example.com/foo/bar/"), | ||
(True, "https://example.com/foo/bar?baz"), | ||
(True, "https://example.com/foo/bar/?baz"), | ||
(True, "https://example.com?foo"), | ||
(True, "https://example.com?foo=bar"), | ||
(True, "https://example.com/?foo=bar&baz"), | ||
(True, "https://example.com/?foo=bar&baz#"), | ||
(True, "https://example.com/?foo=bar&baz#frag"), | ||
(True, "https://example.com#"), | ||
(True, "https://example.com/#"), | ||
(True, "https://example.com/&"), | ||
(True, "https://example.com/&#"), | ||
], | ||
) | ||
def test_validation_url(url, valid): | ||
url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern | ||
assert bool(re.match(url_re, url)) == valid | ||
|
||
|
||
def test_get_parse_product_request(): | ||
base_kwargs = { | ||
"url": "https://example.com", | ||
|
@@ -818,7 +755,7 @@ def test_urls_file(): | |
crawler = get_crawler() | ||
url = "https://example.com" | ||
|
||
with patch("zyte_spider_templates.spiders.ecommerce.requests.get") as mock_get: | ||
with patch("zyte_spider_templates.params.requests.get") as mock_get: | ||
response = requests.Response() | ||
response._content = ( | ||
b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" | ||
|
Oops, something went wrong.