From e88ee2bb943aa73331fb845107107f080ec4dde4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 13:43:43 +0100 Subject: [PATCH 01/36] first running draft --- docs/4_how_to_filter_articles.md | 3 +- docs/5_how_to_search_for_publishers.md | 3 +- docs/how_to_add_a_publisher.md | 10 +- src/fundus/__init__.py | 8 +- src/fundus/publishers/at/__init__.py | 2 +- src/fundus/publishers/base_objects.py | 13 +- src/fundus/publishers/de/__init__.py | 2 +- src/fundus/publishers/fr/__init__.py | 3 +- src/fundus/publishers/na/__init__.py | 2 +- src/fundus/publishers/uk/__init__.py | 2 +- src/fundus/publishers/us/__init__.py | 2 +- src/fundus/scraping/common_crawl/__init__.py | 3 - src/fundus/scraping/common_crawl/html.py | 92 ---- src/fundus/scraping/common_crawl/pipeline.py | 300 ------------- src/fundus/scraping/common_crawl/scraper.py | 48 --- src/fundus/scraping/delay.py | 26 ++ src/fundus/scraping/html.py | 419 ++++++------------ src/fundus/scraping/pipeline.py | 428 ++++++++++++------- src/fundus/scraping/scraper.py | 100 +++-- src/fundus/scraping/session.py | 108 +++++ src/fundus/scraping/url.py | 139 ++++++ 21 files changed, 764 insertions(+), 949 deletions(-) delete mode 100644 src/fundus/scraping/common_crawl/__init__.py delete mode 100644 src/fundus/scraping/common_crawl/html.py delete mode 100644 src/fundus/scraping/common_crawl/pipeline.py delete mode 100644 src/fundus/scraping/common_crawl/scraper.py create mode 100644 src/fundus/scraping/delay.py create mode 100644 src/fundus/scraping/session.py create mode 100644 src/fundus/scraping/url.py diff --git a/docs/4_how_to_filter_articles.md b/docs/4_how_to_filter_articles.md index d6233d18f..786e6a387 100644 --- a/docs/4_how_to_filter_articles.md +++ b/docs/4_how_to_filter_articles.md @@ -184,7 +184,8 @@ You can preselect the source for your articles when initializing a new `Crawler` Let's initiate a crawler who only crawls from `NewsMaps`'s. ````python -from fundus import Crawler, PublisherCollection, NewsMap +from fundus import Crawler, PublisherCollection +from fundus.scraping.url import NewsMap crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap]) ```` diff --git a/docs/5_how_to_search_for_publishers.md b/docs/5_how_to_search_for_publishers.md index bbbbf79ee..3d09bb5f5 100644 --- a/docs/5_how_to_search_for_publishers.md +++ b/docs/5_how_to_search_for_publishers.md @@ -15,7 +15,8 @@ You can search through the collection to get only publishers fitting your use ca Let's get some publishers based in the US, supporting an attribute called `topics` and `NewsMap` as a source, and use them to initialize a crawler afterward. ````python -from fundus import Crawler, PublisherCollection, NewsMap +from fundus import Crawler, PublisherCollection +from fundus.scraping.url import NewsMap fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap]) crawler = Crawler(fitting_publishers) diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md index 0c9cf14cb..d7c97fa1d 100644 --- a/docs/how_to_add_a_publisher.md +++ b/docs/how_to_add_a_publisher.md @@ -106,8 +106,11 @@ To instantiate an object inheriting from URLSource like `RSSFeed` or `Sitemap`, Getting links for RSS feeds can vary from publisher to publisher. Most of the time, you can find them through a quick browser search. 
Building an `RSSFeed` looks like this: + ````python -from fundus.scraping.html import RSSFeed + +from fundus import RSSFeed + RSSFeed("https://theintercept.com/feed/?rss") ```` @@ -159,8 +162,11 @@ You can alter this behavior or reverse the order in which sitemaps are processed **_NOTE:_** If you wonder why you should reverse your sources from time to time, `URLSource`'s should, if possible, yield URLs in descending order by publishing date. Now building a new `URLSource` for a `NewsMap` covering the LA Times looks like this: + ````python -from fundus.scraping.html import NewsMap + +from fundus import NewsMap + NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True) ```` diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 07fa55e96..fb5e46405 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -2,10 +2,9 @@ import sys from fundus.publishers import PublisherCollection -from fundus.scraping.common_crawl import CCNewsCrawler from fundus.scraping.filter import Requires -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap -from fundus.scraping.pipeline import BaseCrawler, Crawler +from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] @@ -16,9 +15,6 @@ "CCNewsCrawler", "PublisherCollection", "Requires", - "RSSFeed", - "Sitemap", - "NewsMap", ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, diff --git a/src/fundus/publishers/at/__init__.py b/src/fundus/publishers/at/__init__.py index 07e7a35ee..970da6295 100644 --- a/src/fundus/publishers/at/__init__.py +++ b/src/fundus/publishers/at/__init__.py @@ -1,5 +1,5 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec -from fundus.scraping.html import RSSFeed +from fundus.scraping.url import RSSFeed from .orf import OrfParser diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py index 198464bdd..c1a0505eb 100644 --- a/src/fundus/publishers/base_objects.py +++ b/src/fundus/publishers/base_objects.py @@ -5,7 +5,7 @@ from fundus.parser.base_parser import ParserProxy from fundus.scraping.filter import URLFilter -from fundus.scraping.html import FundusSource, NewsMap, RSSFeed, Sitemap, URLSource +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource from fundus.utils.iteration import iterate_all_subclasses @@ -34,10 +34,11 @@ def __init__(self, spec: PublisherSpec): self.parser = spec.parser() self.publisher_name = spec.name self.url_filter = spec.url_filter + self.request_header = spec.request_header # we define the dict here manually instead of using default dict so that we can control # the order in which sources are proceeded. - source_mapping: Dict[Type[URLSource], List[FundusSource]] = { + source_mapping: Dict[Type[URLSource], List[URLSource]] = { RSSFeed: [], NewsMap: [], Sitemap: [], @@ -49,13 +50,7 @@ def __init__(self, spec: PublisherSpec): f"Unexpected type '{type(url_source).__name__}' as source for {self.name}. 
" f"Allowed are '{', '.join(cls.__name__ for cls in iterate_all_subclasses(URLSource))}'" ) - source: FundusSource = FundusSource( - url_source=url_source, - publisher=self.publisher_name, - url_filter=spec.url_filter, - request_header=spec.request_header, - ) - source_mapping[type(url_source)].append(source) + source_mapping[type(url_source)].append(url_source) self.source_mapping = source_mapping diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 14192bde1..112805fd9 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -2,7 +2,7 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import regex_filter -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap from .berliner_zeitung import BerlinerZeitungParser from .bild import BildParser diff --git a/src/fundus/publishers/fr/__init__.py b/src/fundus/publishers/fr/__init__.py index 2c6f3e868..71445369d 100644 --- a/src/fundus/publishers/fr/__init__.py +++ b/src/fundus/publishers/fr/__init__.py @@ -1,7 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.publishers.fr.le_monde import LeMondeParser -from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, Sitemap +from fundus.scraping.url import NewsMap, Sitemap class FR(PublisherEnum): diff --git a/src/fundus/publishers/na/__init__.py b/src/fundus/publishers/na/__init__.py index 8a7ee1b7f..bfb8e354f 100644 --- a/src/fundus/publishers/na/__init__.py +++ b/src/fundus/publishers/na/__init__.py @@ -1,6 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap from .the_namibian import TheNamibianParser diff --git a/src/fundus/publishers/uk/__init__.py b/src/fundus/publishers/uk/__init__.py index 2f19868f3..e4e14051d 100644 --- a/src/fundus/publishers/uk/__init__.py +++ b/src/fundus/publishers/uk/__init__.py @@ -2,7 +2,7 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, Sitemap +from fundus.scraping.url import NewsMap, Sitemap from .i_news import INewsParser from .the_guardian import TheGuardianParser diff --git a/src/fundus/publishers/us/__init__.py b/src/fundus/publishers/us/__init__.py index 8f52892ae..a6ce67906 100644 --- a/src/fundus/publishers/us/__init__.py +++ b/src/fundus/publishers/us/__init__.py @@ -1,6 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap from .ap_news import APNewsParser from .cnbc import CNBCParser diff --git a/src/fundus/scraping/common_crawl/__init__.py b/src/fundus/scraping/common_crawl/__init__.py deleted file mode 100644 index cf839eecf..000000000 --- a/src/fundus/scraping/common_crawl/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .pipeline import CCNewsCrawler - -__all__ = ["CCNewsCrawler"] diff --git a/src/fundus/scraping/common_crawl/html.py b/src/fundus/scraping/common_crawl/html.py deleted file mode 100644 index 2bca7b893..000000000 --- a/src/fundus/scraping/common_crawl/html.py 
+++ /dev/null @@ -1,92 +0,0 @@ -from typing import Dict, Iterator, Optional -from urllib.parse import urlparse - -import chardet -import requests -from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType - -from fundus.logging import basic_logger -from fundus.publishers.base_objects import PublisherEnum -from fundus.scraping.filter import URLFilter -from fundus.scraping.html import HTML, WarcSource, _default_header - - -class CCNewsSource: - def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): - self.publishers = publishers - self.warc_path = warc_path - self.headers = headers or _default_header - - self._publisher_mapping: Dict[str, PublisherEnum] = { - urlparse(publisher.domain).netloc: publisher for publisher in publishers - } - - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: - def extract_content(record: WarcRecord) -> Optional[str]: - warc_body: bytes = record.reader.read() - - try: - return str(warc_body, encoding=record.http_charset) - except (UnicodeDecodeError, TypeError): - encoding: Optional[str] = chardet.detect(warc_body)["encoding"] - - if encoding is not None: - basic_logger.debug( - f"Trying to decode record {record.record_id!r} from {target_url!r} " - f"using detected encoding {encoding}." - ) - - try: - return str(warc_body, encoding=encoding) - except UnicodeDecodeError: - basic_logger.warning( - f"Couldn't decode record {record.record_id!r} from {target_url!r} with " - f"original charset {record.http_charset!r} using detected charset {encoding!r}." - ) - else: - basic_logger.warning( - f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} " - f"with invalid original charset {record.http_charset!r}." - ) - - return None - - with requests.Session() as session: - stream = session.get(self.warc_path, stream=True, headers=self.headers).raw - - for warc_record in ArchiveIterator(stream, record_types=WarcRecordType.response, verify_digests=True): - target_url = str(warc_record.headers["WARC-Target-URI"]) - - if url_filter is not None and url_filter(target_url): - basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter") - continue - - publisher_domain: str = urlparse(target_url).netloc - - if publisher_domain not in self._publisher_mapping: - continue - - publisher = self._publisher_mapping[publisher_domain] - - if publisher.url_filter is not None and publisher.url_filter(target_url): - basic_logger.debug( - f"Skipped WARC record with target URI {target_url!r} because of " - f"publisher specific URL filter" - ) - continue - - if (content := extract_content(warc_record)) is None: - continue - - yield HTML( - requested_url=target_url, - responded_url=target_url, - content=content, - crawl_date=warc_record.record_date, - source=WarcSource( - publisher=publisher.publisher_name, - warc_path=self.warc_path, - warc_headers=dict(warc_record.headers), - http_headers=dict(warc_record.http_headers), - ), - ) diff --git a/src/fundus/scraping/common_crawl/pipeline.py b/src/fundus/scraping/common_crawl/pipeline.py deleted file mode 100644 index 4aa66070c..000000000 --- a/src/fundus/scraping/common_crawl/pipeline.py +++ /dev/null @@ -1,300 +0,0 @@ -from __future__ import annotations - -import gzip -import os -import re -from datetime import datetime -from functools import lru_cache, partial, wraps -from multiprocessing import Manager -from multiprocessing.context import TimeoutError -from multiprocessing.pool import MapResult, Pool, 
ThreadPool -from queue import Empty, Queue -from typing import ( - Any, - Callable, - Generic, - Iterator, - List, - Literal, - Optional, - Pattern, - Set, - Tuple, - TypeVar, - Union, - cast, -) - -import dill -import more_itertools -import requests -from dateutil.rrule import MONTHLY, rrule -from tqdm import tqdm -from typing_extensions import ParamSpec - -from fundus.publishers.base_objects import PublisherEnum -from fundus.scraping.article import Article -from fundus.scraping.common_crawl.html import CCNewsSource -from fundus.scraping.common_crawl.scraper import CCNewsScraper -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter - -_T = TypeVar("_T") -_P = ParamSpec("_P") - - -# noinspection PyPep8Naming -class dill_wrapper(Generic[_P, _T]): - def __init__(self, target: Callable[_P, _T]): - """Wraps function in dill serialization. - - This is in order to use unpickable functions within multiprocessing. - - Args: - target: The function to wrap. - """ - self._serialized_target: bytes = dill.dumps(target) - - @lru_cache - def _deserialize(self) -> Callable[_P, _T]: - return cast(Callable[_P, _T], dill.loads(self._serialized_target)) - - def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: - return self._deserialize()(*args, **kwargs) - - -def queue_wrapper(queue: Queue[_T], target: Callable[_P, Iterator[_T]]) -> Callable[_P, None]: - """Wraps the target callable to add its results to the queue instead of returning them directly. - - Args: - queue: The buffer queue. - target: A target callable. - - Returns: - (Callable[_P, None]) The wrapped target. - """ - - @wraps(target) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: - for obj in target(*args, **kwargs): - queue.put(obj) - - return wrapper - - -def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: - """Utility function to iterate exhaustively over a pool queue. - - The underlying iterator of this function repeatedly exhausts the given queue. - Then, if the queue is empty only if all the pool's jobs have finished, the iterator reruns. - Otherwise, it waits for the queue to be populated with the next result from the pool. - - Args: - handle (MapResult[Any]): A handle o the MappedResult of the underling multiprocessing pool. - queue (Queue[_T]): The pool queue. - - Returns: - Iterator[_T]: The iterator over the queue as it is populated. - """ - while True: - try: - yield queue.get(timeout=0.1) - except Empty: - try: - handle.get(timeout=0.1) - except TimeoutError: - continue - return - - -class CCNewsCrawler: - def __init__( - self, - *publishers: PublisherEnum, - processes: int = -1, - server_address: str = "https://data.commoncrawl.org/", - ): - """Initializes a crawler for the CC-NEWS dataset. - - Args: - *publishers: The publishers to crawl. - processes: Number of additional process to use for crawling. - If -1, the number of processes is set to `os.cpu_count()`. - If `os.cpu_count()` is not available, the number of processes is set to 0. - If 0, only the main process is used. Defaults to -1. - server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
- """ - self.publishers = tuple(more_itertools.collapse(publishers)) - self.processes = os.cpu_count() or 0 if processes == -1 else processes - self.server_address = server_address - - def _get_warc_paths(self, start: datetime, end: datetime) -> List[str]: - # Date regex examples: https://regex101.com/r/yDX3G6/1 - date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") - - if start >= end: - raise ValueError("Start date has to be < end date.") - - if start < datetime(2016, 8, 1): - raise ValueError("The default, and earliest possible, start date is 2016/08/01.") - - if end > datetime.now(): - raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") - - date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) - urls: List[str] = [ - f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence - ] - - with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: - - def load_paths(url: str) -> List[str]: - with requests.Session() as session: - paths = gzip.decompress(session.get(url).content).decode("utf-8").split() - bar.update() - return paths - - if self.processes == 0: - nested_warc_paths = [load_paths(url) for url in urls] - else: - # use two threads per process, default two threads per core - max_number_of_threads = self.processes * 2 - - with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: - nested_warc_paths = pool.map(load_paths, urls) - - warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) - - start_strf = start.strftime("%Y%m%d%H%M%S") - end_strf = end.strftime("%Y%m%d%H%M%S") - - def filter_warc_path_by_date(path: str) -> bool: - match: Optional[re.Match[str]] = date_pattern.search(path) - if match is None: - raise AssertionError(f"Invalid WARC path {path!r}") - return start_strf <= match["date"] <= end_strf - - return sorted( - (f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), - reverse=True, - ) - - @staticmethod - def _fetch_articles( - warc_path: str, - publishers: Tuple[PublisherEnum, ...], - error_handling: Literal["suppress", "catch", "raise"], - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, - ) -> Iterator[Article]: - source = CCNewsSource(*publishers, warc_path=warc_path) - scraper = CCNewsScraper(source) - yield from scraper.scrape(error_handling, extraction_filter, url_filter) - - @staticmethod - def _single_crawl(warc_paths: List[str], article_task: Callable[[str], Iterator[Article]]) -> Iterator[Article]: - for warc_path in warc_paths: - yield from article_task(warc_path) - - def _parallel_crawl( - self, warc_paths: List[str], article_task: Callable[[str], Iterator[Article]] - ) -> Iterator[Article]: - # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually - # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all - # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. - with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: - article_queue: Queue[Article] = manager.Queue() - - # Because multiprocessing.Pool does not support iterators as targets, - # we wrap the article_task to write the articles to a queue instead of returning them directly. 
- wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - - # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. - serialized_article_task = dill_wrapper(wrapped_article_task) - - # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. - yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), article_queue) - - def crawl( - self, - start: datetime = datetime(2016, 8, 1), - end: datetime = datetime.now(), - max_articles: Optional[int] = None, - error_handling: Literal["suppress", "catch", "raise"] = "suppress", - only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - url_filter: Optional[URLFilter] = None, - only_unique: bool = True, - ) -> Iterator[Article]: - """Yields articles crawled from the CC-NEWS server. - - This method provides the same functionality as the fundus standard crawler, - except this one fetches articles from the CC-News corpus. - Specify a date range from to to fetch only articles crawled in this range. - The default range is 2016/8/1 -> datetime.now(). - These dates correspond to the crawl date of the CC-News crawler, not the publishing date. - To filter on publishing dates, use the parameter and refer to the docs about filtering articles. - - Args: - start: (datetime): Earliest possible crawl date for retrieved articles. Defaults to 2016/8/1. - end: (datetime): Latest possible crawl date for retrieved articles. Defaults to datetime.now(). - max_articles (Optional[int]): Number of articles to crawl. If there are fewer articles - than max_articles the Iterator will stop before max_articles. If None, all retrievable - articles are returned. Defaults to None. - error_handling (Literal["suppress", "catch", "raise"]): Define how to handle errors - encountered during extraction. If set to "suppress", all errors will be skipped, either - with None values for respective attributes in the extraction or by skipping entire articles. - If set to "catch", errors will be caught as attribute values or, if an entire article fails, - through Article.exception. If set to "raise", all errors encountered during extraction will - be raised. Defaults to "suppress". - only_complete (Union[bool, ExtractionFilter]): Set a callable satisfying the ExtractionFilter - protocol as an extraction filter or use a boolean. If False, all articles will be yielded, - if True, only those with all attributes extracted. Defaults to ExtractionFilter letting - through all articles with at least title, body, and publishing_date set. - url_filter (Optional[URLFilter]): A callable object satisfying the URLFilter protocol to skip - URLs before download. This filter applies on both requested and responded URL. Defaults to None. - only_unique (bool): If set to True, articles yielded will be unique on the responded URL. - Always returns the first encountered article. Defaults to True. - - Returns: - Iterator[Article]: An iterator yielding objects of type Article. 
- """ - - if max_articles == 0: - return - - if max_articles is None: - max_articles = -1 - - def build_extraction_filter() -> Optional[ExtractionFilter]: - if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) - else: - return only_complete - - warc_paths = self._get_warc_paths(start, end) - response_cache: Set[str] = set() - - article_task: Callable[[str], Iterator[Article]] = partial( - self._fetch_articles, - publishers=self.publishers, - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=url_filter, - ) - - if self.processes == 0: - article_iter = self._single_crawl(warc_paths, article_task) - else: - article_iter = self._parallel_crawl(warc_paths, article_task) - - for article_idx, article in enumerate(article_iter, start=1): - if not only_unique or article.html.responded_url not in response_cache: - response_cache.add(article.html.responded_url) - yield article - if article_idx == max_articles: - break diff --git a/src/fundus/scraping/common_crawl/scraper.py b/src/fundus/scraping/common_crawl/scraper.py deleted file mode 100644 index ecae0a559..000000000 --- a/src/fundus/scraping/common_crawl/scraper.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Dict, Iterator, Literal, Optional - -from fundus.logging import basic_logger -from fundus.parser import ParserProxy -from fundus.scraping.article import Article -from fundus.scraping.common_crawl.html import CCNewsSource -from fundus.scraping.filter import ExtractionFilter, URLFilter - - -class CCNewsScraper: - def __init__(self, source: CCNewsSource): - self.source = source - self._parser_mapping: Dict[str, ParserProxy] = { - publisher.publisher_name: publisher.parser for publisher in source.publishers - } - - def scrape( - self, - error_handling: Literal["suppress", "catch", "raise"], - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, - ) -> Iterator[Article]: - # TODO: Once we decided on weather to continue fundus with async functionality or not, refactor this to - # be suitable for a BaseScraper class - for html in self.source.fetch(url_filter): - parser = self._parser_mapping[html.source.publisher] - try: - extracted = parser(html.crawl_date).parse(html.content, error_handling) - - except Exception as err: - if error_handling == "raise": - error_message = f"Run into an error processing article '{html.requested_url}'" - basic_logger.error(error_message) - err.args = (f"{err}\n\n{error_message}",) - raise err - elif error_handling == "catch": - yield Article(html=html, exception=err) - elif error_handling == "suppress": - basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}") - else: - raise ValueError(f"Unknown value '{error_handling}' for parameter '") - - else: - if extraction_filter is not None and extraction_filter(extracted): - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") - else: - article = Article.from_extracted(html=html, extracted=extracted) - yield article diff --git a/src/fundus/scraping/delay.py b/src/fundus/scraping/delay.py new file mode 100644 index 000000000..c83cd6918 --- /dev/null +++ b/src/fundus/scraping/delay.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import random +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class Delay(Protocol): + """Protocol to 
define crawl delays between batches.""" + + def __call__(self) -> float: + """Yields a float specifying the minimum crawler delay for the current article batch in seconds. + + The effective delay does include crawling execution time between batches, + i.e. the effective delay is max(execution_time, delay). + + Examples: + >>> import random + >>> delay: Delay = lambda: random.random() + Will use a random delay in [0, 1) seconds. + + Returns: + float: The delay time in seconds. + + """ + ... diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 607a88d90..ee587f503 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -1,271 +1,34 @@ -import gzip import time -import types -from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from abc import abstractmethod +from dataclasses import dataclass from datetime import datetime -from functools import cached_property -from typing import ( - AsyncIterable, - AsyncIterator, - Callable, - ClassVar, - Dict, - Iterable, - Iterator, - List, - Optional, - Union, -) - -import aiohttp -import feedparser -import lxml.html +from typing import Dict, Iterable, Iterator, List, Optional, Protocol, Union +from urllib.parse import urlparse + +import chardet +import requests import validators -from aiohttp.client_exceptions import ClientError -from aiohttp.http_exceptions import HttpProcessingError -from aiohttp.web_exceptions import HTTPError -from lxml.cssselect import CSSSelector -from lxml.etree import XPath +from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType +from requests import ConnectionError, HTTPError from fundus.logging import basic_logger -from fundus.scraping.filter import URLFilter, inverse -from fundus.utils.more_async import ManagedEventLoop, async_next, make_iterable_async +from fundus.publishers.base_objects import PublisherEnum +from fundus.scraping.delay import Delay +from fundus.scraping.filter import URLFilter +from fundus.scraping.session import _default_header __all__ = [ - "URLSource", - "RSSFeed", - "Sitemap", - "NewsMap", "HTML", + "SourceInfo", + "WarcSourceInfo", + "WebSourceInfo", "HTMLSource", - "WarcSource", "WebSource", - "FundusSource", + "CCNewsSource", ] -_default_header = {"user-agent": "Fundus"} - - -class SessionHandler: - """Object for handling project global aiohttp.ClientSessions - - The session life cycle consists of three steps which can be repeated indefinitely: - Build, Supply, Teardown. - Initially there is no session build within the session handler. When a session is requested - with get_session() either a new one is created with _session_factory() or the session handler's - existing one returned. Every subsequent call to get_session() will return the same - aiohttp.ClientSession object. If close_current_session() is called, the current session will be - tear-downed and the next call to get_session() will build a new session. - """ - - def __init__(self): - self._session: Optional[aiohttp.ClientSession] = None - - @staticmethod - async def _session_factory() -> aiohttp.ClientSession: - """Builds a new ClientSession - - This returns a new client session build from pre-defined configurations - and trace configs set. 
These trace configs are: on_request_start, on_request_end - - Returns: - An new ClientSession - """ - timings: Dict[Optional[str], float] = dict() - - async def on_request_start( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams - ): - timings[params.url.host] = time.time() - - async def on_request_end( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams - ): - assert params.url.host - history = params.response.history - previous_status_codes = [f"({response.status})" for response in history] if history else [] - status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) - basic_logger.debug( - f"{status_code_chain} <{params.method} {params.url!r}> " - f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" - ) - - async def on_request_exception( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams - ): - basic_logger.debug( - f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" - ) - - trace_config = aiohttp.TraceConfig() - trace_config.on_request_start.append(on_request_start) - trace_config.on_request_end.append(on_request_end) - trace_config.on_request_exception.append(on_request_exception) - - _connector = aiohttp.TCPConnector(limit=50) - async_session = aiohttp.ClientSession( - connector=_connector, trace_configs=[trace_config], timeout=aiohttp.ClientTimeout(total=30) - ) - return async_session - - async def get_session(self) -> aiohttp.ClientSession: - """Requests the current build session - - If called for the first time or after close_current_session was called, - this function will build a new session. Every subsequent call will return - the same session object until the session is closed with close_current_session(). 
- - Returns: - aiohttp.ClientSession: The current build session - """ - if not self._session: - self._session = await self._session_factory() - return self._session - - async def close_current_session(self) -> None: - """Tears down the current build session - - Returns: - None - """ - session = await self.get_session() - basic_logger.debug(f"Close session {session}") - await session.close() - self._session = None - - -session_handler = SessionHandler() - - -class _ArchiveDecompressor: - def __init__(self): - self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = {"application/x-gzip": self._decompress_gzip} - - @staticmethod - def _decompress_gzip(compressed_content: bytes) -> bytes: - decompressed_content = gzip.decompress(compressed_content) - return decompressed_content - - def decompress(self, content: bytes, file_format: "str") -> bytes: - decompress_function = self.archive_mapping[file_format] - return decompress_function(content) - - @cached_property - def supported_file_formats(self) -> List[str]: - return list(self.archive_mapping.keys()) - - -@dataclass -class URLSource(AsyncIterable[str], ABC): - url: str - - _request_header: Dict[str, str] = field(default_factory=dict) - - def __post_init__(self): - if not self._request_header: - self._request_header = _default_header - if not validators.url(self.url): - raise ValueError(f"Invalid url '{self.url}'") - - def set_header(self, request_header: Dict[str, str]) -> None: - self._request_header = request_header - - @abstractmethod - def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - pass - - async def __aiter__(self) -> AsyncIterator[str]: - async for url in self._get_pre_filtered_urls(): - yield url - - def get_urls(self, max_urls: int = -1) -> Iterator[str]: - """Returns a generator yielding up to URLs from . - - - Args: - max_urls (int): Number of max URLs to return. Set value is - an upper bound and not necessarily the actual number of - URLs. If set < 0, the source will be exhausted until - StopAsyncIteration is hit. Defaults to -1. - - Yields: - str: The next URL. - """ - async_url_gen = self.__aiter__() - counter = 0 - with ManagedEventLoop() as runner: - while True: - if counter == max_urls: - break - try: - yield runner.run_until_complete(async_next(async_url_gen)) - except StopAsyncIteration: - break - counter += 1 - - -@dataclass -class RSSFeed(URLSource): - async def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - session = await session_handler.get_session() - async with session.get(self.url, headers=self._request_header) as response: - html = await response.text() - rss_feed = feedparser.parse(html) - if exception := rss_feed.get("bozo_exception"): - basic_logger.warning(f"Warning! 
Couldn't parse rss feed '{self.url}' because of {exception}") - return - else: - for url in (entry["link"] for entry in rss_feed["entries"]): - yield url - - -@dataclass -class Sitemap(URLSource): - recursive: bool = True - reverse: bool = False - sitemap_filter: URLFilter = lambda url: not bool(url) - - _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor() - _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") - _url_selector: ClassVar[XPath] = CSSSelector("url > loc") - - async def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - async def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: - session = await session_handler.get_session() - if not validators.url(sitemap_url): - basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") - async with session.get(url=sitemap_url, headers=self._request_header) as response: - try: - response.raise_for_status() - except (HTTPError, ClientError, HttpProcessingError) as error: - basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") - return - content = await response.content.read() - if response.content_type in self._decompressor.supported_file_formats: - content = self._decompressor.decompress(content, response.content_type) - if not content: - basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'") - return - tree = lxml.html.fromstring(content) - urls = [node.text_content() for node in self._url_selector(tree)] - if urls: - for new_url in reversed(urls) if self.reverse else urls: - yield new_url - elif self.recursive: - sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)] - filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs)) - for loc in reversed(filtered_locs) if self.reverse else filtered_locs: - async for new_url in yield_recursive(loc): - yield new_url - - async for url in yield_recursive(self.url): - yield url - - -@dataclass -class NewsMap(Sitemap): - pass +from fundus.scraping.session import session_handler +from fundus.scraping.url import URLSource @dataclass(frozen=True) @@ -274,93 +37,94 @@ class HTML: responded_url: str content: str crawl_date: datetime - source: "HTMLSource" + source: "SourceInfo" @dataclass(frozen=True) -class HTMLSource: +class SourceInfo: publisher: str @dataclass(frozen=True) -class WarcSource(HTMLSource): +class WarcSourceInfo(SourceInfo): warc_path: str warc_headers: Dict[str, str] http_headers: Dict[str, str] @dataclass(frozen=True) -class WebSource(HTMLSource): +class WebSourceInfo(SourceInfo): type: str url: str -class FundusSource: +class HTMLSource(Protocol): + @abstractmethod + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + ... 
+ + +class WebSource: def __init__( self, - url_source: Union[URLSource, Iterable[str]], + url_source: Iterable[str], publisher: str, url_filter: Optional[URLFilter] = None, request_header: Optional[Dict[str, str]] = None, + delay: Optional[Delay] = None, ): - self.url_source: Union[URLSource, AsyncIterator[str]] - if isinstance(url_source, URLSource): - self.url_source = url_source - else: - self.url_source = make_iterable_async(url_source) + self.url_source = url_source self.publisher = publisher self.url_filter = url_filter self.request_header = request_header or _default_header if isinstance(url_source, URLSource): url_source.set_header(self.request_header) + self.delay = delay - async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[HTML]]: + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + ( [url_filter] if url_filter else [] ) + timestamp = time.time() + def filter_url(u: str) -> bool: return any(f(u) for f in combined_filters) - async for url in self.url_source: + for url in self.url_source: if not validators.url(url): basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed") - yield None continue if filter_url(url): basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter") - yield None continue - session = await session_handler.get_session() + session = session_handler.get_session() try: - async with session.get(url, headers=self.request_header) as response: - if filter_url(str(response.url)): - basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") - yield None - continue - html = await response.text() - response.raise_for_status() - - except (HTTPError, ClientError, HttpProcessingError, UnicodeError) as error: + response = session.get(url, headers=self.request_header) + + except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") - yield None continue except Exception as error: basic_logger.warning(f"Warning! 
Skipped requested URL '{url}' because of an unexpected error {error}") - yield None continue + if filter_url(str(response.url)): + basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") + continue + html = response.text + if response.history: basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") source = ( - WebSource(self.publisher, type(self.url_source).__name__, self.url_source.url) + WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) if isinstance(self.url_source, URLSource) - else HTMLSource(self.publisher) + else SourceInfo(self.publisher) ) yield HTML( @@ -370,3 +134,88 @@ def filter_url(u: str) -> bool: crawl_date=datetime.now(), source=source, ) + + if self.delay: + time.sleep(max(0.0, self.delay() - time.time() + timestamp)) + timestamp = time.time() + + +class CCNewsSource: + def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): + self.publishers = publishers + self.warc_path = warc_path + self.headers = headers or _default_header + + self._publisher_mapping: Dict[str, PublisherEnum] = { + urlparse(publisher.domain).netloc: publisher for publisher in publishers + } + + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + def extract_content(record: WarcRecord) -> Optional[str]: + warc_body: bytes = record.reader.read() + + try: + return str(warc_body, encoding=record.http_charset) + except (UnicodeDecodeError, TypeError): + encoding: Optional[str] = chardet.detect(warc_body)["encoding"] + + if encoding is not None: + basic_logger.debug( + f"Trying to decode record {record.record_id!r} from {target_url!r} " + f"using detected encoding {encoding}." + ) + + try: + return str(warc_body, encoding=encoding) + except UnicodeDecodeError: + basic_logger.warning( + f"Couldn't decode record {record.record_id!r} from {target_url!r} with " + f"original charset {record.http_charset!r} using detected charset {encoding!r}." + ) + else: + basic_logger.warning( + f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} " + f"with invalid original charset {record.http_charset!r}." 
+ ) + + return None + + with requests.Session() as session: + stream = session.get(self.warc_path, stream=True, headers=self.headers).raw + + for warc_record in ArchiveIterator(stream, record_types=WarcRecordType.response, verify_digests=True): + target_url = str(warc_record.headers["WARC-Target-URI"]) + + if url_filter is not None and url_filter(target_url): + basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter") + continue + + publisher_domain: str = urlparse(target_url).netloc + + if publisher_domain not in self._publisher_mapping: + continue + + publisher = self._publisher_mapping[publisher_domain] + + if publisher.url_filter is not None and publisher.url_filter(target_url): + basic_logger.debug( + f"Skipped WARC record with target URI {target_url!r} because of " + f"publisher specific URL filter" + ) + continue + + if (content := extract_content(warc_record)) is None: + continue + + yield HTML( + requested_url=target_url, + responded_url=target_url, + content=content, + crawl_date=warc_record.record_date, + source=WarcSourceInfo( + publisher=publisher.publisher_name, + warc_path=self.warc_path, + warc_headers=dict(warc_record.headers), + http_headers=dict(warc_record.http_headers), + ), + ) diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index 568fb97a9..c71fe86a7 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -1,171 +1,163 @@ -import asyncio -import time +from __future__ import annotations + +import gzip +import os +import re +from abc import abstractmethod +from datetime import datetime +from functools import lru_cache, partial, wraps +from multiprocessing import Manager +from multiprocessing.context import TimeoutError +from multiprocessing.pool import MapResult, Pool, ThreadPool +from queue import Empty, Queue from typing import ( - AsyncIterator, + Any, + Callable, + Dict, + Generic, Iterator, List, Literal, Optional, - Protocol, + Pattern, Set, Tuple, Type, + TypeVar, Union, - runtime_checkable, + cast, ) -import aioitertools +import dill import more_itertools +import requests +from dateutil.rrule import MONTHLY, rrule +from tqdm import tqdm +from typing_extensions import ParamSpec from fundus import PublisherCollection -from fundus.logging import basic_logger from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article +from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter -from fundus.scraping.html import URLSource, session_handler -from fundus.scraping.scraper import Scraper -from fundus.utils.more_async import ManagedEventLoop, async_next +from fundus.scraping.html import CCNewsSource +from fundus.scraping.scraper import CCNewsScraper, Scraper +from fundus.scraping.url import URLSource +_T = TypeVar("_T") +_P = ParamSpec("_P") -@runtime_checkable -class Delay(Protocol): - """Protocol to define crawl delays between batches.""" - def __call__(self) -> float: - """Yields a float specifying the minimum crawler delay for the current article batch in seconds. +# noinspection PyPep8Naming +class dill_wrapper(Generic[_P, _T]): + def __init__(self, target: Callable[_P, _T]): + """Wraps function in dill serialization. - The effective delay does include crawling execution time between batches, - i.e. the effective delay is max(execution_time, delay). - - Examples: - >>> import random - >>> delay: Delay = lambda: random.random() - Will use a random delay in [0, 1) seconds. 
- - Returns: - float: The delay time in seconds. + This is in order to use unpickable functions within multiprocessing. + Args: + target: The function to wrap. """ - ... + self._serialized_target: bytes = dill.dumps(target) + @lru_cache + def _deserialize(self) -> Callable[_P, _T]: + return cast(Callable[_P, _T], dill.loads(self._serialized_target)) -class BaseCrawler: - def __init__(self, *scrapers: Scraper): - """Basic crawler to utilize scrapers. + def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: + return self._deserialize()(*args, **kwargs) - Because scrapers are implemented asynchronously, this class handles the necessary event loops - and program logic to download articles in batches asynchronously. - Args: - *scrapers (Scraper): The scrapers which should be used. - """ - self.scrapers: Tuple[Scraper, ...] = scrapers +def queue_wrapper(queue: Queue[_T], target: Callable[_P, Iterator[_T]]) -> Callable[_P, None]: + """Wraps the target callable to add its results to the queue instead of returning them directly. - async def crawl_async( - self, - max_articles: Optional[int] = None, - error_handling: Literal["suppress", "catch", "raise"] = "suppress", - only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - delay: Optional[Union[float, Delay]] = None, - url_filter: Optional[URLFilter] = None, - only_unique: bool = True, - ) -> AsyncIterator[Article]: - """Async variant of the crawl() method. + Args: + queue: The buffer queue. + target: A target callable. - See docstring for crawl(). for detailed information about the parameters. + Returns: + (Callable[_P, None]) The wrapped target. + """ - Args: - max_articles (Optional[int]): Number of articles to crawl. Defaults to None. - error_handling (Literal["suppress", "catch", "raise"]): Set error handling. Defaults to "suppress". - only_complete (Union[bool, ExtractionFilter]): Set extraction filters. Defaults to - Requires("title", "body", "publishing_date"). - delay (Optional[Union[float, Delay]]): Set delay time between article batches. Defaults to None. - url_filter (Optional[URLFilter]): Set URLFilter. Defaults to None. - only_unique (bool): If true return only unique responses. Defaults to True. + @wraps(target) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + for obj in target(*args, **kwargs): + queue.put(obj) - Returns: - AsyncIterator[Article]: An iterator yielding objects of type Article. - """ + return wrapper - response_cache: Set[str] = set() - def build_extraction_filter() -> Optional[ExtractionFilter]: - if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) - else: - return only_complete +def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: + """Utility function to iterate exhaustively over a pool queue. - def build_delay() -> Optional[Delay]: - if isinstance(delay, float): + The underlying iterator of this function repeatedly exhausts the given queue. + Then, if the queue is empty only if all the pool's jobs have finished, the iterator reruns. + Otherwise, it waits for the queue to be populated with the next result from the pool. - def constant_delay() -> float: - return delay # type: ignore[return-value] - - return constant_delay - else: - return delay + Args: + handle (MapResult[Any]): A handle o the MappedResult of the underling multiprocessing pool. 
+ queue (Queue[_T]): The pool queue. - def build_url_filter() -> URLFilter: - def _filter(url: str) -> bool: - return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache) + Returns: + Iterator[_T]: The iterator over the queue as it is populated. + """ + while True: + try: + yield queue.get(timeout=0.1) + except Empty: + try: + handle.get(timeout=0.1) + except TimeoutError: + continue + return - return _filter - final_delay = build_delay() +class BaseCrawler: + def __init__( + self, + pool_factory: Type[Pool], + processes: int, + args: Tuple[_T, ...], + kwargs: Dict[str, Any], + ): + self._pool_factory = pool_factory + self._args = args + self._kwargs = kwargs + self.processes = os.cpu_count() or 0 if processes == -1 else processes + + @abstractmethod + def _fetch_articles(self, *args, **kwargs) -> Iterator[Article]: + raise NotImplementedError + + @staticmethod + def _single_crawl(args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]]) -> Iterator[Article]: + for arg in args: + yield from article_task(arg) + + def _parallel_crawl( + self, args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]] + ) -> Iterator[Article]: + # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually + # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all + # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. + with Manager() as manager, self._pool_factory(processes=min(self.processes, len(args))) as pool: + article_queue: Queue[Article] = manager.Queue() - async_article_iterators: List[AsyncIterator[Optional[Article]]] = [ - scraper.scrape( - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=build_url_filter(), - ) - for scraper in self.scrapers - ] + # Because multiprocessing.Pool does not support iterators as targets, + # we wrap the article_task to write the articles to a queue instead of returning them directly. + wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - # we use this custom variant of interleave_longest in order to be able - # to delay the program flow between batches - async def _async_article_interleave_longest() -> AsyncIterator[Article]: - batches: AsyncIterator[Tuple[Optional[Article], ...]] = aioitertools.itertools.zip_longest( - *async_article_iterators - ) - start_time = time.time() - async for batch in batches: - basic_logger.debug(f"Batch took {time.time() - start_time} seconds") - for next_article in batch: - if next_article is not None: - response_cache.add(next_article.html.responded_url) - yield next_article - if final_delay: - await asyncio.sleep(max(0.0, final_delay() - time.time() + start_time)) - start_time = time.time() + # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. + serialized_article_task = dill_wrapper(wrapped_article_task) - if max_articles is None: - max_articles = -1 - elif max_articles == 0: - return - - try: - async for article_index, article in aioitertools.builtins.enumerate( - _async_article_interleave_longest(), start=1 - ): - yield article - if article_index == max_articles: - break - finally: - await session_handler.close_current_session() + # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. 
+ yield from pool_queue_iter(pool.map_async(serialized_article_task, args), article_queue) def crawl( self, max_articles: Optional[int] = None, error_handling: Literal["suppress", "catch", "raise"] = "suppress", only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - delay: Optional[Union[float, Delay]] = 0.1, url_filter: Optional[URLFilter] = None, only_unique: bool = True, ) -> Iterator[Article]: @@ -185,7 +177,7 @@ def crawl( protocol as an extraction filter or use a boolean. If False, all articles will be yielded, if True, only those with all attributes extracted. Defaults to ExtractionFilter letting through all articles with at least title, body, and publishing_date set. - delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article + delay (Optional[Union[float, fundus.scraping.delay.Delay]]): Set a delay time in seconds to be used between article batches. You can set a delay directly using float or any callable satisfying the Delay protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. @@ -198,21 +190,45 @@ def crawl( Iterator[Article]: An iterator yielding objects of type Article. """ - async_article_iter = self.crawl_async( - max_articles=max_articles, + if max_articles == 0: + return + + if max_articles is None: + max_articles = -1 + + def build_extraction_filter() -> Optional[ExtractionFilter]: + if isinstance(only_complete, bool): + return ( + None + if only_complete is False + else lambda extracted: not all( + bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() + ) + ) + else: + return only_complete + + response_cache: Set[str] = set() + + article_task: Callable[[str], Iterator[Article]] = partial( + self._fetch_articles, error_handling=error_handling, - only_complete=only_complete, - delay=delay, + extraction_filter=build_extraction_filter(), url_filter=url_filter, - only_unique=only_unique, + **self._kwargs, ) - with ManagedEventLoop() as runner: - while True: - try: - yield runner.run_until_complete(async_next(async_article_iter)) - except StopAsyncIteration: - break + if self.processes == 0: + article_iter = self._single_crawl(self._args, article_task) + else: + article_iter = self._parallel_crawl(self._args, article_task) + + for article_idx, article in enumerate(article_iter, start=1): + if not only_unique or article.html.responded_url not in response_cache: + response_cache.add(article.html.responded_url) + yield article + if article_idx == max_articles: + break class Crawler(BaseCrawler): @@ -220,6 +236,7 @@ def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, + delay: Optional[Union[float, Delay]] = 0.1, ): """Fundus base class for crawling articles from the web. @@ -235,28 +252,137 @@ def __init__( restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher specs. If set, only articles from given source types will be yielded. + delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article + downloads. You can set a delay directly using float or any callable satisfying the Delay + protocol. If set to None, no delay will be used between batches. See Delay for more + information. Defaults to None. 
""" if not publishers: raise ValueError("param of has to be non empty") - collapsed_publishers = more_itertools.collapse(publishers) - - # build scraper - scrapers: List[Scraper] = [] - for spec in collapsed_publishers: - if restrict_sources_to: - sources = tuple( - more_itertools.flatten(spec.source_mapping[source_type] for source_type in restrict_sources_to) - ) + collapsed_publishers = tuple(more_itertools.collapse(publishers)) + + def build_delay() -> Optional[Delay]: + if isinstance(delay, float): + + def constant_delay() -> float: + return delay # type: ignore[return-value] + + return constant_delay else: - sources = tuple(more_itertools.flatten(spec.source_mapping.values())) + return delay - if sources: - scrapers.append( - Scraper( - *sources, - parser=spec.parser, - ) - ) + super().__init__( + pool_factory=ThreadPool, + processes=len(collapsed_publishers), + args=collapsed_publishers, + kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, + ) + + @staticmethod + def _fetch_articles( + publisher: PublisherEnum, + error_handling: Literal["suppress", "catch", "raise"], + delay: Optional[Delay] = None, + restrict_sources_to: Optional[List[Type[URLSource]]] = None, + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: + scraper = Scraper(publisher, restrict_sources_to, delay) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) + + +class CCNewsCrawler(BaseCrawler): + def __init__( + self, + *publishers: PublisherEnum, + start: datetime = datetime(2016, 8, 1), + end: datetime = datetime.now(), + processes: int = -1, + server_address: str = "https://data.commoncrawl.org/", + ): + """Initializes a crawler for the CC-NEWS dataset. + + Args: + *publishers: The publishers to crawl. + processes: Number of additional process to use for crawling. + If -1, the number of processes is set to `os.cpu_count()`. + If `os.cpu_count()` is not available, the number of processes is set to 0. + If 0, only the main process is used. Defaults to -1. + server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
+ """ + + collapsed_publishers = tuple(more_itertools.collapse(publishers)) + processes = os.cpu_count() or 0 if processes == -1 else processes + warc_paths = tuple( + self._get_warc_paths(start=start, end=end, processes=processes, server_address=server_address) + ) + + super().__init__( + pool_factory=Pool, processes=processes, args=warc_paths, kwargs={"publishers": collapsed_publishers} + ) + + @staticmethod + def _fetch_articles( + warc_path: str, + publishers: Tuple[PublisherEnum, ...], + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: + source = CCNewsSource(*publishers, warc_path=warc_path) + scraper = CCNewsScraper(source) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) - super().__init__(*scrapers) + def _get_warc_paths( + self, start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" + ) -> List[str]: + # Date regex examples: https://regex101.com/r/yDX3G6/1 + date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") + + if start >= end: + raise ValueError("Start date has to be < end date.") + + if start < datetime(2016, 8, 1): + raise ValueError("The default, and earliest possible, start date is 2016/08/01.") + + if end > datetime.now(): + raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") + + date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) + urls: List[str] = [ + f"{server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence + ] + + with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: + + def load_paths(url: str) -> List[str]: + with requests.Session() as session: + paths = gzip.decompress(session.get(url).content).decode("utf-8").split() + bar.update() + return paths + + if processes == 0: + nested_warc_paths = [load_paths(url) for url in urls] + else: + # use two threads per process, default two threads per core + max_number_of_threads = processes * 2 + + with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: + nested_warc_paths = pool.map(load_paths, urls) + + warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) + + start_strf = start.strftime("%Y%m%d%H%M%S") + end_strf = end.strftime("%Y%m%d%H%M%S") + + def filter_warc_path_by_date(path: str) -> bool: + match: Optional[re.Match[str]] = date_pattern.search(path) + if match is None: + raise AssertionError(f"Invalid WARC path {path!r}") + return start_strf <= match["date"] <= end_strf + + return sorted( + (f"{server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), + reverse=True, + ) diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 004f447ce..8cde4603a 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -1,55 +1,34 @@ -from typing import AsyncIterator, Literal, Optional +from typing import Dict, Iterator, List, Literal, Optional, Type import more_itertools from fundus.logging import basic_logger from fundus.parser import ParserProxy +from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter -from fundus.scraping.html import FundusSource +from fundus.scraping.delay import Delay +from fundus.scraping.filter import ExtractionFilter, 
URLFilter +from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource +from fundus.scraping.url import URLSource -class Scraper: - def __init__(self, *sources: FundusSource, parser: ParserProxy): - self.sources = list(sources) +class BaseScraper: + def __init__(self, *sources: HTMLSource, parser_mapping: Dict[str, ParserProxy]): + self.sources = sources + self.parser_mapping = parser_mapping - if not parser: - raise ValueError(f"the given parser {type(parser).__name__} is empty") - - self.parser = parser - - async def scrape( + def scrape( self, error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter] = None, url_filter: Optional[URLFilter] = None, - ) -> AsyncIterator[Optional[Article]]: - # TODO: add docstring; especially explain why returned Article is Optional - if isinstance(extraction_filter, Requires): - supported_attributes = set( - more_itertools.flatten(collection.names for collection in self.parser.attribute_mapping.values()) - ) - if missing_attributes := extraction_filter.required_attributes - supported_attributes: - if len(missing_attributes) == 1: - basic_logger.warning( - f"The required attribute `{missing_attributes}` " - f"is not supported by {type(self.parser).__name__}. Skipping Scraper" - ) - else: - basic_logger.warning( - f"The required attributes `{', '.join(missing_attributes)}` " - f"are not supported by {type(self.parser).__name__}. Skipping Scraper" - ) + ) -> Iterator[Article]: + for source in self.sources: + for html in source.fetch(url_filter=url_filter): + parser = self.parser_mapping[html.source.publisher] - return - - for html_source in self.sources: - async for html in html_source.fetch(url_filter=url_filter): - if html is None: - yield None - continue try: - extraction = self.parser(html.crawl_date).parse(html.content, error_handling) + extraction = parser(html.crawl_date).parse(html.content, error_handling) except Exception as err: if error_handling == "raise": @@ -59,16 +38,49 @@ async def scrape( raise err elif error_handling == "catch": yield Article(html=html, exception=err) - continue elif error_handling == "suppress": basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}") - yield None else: raise ValueError(f"Unknown value '{error_handling}' for parameter '") - if extraction_filter and extraction_filter(extraction): - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") - yield None else: - article = Article.from_extracted(html=html, extracted=extraction) - yield article + if extraction_filter and extraction_filter(extraction): + basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") + else: + article = Article.from_extracted(html=html, extracted=extraction) + yield article + + +class Scraper(BaseScraper): + def __init__( + self, + publisher: PublisherEnum, + restrict_sources_to: Optional[List[Type[URLSource]]] = None, + delay: Optional[Delay] = None, + ): + if restrict_sources_to: + url_sources = tuple( + more_itertools.flatten(publisher.source_mapping[source_type] for source_type in restrict_sources_to) + ) + else: + url_sources = tuple(more_itertools.flatten(publisher.source_mapping.values())) + + html_sources = [ + WebSource( + url_source=url_source, + publisher=publisher.publisher_name, + request_header=publisher.request_header, + delay=delay, + ) + for url_source in url_sources + ] + parser_mapping = {publisher.publisher_name: publisher.parser} + super().__init__(*html_sources, 
parser_mapping=parser_mapping) + + +class CCNewsScraper(BaseScraper): + def __init__(self, source: CCNewsSource): + parser_mapping: Dict[str, ParserProxy] = { + publisher.publisher_name: publisher.parser for publisher in source.publishers + } + super().__init__(source, parser_mapping=parser_mapping) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py new file mode 100644 index 000000000..e1f010c82 --- /dev/null +++ b/src/fundus/scraping/session.py @@ -0,0 +1,108 @@ +from typing import Optional + +import requests + +from fundus.logging import basic_logger + +_default_header = {"user-agent": "Fundus"} + + +class SessionHandler: + """Object for handling project global aiohttp.ClientSessions + + The session life cycle consists of three steps which can be repeated indefinitely: + Build, Supply, Teardown. + Initially there is no session build within the session handler. When a session is requested + with get_session() either a new one is created with _session_factory() or the session handler's + existing one returned. Every subsequent call to get_session() will return the same + aiohttp.ClientSession object. If close_current_session() is called, the current session will be + tear-downed and the next call to get_session() will build a new session. + """ + + def __init__(self): + self._session: Optional[requests.Session] = None + + @staticmethod + def _session_factory() -> requests.Session: + """Builds a new Session + + This returns a new client session build from pre-defined configurations: + - pool_connections: 50 + - pool_maxsize: 50 + - hooks = {'request': lambda request:} + + Returns: + An new ClientSession + """ + + # timings: Dict[Optional[str], float] = dict() + # + # async def on_request_start( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams + # ): + # timings[params.url.host] = time.time() + # + # async def on_request_end( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams + # ): + # assert params.url.host + # history = params.response.history + # previous_status_codes = [f"({response.status})" for response in history] if history else [] + # status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) + # basic_logger.debug( + # f"{status_code_chain} <{params.method} {params.url!r}> " + # f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" + # ) + # + # async def on_request_exception( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams + # ): + # basic_logger.debug( + # f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" + # ) + # + # trace_config = aiohttp.TraceConfig() + # trace_config.on_request_start.append(on_request_start) + # trace_config.on_request_end.append(on_request_end) + # trace_config.on_request_exception.append(on_request_exception) + + session = requests.Session() + + # hooks + hooks = {"response": lambda response, *args, **kwargs: response.raise_for_status()} + session.hooks = hooks + + # adapters + adapter_kwargs = {"pool_connections": 50, "pool_maxsize": 50} + session.mount("http://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + session.mount("https://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + + return session + + def get_session(self) -> requests.Session: + """Requests the current build session + + If called for the first time or after 
close_current_session was called, + this function will build a new session. Every subsequent call will return + the same session object until the session is closed with close_current_session(). + + Returns: + requests.Session: The current build session + """ + if not self._session: + self._session = self._session_factory() + return self._session + + def close_current_session(self) -> None: + """Tears down the current build session + + Returns: + None + """ + session = self.get_session() + basic_logger.debug(f"Close session {session}") + session.close() + self._session = None + + +session_handler = SessionHandler() diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py new file mode 100644 index 000000000..5eb414ea7 --- /dev/null +++ b/src/fundus/scraping/url.py @@ -0,0 +1,139 @@ +import gzip +import itertools +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import cached_property +from typing import ( + AsyncIterator, + Callable, + ClassVar, + Dict, + Iterable, + Iterator, + List, + Optional, +) + +import feedparser +import lxml.html +import validators +from lxml.cssselect import CSSSelector +from lxml.etree import XPath +from requests import ConnectionError, HTTPError + +from fundus.logging import basic_logger +from fundus.scraping.filter import URLFilter, inverse +from fundus.scraping.session import _default_header, session_handler + + +class _ArchiveDecompressor: + def __init__(self): + self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = {"application/x-gzip": self._decompress_gzip} + + @staticmethod + def _decompress_gzip(compressed_content: bytes) -> bytes: + decompressed_content = gzip.decompress(compressed_content) + return decompressed_content + + def decompress(self, content: bytes, file_format: "str") -> bytes: + decompress_function = self.archive_mapping[file_format] + return decompress_function(content) + + @cached_property + def supported_file_formats(self) -> List[str]: + return list(self.archive_mapping.keys()) + + +@dataclass +class URLSource(Iterable[str], ABC): + url: str + + _request_header: Dict[str, str] = field(default_factory=dict) + + def __post_init__(self): + if not self._request_header: + self._request_header = _default_header + if not validators.url(self.url): + raise ValueError(f"Invalid url '{self.url}'") + + def set_header(self, request_header: Dict[str, str]) -> None: + self._request_header = request_header + + @abstractmethod + def __iter__(self) -> Iterator[str]: + raise NotImplemented + + def get_urls(self, max_urls: Optional[int] = None) -> Iterator[str]: + """Returns a generator yielding up to URLs from . + + + Args: + max_urls (int): Number of max URLs to return. Set value is + an upper bound and not necessarily the actual number of + URLs. If set to None, the source will be exhausted until + StopIteration is hit. Defaults to None. + + Yields: + str: The next URL. + """ + return itertools.islice(self, max_urls) + + +@dataclass +class RSSFeed(URLSource): + def __iter__(self) -> Iterator[str]: + session = session_handler.get_session() + response = session.get(self.url, headers=self._request_header) + html = response.text + rss_feed = feedparser.parse(html) + if exception := rss_feed.get("bozo_exception"): + basic_logger.warning(f"Warning! 
Couldn't parse rss feed '{self.url}' because of {exception}") + return + else: + for url in (entry["link"] for entry in rss_feed["entries"]): + yield url + + +@dataclass +class Sitemap(URLSource): + recursive: bool = True + reverse: bool = False + sitemap_filter: URLFilter = lambda url: not bool(url) + + _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor() + _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") + _url_selector: ClassVar[XPath] = CSSSelector("url > loc") + + def __iter__(self) -> AsyncIterator[str]: + def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: + session = session_handler.get_session() + if not validators.url(sitemap_url): + basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") + try: + response = session.get(url=sitemap_url, headers=self._request_header) + except (HTTPError, ConnectionError) as error: + basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") + return + content = response.content + if (content_type := response.headers["content-type"]) in self._decompressor.supported_file_formats: + content = self._decompressor.decompress(content, content_type) + if not content: + basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'") + return + tree = lxml.html.fromstring(content) + urls = [node.text_content() for node in self._url_selector(tree)] + if urls: + for new_url in reversed(urls) if self.reverse else urls: + yield new_url + elif self.recursive: + sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)] + filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs)) + for loc in reversed(filtered_locs) if self.reverse else filtered_locs: + yield from yield_recursive(loc) + + yield from yield_recursive(self.url) + + +@dataclass +class NewsMap(Sitemap): + pass From 60737fc9dcba2e867a5fc2fb6fbaaeb0a2fbf20a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 16:18:02 +0100 Subject: [PATCH 02/36] bug fixes --- src/fundus/scraping/html.py | 48 ++++++++++++++++++--------------- src/fundus/scraping/pipeline.py | 18 +++++++++---- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index ee587f503..00725d12f 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -107,33 +107,39 @@ def filter_url(u: str) -> bool: except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") + if isinstance(error, HTTPError) and error.response.status_code >= 500: + return continue + except ConnectionError as error: + basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") + except Exception as error: basic_logger.warning(f"Warning! 
Skipped requested URL '{url}' because of an unexpected error {error}") continue - if filter_url(str(response.url)): - basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") - continue - html = response.text - - if response.history: - basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") - - source = ( - WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) - if isinstance(self.url_source, URLSource) - else SourceInfo(self.publisher) - ) - - yield HTML( - requested_url=url, - responded_url=str(response.url), - content=html, - crawl_date=datetime.now(), - source=source, - ) + else: + if filter_url(str(response.url)): + basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") + continue + html = response.text + + if response.history: + basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") + + source = ( + WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) + if isinstance(self.url_source, URLSource) + else SourceInfo(self.publisher) + ) + + yield HTML( + requested_url=url, + responded_url=str(response.url), + content=html, + crawl_date=datetime.now(), + source=source, + ) if self.delay: time.sleep(max(0.0, self.delay() - time.time() + timestamp)) diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index c71fe86a7..71e3e4ee0 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -223,9 +223,11 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: else: article_iter = self._parallel_crawl(self._args, article_task) - for article_idx, article in enumerate(article_iter, start=1): + article_idx = 0 + for article in article_iter: if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) + article_idx += 1 yield article if article_idx == max_articles: break @@ -236,7 +238,8 @@ def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, - delay: Optional[Union[float, Delay]] = 0.1, + delay: Optional[Union[float, Delay]] = 1., + threading: bool = True, ): """Fundus base class for crawling articles from the web. 
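For reference, a minimal sketch of how the revised constructor is meant to be called once this change lands; the publisher selection, source restriction, and article limit are arbitrary example values, while `delay` and `threading` mirror the defaults introduced in this hunk.

````python
from fundus import Crawler, PublisherCollection
from fundus.scraping.url import NewsMap

# One thread per publisher (threading=True) with a one-second pause between
# article downloads of the same publisher (delay=1.0).
crawler = Crawler(
    PublisherCollection.us,
    restrict_sources_to=[NewsMap],
    delay=1.0,
    threading=True,
)

for article in crawler.crawl(max_articles=5):
    print(article)
````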
@@ -269,12 +272,16 @@ def constant_delay() -> float: return delay # type: ignore[return-value] return constant_delay - else: + + elif isinstance(delay, Delay): return delay + else: + raise TypeError("param of ") + super().__init__( pool_factory=ThreadPool, - processes=len(collapsed_publishers), + processes=len(collapsed_publishers) if threading else 0, args=collapsed_publishers, kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, ) @@ -334,8 +341,9 @@ def _fetch_articles( scraper = CCNewsScraper(source) yield from scraper.scrape(error_handling, extraction_filter, url_filter) + @staticmethod def _get_warc_paths( - self, start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" + start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" ) -> List[str]: # Date regex examples: https://regex101.com/r/yDX3G6/1 date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") From a5be52276a8c4ee8bff89b055ff77eaaa1ebda77 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 18:51:52 +0100 Subject: [PATCH 03/36] add request logging --- src/fundus/scraping/session.py | 44 +++++++++------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index e1f010c82..f6c43a68c 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,6 +1,6 @@ from typing import Optional -import requests +import requests.adapters from fundus.logging import basic_logger @@ -35,41 +35,19 @@ def _session_factory() -> requests.Session: An new ClientSession """ - # timings: Dict[Optional[str], float] = dict() - # - # async def on_request_start( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams - # ): - # timings[params.url.host] = time.time() - # - # async def on_request_end( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams - # ): - # assert params.url.host - # history = params.response.history - # previous_status_codes = [f"({response.status})" for response in history] if history else [] - # status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) - # basic_logger.debug( - # f"{status_code_chain} <{params.method} {params.url!r}> " - # f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" - # ) - # - # async def on_request_exception( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams - # ): - # basic_logger.debug( - # f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" - # ) - # - # trace_config = aiohttp.TraceConfig() - # trace_config.on_request_start.append(on_request_start) - # trace_config.on_request_end.append(on_request_end) - # trace_config.on_request_exception.append(on_request_exception) - session = requests.Session() + def _response_log(response: requests.Response, *args, **kwargs) -> None: + history = response.history + previous_status_codes = [f"({response.status_code})" for response in history] if history else [] + status_code_chain = " -> ".join(previous_status_codes + [f"({response.status_code})"]) + basic_logger.debug( + f"{status_code_chain} <{response.request.method} {response.url!r}> " + f"took {response.elapsed.total_seconds()} second(s)" + ) + # hooks - hooks = {"response": 
lambda response, *args, **kwargs: response.raise_for_status()} + hooks = {"response": [lambda response, *args, **kwargs: response.raise_for_status(), _response_log]} session.hooks = hooks # adapters From 6fc5f90468786d97656f101e901669508ff781a3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 5 Feb 2024 14:18:58 +0100 Subject: [PATCH 04/36] fix imports --- src/fundus/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index fb5e46405..7c5cd5bbe 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -4,7 +4,7 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler -from fundus.scraping.url import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap, NewsMap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] @@ -15,6 +15,9 @@ "CCNewsCrawler", "PublisherCollection", "Requires", + "RSSFeed", + "Sitemap", + "NewsMap" ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, From c70b36565294b7e7507627cac1642ce11eab312d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:04:26 +0100 Subject: [PATCH 05/36] finish Pool based crawler implementation --- src/fundus/__init__.py | 8 +- src/fundus/scraping/pipeline.py | 232 +++++++++++++++------------ src/fundus/scraping/scraper.py | 2 +- src/fundus/scraping/session.py | 32 ++-- src/fundus/scraping/url.py | 4 +- tests/fixtures/fixture_collection.py | 2 +- tests/test_collection.py | 4 +- tests/test_pipeline.py | 26 ++- 8 files changed, 168 insertions(+), 142 deletions(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 7c5cd5bbe..f5e5e45ff 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,21 +3,21 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires -from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler -from fundus.scraping.url import RSSFeed, Sitemap, NewsMap +from fundus.scraping.pipeline import CrawlerBase, Crawler, CCNewsCrawler +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] __all__ = [ + "CrawlerBase", "Crawler", - "BaseCrawler", "CCNewsCrawler", "PublisherCollection", "Requires", "RSSFeed", "Sitemap", - "NewsMap" + "NewsMap", ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index 71e3e4ee0..bbb6a7bff 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -3,7 +3,7 @@ import gzip import os import re -from abc import abstractmethod +from abc import abstractmethod, ABC from datetime import datetime from functools import lru_cache, partial, wraps from multiprocessing import Manager @@ -13,7 +13,6 @@ from typing import ( Any, Callable, - Dict, Generic, Iterator, List, @@ -41,7 +40,7 @@ from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter from fundus.scraping.html import CCNewsSource -from fundus.scraping.scraper import CCNewsScraper, Scraper +from fundus.scraping.scraper import CCNewsScraper, WebScraper from fundus.scraping.url import URLSource _T = TypeVar("_T") @@ -112,46 +111,15 @@ def 
pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: return -class BaseCrawler: - def __init__( - self, - pool_factory: Type[Pool], - processes: int, - args: Tuple[_T, ...], - kwargs: Dict[str, Any], - ): - self._pool_factory = pool_factory - self._args = args - self._kwargs = kwargs - self.processes = os.cpu_count() or 0 if processes == -1 else processes - +class CrawlerBase(ABC): @abstractmethod - def _fetch_articles(self, *args, **kwargs) -> Iterator[Article]: - raise NotImplementedError - - @staticmethod - def _single_crawl(args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]]) -> Iterator[Article]: - for arg in args: - yield from article_task(arg) - - def _parallel_crawl( - self, args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]] + def _build_article_iterator( + self, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], ) -> Iterator[Article]: - # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually - # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all - # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. - with Manager() as manager, self._pool_factory(processes=min(self.processes, len(args))) as pool: - article_queue: Queue[Article] = manager.Queue() - - # Because multiprocessing.Pool does not support iterators as targets, - # we wrap the article_task to write the articles to a queue instead of returning them directly. - wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - - # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. - serialized_article_task = dill_wrapper(wrapped_article_task) - - # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. - yield from pool_queue_iter(pool.map_async(serialized_article_task, args), article_queue) + raise NotImplementedError def crawl( self, @@ -177,10 +145,6 @@ def crawl( protocol as an extraction filter or use a boolean. If False, all articles will be yielded, if True, only those with all attributes extracted. Defaults to ExtractionFilter letting through all articles with at least title, body, and publishing_date set. - delay (Optional[Union[float, fundus.scraping.delay.Delay]]): Set a delay time in seconds to be used between article - batches. You can set a delay directly using float or any callable satisfying the Delay - protocol. If set to None, no delay will be used between batches. See Delay for more - information. Defaults to None. url_filter (Optional[URLFilter]): A callable object satisfying the URLFilter protocol to skip URLs before download. This filter applies on both requested and responded URL. Defaults to None. only_unique (bool): If set to True, articles yielded will be unique on the responded URL. 
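Taken together, the keyword arguments documented above combine as in the short sketch below; `skip_live_pages` is only an illustrative stand-in for any callable satisfying the `URLFilter` protocol, which returns `True` for URLs that should be skipped, and the publisher choice is arbitrary.

````python
from fundus import Crawler, PublisherCollection, Requires


def skip_live_pages(url: str) -> bool:
    # URLFilter protocol: True means the URL is dropped before download.
    return "live" in url


crawler = Crawler(PublisherCollection.de)

for article in crawler.crawl(
    max_articles=10,
    error_handling="suppress",
    only_complete=Requires("title", "body", "publishing_date"),
    url_filter=skip_live_pages,
    only_unique=True,
):
    print(article)
````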
@@ -210,21 +174,8 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() - article_task: Callable[[str], Iterator[Article]] = partial( - self._fetch_articles, - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=url_filter, - **self._kwargs, - ) - - if self.processes == 0: - article_iter = self._single_crawl(self._args, article_task) - else: - article_iter = self._parallel_crawl(self._args, article_task) - article_idx = 0 - for article in article_iter: + for article in self._build_article_iterator(error_handling, build_extraction_filter(), url_filter): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) article_idx += 1 @@ -233,12 +184,12 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: break -class Crawler(BaseCrawler): +class Crawler(CrawlerBase): def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, - delay: Optional[Union[float, Delay]] = 1., + delay: Optional[Union[float, Delay]] = 1.0, threading: bool = True, ): """Fundus base class for crawling articles from the web. @@ -263,43 +214,78 @@ def __init__( if not publishers: raise ValueError("param of has to be non empty") - collapsed_publishers = tuple(more_itertools.collapse(publishers)) + self.publishers = tuple(more_itertools.collapse(publishers)) + self.restrict_sources_to = restrict_sources_to + self.delay = delay + self.threading = threading + + def _fetch_articles( + self, + publisher: PublisherEnum, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: - if isinstance(delay, float): + if isinstance(self.delay, float): def constant_delay() -> float: - return delay # type: ignore[return-value] + return self.delay # type: ignore[return-value] return constant_delay - elif isinstance(delay, Delay): - return delay + elif isinstance(self.delay, Delay): + return self.delay else: raise TypeError("param of ") - super().__init__( - pool_factory=ThreadPool, - processes=len(collapsed_publishers) if threading else 0, - args=collapsed_publishers, - kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, - ) + scraper = WebScraper(publisher, self.restrict_sources_to, build_delay()) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) @staticmethod - def _fetch_articles( - publisher: PublisherEnum, + def _single_crawl( + publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] + ) -> Iterator[Article]: + article_iterators = [article_task(publisher) for publisher in publishers] + while article_iterators: + for iterator in article_iterators: + try: + yield next(iterator) + except StopIteration: + article_iterators.remove(iterator) + + @staticmethod + def _threaded_crawl( + publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] + ) -> Iterator[Article]: + article_queue: Queue[Article] = Queue() + wrapped_article_task = queue_wrapper(article_queue, article_task) + + with ThreadPool(processes=len(publishers) or None) as pool: + yield from pool_queue_iter(pool.map_async(wrapped_article_task, publishers), article_queue) + + def _build_article_iterator( + self, error_handling: Literal["suppress", 
"catch", "raise"], - delay: Optional[Delay] = None, - restrict_sources_to: Optional[List[Type[URLSource]]] = None, - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], ) -> Iterator[Article]: - scraper = Scraper(publisher, restrict_sources_to, delay) - yield from scraper.scrape(error_handling, extraction_filter, url_filter) + article_task = partial( + self._fetch_articles, + error_handling=error_handling, + extraction_filter=extraction_filter, + url_filter=url_filter, + ) + + if self.threading: + yield from self._threaded_crawl(self.publishers, article_task) + else: + yield from self._single_crawl(self.publishers, article_task) -class CCNewsCrawler(BaseCrawler): +class CCNewsCrawler(CrawlerBase): def __init__( self, *publishers: PublisherEnum, @@ -319,15 +305,11 @@ def __init__( server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. """ - collapsed_publishers = tuple(more_itertools.collapse(publishers)) - processes = os.cpu_count() or 0 if processes == -1 else processes - warc_paths = tuple( - self._get_warc_paths(start=start, end=end, processes=processes, server_address=server_address) - ) - - super().__init__( - pool_factory=Pool, processes=processes, args=warc_paths, kwargs={"publishers": collapsed_publishers} - ) + self.publishers = tuple(more_itertools.collapse(publishers)) + self.start = start + self.end = end + self.processes = os.cpu_count() or 0 if processes == -1 else processes + self.server_address = server_address @staticmethod def _fetch_articles( @@ -342,24 +324,47 @@ def _fetch_articles( yield from scraper.scrape(error_handling, extraction_filter, url_filter) @staticmethod - def _get_warc_paths( - start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" - ) -> List[str]: + def _single_crawl( + warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] + ) -> Iterator[Article]: + for warc_path in warc_paths: + yield from article_task(warc_path) + + def _parallel_crawl( + self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] + ) -> Iterator[Article]: + # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually + # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all + # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. + with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: + article_queue: Queue[Article] = manager.Queue() + + # Because multiprocessing.Pool does not support iterators as targets, + # we wrap the article_task to write the articles to a queue instead of returning them directly. + wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) + + # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. + serialized_article_task = dill_wrapper(wrapped_article_task) + + # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. 
+ yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), article_queue) + + def _get_warc_paths(self) -> List[str]: # Date regex examples: https://regex101.com/r/yDX3G6/1 date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") - if start >= end: + if self.start >= self.end: raise ValueError("Start date has to be < end date.") - if start < datetime(2016, 8, 1): + if self.start < datetime(2016, 8, 1): raise ValueError("The default, and earliest possible, start date is 2016/08/01.") - if end > datetime.now(): + if self.end > datetime.now(): raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") - date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) + date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=self.start, until=self.end)) urls: List[str] = [ - f"{server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence + f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence ] with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: @@ -370,19 +375,19 @@ def load_paths(url: str) -> List[str]: bar.update() return paths - if processes == 0: + if self.processes == 0: nested_warc_paths = [load_paths(url) for url in urls] else: # use two threads per process, default two threads per core - max_number_of_threads = processes * 2 + max_number_of_threads = self.processes * 2 with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: nested_warc_paths = pool.map(load_paths, urls) warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) - start_strf = start.strftime("%Y%m%d%H%M%S") - end_strf = end.strftime("%Y%m%d%H%M%S") + start_strf = self.start.strftime("%Y%m%d%H%M%S") + end_strf = self.end.strftime("%Y%m%d%H%M%S") def filter_warc_path_by_date(path: str) -> bool: match: Optional[re.Match[str]] = date_pattern.search(path) @@ -391,6 +396,27 @@ def filter_warc_path_by_date(path: str) -> bool: return start_strf <= match["date"] <= end_strf return sorted( - (f"{server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), + (f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), reverse=True, ) + + def _build_article_iterator( + self, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], + ) -> Iterator[Article]: + warc_paths = tuple(self._get_warc_paths()) + + article_task = partial( + self._fetch_articles, + publishers=self.publishers, + error_handling=error_handling, + extraction_filter=extraction_filter, + url_filter=url_filter, + ) + + if self.processes == 0: + yield from self._single_crawl(warc_paths, article_task) + else: + yield from self._parallel_crawl(warc_paths, article_task) diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 8cde4603a..8f181ba2d 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -51,7 +51,7 @@ def scrape( yield article -class Scraper(BaseScraper): +class WebScraper(BaseScraper): def __init__( self, publisher: PublisherEnum, diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index f6c43a68c..8479fbc12 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -19,11 +19,12 @@ class SessionHandler: tear-downed and the next call to get_session() will build a new 
session. """ - def __init__(self): - self._session: Optional[requests.Session] = None + def __init__(self, pool_connections: int = 50, pool_maxsize: int = 50): + self.session: Optional[requests.Session] = None + self.pool_connections = pool_connections + self.pool_maxsize = pool_maxsize - @staticmethod - def _session_factory() -> requests.Session: + def _session_factory(self) -> requests.Session: """Builds a new Session This returns a new client session build from pre-defined configurations: @@ -47,13 +48,18 @@ def _response_log(response: requests.Response, *args, **kwargs) -> None: ) # hooks - hooks = {"response": [lambda response, *args, **kwargs: response.raise_for_status(), _response_log]} - session.hooks = hooks + response_hooks = [lambda response, *args, **kwargs: response.raise_for_status(), _response_log] + session.hooks["response"].extend(response_hooks) # adapters - adapter_kwargs = {"pool_connections": 50, "pool_maxsize": 50} - session.mount("http://", requests.adapters.HTTPAdapter(**adapter_kwargs)) - session.mount("https://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + session.mount( + "http://", + requests.adapters.HTTPAdapter(pool_connections=self.pool_connections, pool_maxsize=self.pool_maxsize), + ) + session.mount( + "https://", + requests.adapters.HTTPAdapter(pool_connections=self.pool_connections, pool_maxsize=self.pool_maxsize), + ) return session @@ -67,9 +73,9 @@ def get_session(self) -> requests.Session: Returns: requests.Session: The current build session """ - if not self._session: - self._session = self._session_factory() - return self._session + if not self.session: + self.session = self._session_factory() + return self.session def close_current_session(self) -> None: """Tears down the current build session @@ -80,7 +86,7 @@ def close_current_session(self) -> None: session = self.get_session() basic_logger.debug(f"Close session {session}") session.close() - self._session = None + self.session = None session_handler = SessionHandler() diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 5eb414ea7..3d883b2db 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -104,8 +104,8 @@ class Sitemap(URLSource): _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") _url_selector: ClassVar[XPath] = CSSSelector("url > loc") - def __iter__(self) -> AsyncIterator[str]: - def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: + def __iter__(self) -> Iterator[str]: + def yield_recursive(sitemap_url: str) -> Iterator[str]: session = session_handler.get_session() if not validators.url(sitemap_url): basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") diff --git a/tests/fixtures/fixture_collection.py b/tests/fixtures/fixture_collection.py index d4553fd71..94598285b 100644 --- a/tests/fixtures/fixture_collection.py +++ b/tests/fixtures/fixture_collection.py @@ -72,7 +72,7 @@ class PubEnum(PublisherEnum): @pytest.fixture -def collection_with_validate_publisher_enum(publisher_enum_with_news_map): +def collection_with_valid_publisher_enum(publisher_enum_with_news_map): class CollectionWithValidatePublisherEnum(metaclass=PublisherCollectionMeta): pub = publisher_enum_with_news_map diff --git a/tests/test_collection.py b/tests/test_collection.py index a307efe34..1c2fa308a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -11,8 +11,8 @@ def test_iter_empty_collection(self, empty_collection): def test_iter_collection_with_empty_publisher_enum(self, 
collection_with_empty_publisher_enum): assert list(collection_with_empty_publisher_enum) == [] - def test_iter_collection_with_publisher_enum(self, collection_with_validate_publisher_enum): - assert list(collection_with_validate_publisher_enum) == [collection_with_validate_publisher_enum.pub.value] + def test_iter_collection_with_publisher_enum(self, collection_with_valid_publisher_enum): + assert list(collection_with_valid_publisher_enum) == [collection_with_valid_publisher_enum.pub.value] def test_publisher_enum_with_wrong_enum_value(self): with pytest.raises(ValueError): diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 6bae9ec23..0cbfafd5b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -6,34 +6,28 @@ class TestPipeline: def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum): crawler = Crawler(collection_with_empty_publisher_enum) - assert crawler.scrapers == tuple() + assert crawler.publishers == tuple() assert next(crawler.crawl(), None) is None with pytest.raises(ValueError): Crawler(*collection_with_empty_publisher_enum) - def test_crawler_with_collection(self, collection_with_validate_publisher_enum): - crawler = Crawler(*collection_with_validate_publisher_enum) - publisher = collection_with_validate_publisher_enum.pub.value - print(crawler.scrapers) - assert len(crawler.scrapers) == 1 - assert len(crawler.scrapers[0].sources) == len( - list(value for value in publisher.source_mapping.values() if value) - ) + def test_crawler_with_collection(self, collection_with_valid_publisher_enum): + crawler = Crawler(*collection_with_valid_publisher_enum) + publisher = collection_with_valid_publisher_enum.pub.value + assert len(crawler.publishers) == 1 def test_crawler_with_publisher_enum(self, publisher_enum_with_rss_feeds, publisher_enum_with_news_map): crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map) - assert len(crawler.scrapers) == 2 + assert len(crawler.publishers) == 2 crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map, restrict_sources_to=[RSSFeed]) - assert len(crawler.scrapers) == 1 - assert crawler.scrapers[0].sources == publisher_enum_with_rss_feeds.value.source_mapping[RSSFeed] + assert len(crawler.publishers) == 2 crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map, restrict_sources_to=[NewsMap]) - assert len(crawler.scrapers) == 1 - assert crawler.scrapers[0].sources == publisher_enum_with_news_map.value.source_mapping[NewsMap] + assert len(crawler.publishers) == 2 - def test_consecutive_calls_to_crawl(self, collection_with_validate_publisher_enum): - crawler = Crawler(collection_with_validate_publisher_enum) + def test_consecutive_calls_to_crawl(self, collection_with_valid_publisher_enum): + crawler = Crawler(collection_with_valid_publisher_enum) next(crawler.crawl(max_articles=0), None) next(crawler.crawl(max_articles=0), None) From 90139b51df5522b302a478ca3cbd583e12e60657 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:10:24 +0100 Subject: [PATCH 06/36] remove async code --- pyproject.toml | 2 -- src/fundus/__init__.py | 19 ----------- src/fundus/scraping/url.py | 1 - src/fundus/utils/more_async.py | 61 ---------------------------------- 4 files changed, 83 deletions(-) delete mode 100644 src/fundus/utils/more_async.py diff --git a/pyproject.toml b/pyproject.toml index eb47ff248..a1b42a894 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,6 @@ dependencies = [ "colorama>=0.4, <1", 
"typing-extensions>=4.0, <5", "langdetect>=1.0, <2", - "aiohttp>=3.8, <4", - "aioitertools>=0.11, <1", "validators>=0.20, <1", "requests>=2.28, <3", "tqdm>=4.66, <5", diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index f5e5e45ff..f95f8b9d1 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -19,22 +19,3 @@ "Sitemap", "NewsMap", ] - -# On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, -# Python throws an `RuntimeError: Event loop is closed exception` during Python's clean-up phase. - -# To reproduce the error run the following code: -# from fundus import Crawler, PublisherCollection -# crawler = Crawler(PublisherCollection.de.DieWelt) -# for article in crawler.crawl(max_articles=1): -# pass -# for article in crawler.crawl(max_articles=1): -# pass - -# A workaround involves to modify the event loop policy of asyncio on Windows machines. -# Unfortunately, this is a global modification. For further information see: -# https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop -if sys.platform == "win32": - import asyncio - - asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 3d883b2db..207b0b721 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -4,7 +4,6 @@ from dataclasses import dataclass, field from functools import cached_property from typing import ( - AsyncIterator, Callable, ClassVar, Dict, diff --git a/src/fundus/utils/more_async.py b/src/fundus/utils/more_async.py deleted file mode 100644 index 306fb4699..000000000 --- a/src/fundus/utils/more_async.py +++ /dev/null @@ -1,61 +0,0 @@ -import asyncio -from asyncio import AbstractEventLoop -from typing import AsyncIterator, Iterable, TypeVar, Union, overload - -_T = TypeVar("_T") -_VT = TypeVar("_VT") - - -class _Sentinel: - pass - - -__sentinel = _Sentinel() - - -@overload -async def async_next(iterator: AsyncIterator[_T]) -> _T: - ... - - -@overload -async def async_next(iterator: AsyncIterator[_T], default: Union[_VT, _Sentinel]) -> Union[_T, _VT]: - ... - - -async def async_next(iterator: AsyncIterator[_T], default: Union[_VT, _Sentinel] = __sentinel) -> Union[_T, _VT]: - task = iterator.__anext__() - try: - return await task - except StopAsyncIteration: - if not isinstance(default, _Sentinel): - return default - else: - raise StopAsyncIteration - - -async def make_iterable_async(iterable: Iterable[_T]) -> AsyncIterator[_T]: - for nxt in iterable: - yield nxt - - -class ManagedEventLoop: - def __init__(self) -> None: - self.event_loop: AbstractEventLoop - - def __enter__(self) -> AbstractEventLoop: - try: - asyncio.get_running_loop() - raise AssertionError() - except RuntimeError: - self.event_loop = asyncio.new_event_loop() - except AssertionError: - raise RuntimeError( - "There is already an event loop running. If you want to crawl articles inside an " - "async environment use crawl_async() instead." 
- ) - return self.event_loop - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.event_loop.run_until_complete(self.event_loop.shutdown_asyncgens()) - self.event_loop.close() From f8436c74a3148c65d16d0ce05cc90b18ef582d4e Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:11:17 +0100 Subject: [PATCH 07/36] rename pipeline.py -> crawler.py --- src/fundus/__init__.py | 2 +- src/fundus/scraping/{pipeline.py => crawler.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/fundus/scraping/{pipeline.py => crawler.py} (100%) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index f95f8b9d1..63f964a5e 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,7 +3,7 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires -from fundus.scraping.pipeline import CrawlerBase, Crawler, CCNewsCrawler +from fundus.scraping.crawler import CrawlerBase, Crawler, CCNewsCrawler from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/crawler.py similarity index 100% rename from src/fundus/scraping/pipeline.py rename to src/fundus/scraping/crawler.py From f1be26b61e213177677f9ef76ad3232f93e9e668 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:29:18 +0100 Subject: [PATCH 08/36] update documentation --- README.md | 2 +- docs/1_getting_started.md | 2 -- docs/2_crawl_from_cc_news.md | 10 +++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 15c02b605..687495d1a 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article from fundus import PublisherCollection, Crawler # initialize the crawler for Washington Times -crawler = Crawler(PublisherCollection.us.WashingtonTimes) +crawler = Crawler(PublisherCollection.us.TheNewYorker) # crawl 2 articles and print for article in crawler.crawl(max_articles=2): diff --git a/docs/1_getting_started.md b/docs/1_getting_started.md index b39ba9f05..f5abbb209 100644 --- a/docs/1_getting_started.md +++ b/docs/1_getting_started.md @@ -46,8 +46,6 @@ You can also initialize a crawler for the entire publisher collection crawler = Crawler(PublisherCollection) ```` -**_NOTE:_** To build a pipeline from low-level `Scraper` objects make use of the `BaseCrawler` class. - # How to crawl articles Now to crawl articles make use of the `crawl()` method of the initialized crawler class. diff --git a/docs/2_crawl_from_cc_news.md b/docs/2_crawl_from_cc_news.md index 0c36a17c0..a43298c65 100644 --- a/docs/2_crawl_from_cc_news.md +++ b/docs/2_crawl_from_cc_news.md @@ -1,12 +1,12 @@ # Table of Contents -* [Crawl articles from CC-NEWS](#crawl-articles-from-cc-news) +* [How to crawl articles from CC-NEWS](#how-to-crawl-articles-from-cc-news) * [The crawler](#the-crawler) * [OS start method](#os-start-method) * [Date range](#date-range) * [Multiprocessing](#multiprocessing) -# Crawl articles from CC-NEWS +# How to crawl articles from CC-NEWS This tutorial explains how to crawl articles from the [CC-NEWS](https://paperswithcode.com/dataset/cc-news) dataset using Fundus. 
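Written out against the signature this patch introduces, the workflow the following hunks document looks roughly like this; the date range, process count, and article limit are example values.

````python
from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

# The date range is now fixed at construction time; processes=4 distributes the
# WARC files over four worker processes, while processes=0 keeps everything in
# the main process (the _single_crawl branch above).
crawler = CCNewsCrawler(
    *PublisherCollection,
    start=datetime(2020, 1, 1),
    end=datetime(2020, 3, 1),
    processes=4,
)

for article in crawler.crawl(max_articles=100):
    print(article)
````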
@@ -48,8 +48,8 @@ from datetime import datetime from fundus import CCNewsCrawler, PublisherCollection -crawler = CCNewsCrawler(*PublisherCollection) -for article in crawler.crawl(start=datetime(2020, 1, 1), end=datetime(2020, 3, 1), max_articles=100): +crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1)) +for article in crawler.crawl(max_articles=100): print(article) ```` @@ -66,7 +66,7 @@ from fundus import CCNewsCrawler, PublisherCollection crawler = CCNewsCrawler(*PublisherCollection, processes=4) ```` -To omit multiprocessing, pass `0` to the `processes` parameter. +To omit multiprocessing, pass `-1` to the `processes` parameter. In the [next section](3_the_article_class.md) we will introduce you to the `Article` class. From 67fafc69375a6a598081bc37ff1777e53f2850b4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:32:46 +0100 Subject: [PATCH 09/36] code cleanup --- src/fundus/__init__.py | 3 +-- src/fundus/scraping/crawler.py | 2 +- src/fundus/scraping/html.py | 2 +- src/fundus/scraping/url.py | 10 +--------- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 63f964a5e..d1a4d2482 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -1,9 +1,8 @@ import pathlib -import sys from fundus.publishers import PublisherCollection +from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase from fundus.scraping.filter import Requires -from fundus.scraping.crawler import CrawlerBase, Crawler, CCNewsCrawler from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index bbb6a7bff..4356803c4 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -3,7 +3,7 @@ import gzip import os import re -from abc import abstractmethod, ABC +from abc import ABC, abstractmethod from datetime import datetime from functools import lru_cache, partial, wraps from multiprocessing import Manager diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 00725d12f..9f04d869c 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass from datetime import datetime -from typing import Dict, Iterable, Iterator, List, Optional, Protocol, Union +from typing import Dict, Iterable, Iterator, List, Optional, Protocol from urllib.parse import urlparse import chardet diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 207b0b721..ea830a27b 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -3,15 +3,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from functools import cached_property -from typing import ( - Callable, - ClassVar, - Dict, - Iterable, - Iterator, - List, - Optional, -) +from typing import Callable, ClassVar, Dict, Iterable, Iterator, List, Optional import feedparser import lxml.html From 8c36a71a75de78c595778558aa95eb8d38d221ad Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 15:41:05 +0100 Subject: [PATCH 10/36] add logic to filter publisher not fulfilling extraction requirements --- src/fundus/scraping/crawler.py | 61 +++++++++++++++++---- tests/fixtures/fixture_collection.py | 25 +++++++++ tests/{test_pipeline.py => test_crawler.py} | 18 ++++++ 3 files changed, 92 insertions(+), 12 deletions(-) 
rename tests/{test_pipeline.py => test_crawler.py} (66%) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 4356803c4..9e6d8dd9e 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -32,10 +32,10 @@ import requests from dateutil.rrule import MONTHLY, rrule from tqdm import tqdm -from typing_extensions import ParamSpec +from typing_extensions import ParamSpec, TypeAlias -from fundus import PublisherCollection -from fundus.publishers.base_objects import PublisherEnum +from fundus.logging import basic_logger +from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter @@ -46,6 +46,8 @@ _T = TypeVar("_T") _P = ParamSpec("_P") +Publisher: TypeAlias = Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta] + # noinspection PyPep8Naming class dill_wrapper(Generic[_P, _T]): @@ -112,9 +114,13 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): + def __init__(self, *publishers: Publisher): + self.publishers = tuple(set(more_itertools.collapse(publishers))) + @abstractmethod def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -174,8 +180,35 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() + extraction_filter = build_extraction_filter() + fitting_publisher: List[PublisherEnum] = [] + + if isinstance(extraction_filter, Requires): + for publisher in self.publishers: + supported_attributes = set( + more_itertools.flatten( + collection.names for collection in publisher.parser.attribute_mapping.values() + ) + ) + if missing_attributes := extraction_filter.required_attributes - supported_attributes: + basic_logger.warning( + f"The required attribute(s) `{', '.join(missing_attributes)}` " + f"is(are) not supported by {publisher.publisher_name}. 
Skipping publisher" + ) + else: + fitting_publisher.append(publisher) + + if not fitting_publisher: + basic_logger.error( + f"Could not find any fitting publisher for required attributes " + f"`{', '.join(extraction_filter.required_attributes)}`" + ) + return + article_idx = 0 - for article in self._build_article_iterator(error_handling, build_extraction_filter(), url_filter): + for article in self._build_article_iterator( + tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter + ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) article_idx += 1 @@ -187,7 +220,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: class Crawler(CrawlerBase): def __init__( self, - *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], + *publishers: Publisher, restrict_sources_to: Optional[List[Type[URLSource]]] = None, delay: Optional[Union[float, Delay]] = 1.0, threading: bool = True, @@ -196,7 +229,7 @@ def __init__( Examples: >>> from fundus import PublisherCollection, Crawler - >>> crawler = Crawler(PublisherCollection) + >>> crawler = Crawler(*PublisherCollection) >>> # Crawler(PublisherCollection.us) to crawl only english news >>> for article in crawler.crawl(): >>> print(article) @@ -215,7 +248,8 @@ def __init__( if not publishers: raise ValueError("param of has to be non empty") - self.publishers = tuple(more_itertools.collapse(publishers)) + super().__init__(*publishers) + self.restrict_sources_to = restrict_sources_to self.delay = delay self.threading = threading @@ -268,6 +302,7 @@ def _threaded_crawl( def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -280,15 +315,15 @@ def _build_article_iterator( ) if self.threading: - yield from self._threaded_crawl(self.publishers, article_task) + yield from self._threaded_crawl(publishers, article_task) else: - yield from self._single_crawl(self.publishers, article_task) + yield from self._single_crawl(publishers, article_task) class CCNewsCrawler(CrawlerBase): def __init__( self, - *publishers: PublisherEnum, + *publishers: Publisher, start: datetime = datetime(2016, 8, 1), end: datetime = datetime.now(), processes: int = -1, @@ -305,7 +340,8 @@ def __init__( server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
""" - self.publishers = tuple(more_itertools.collapse(publishers)) + super().__init__(*publishers) + self.start = start self.end = end self.processes = os.cpu_count() or 0 if processes == -1 else processes @@ -402,6 +438,7 @@ def filter_warc_path_by_date(path: str) -> bool: def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -410,7 +447,7 @@ def _build_article_iterator( article_task = partial( self._fetch_articles, - publishers=self.publishers, + publishers=publishers, error_handling=error_handling, extraction_filter=extraction_filter, url_filter=url_filter, diff --git a/tests/fixtures/fixture_collection.py b/tests/fixtures/fixture_collection.py index 94598285b..b18879bb2 100644 --- a/tests/fixtures/fixture_collection.py +++ b/tests/fixtures/fixture_collection.py @@ -77,3 +77,28 @@ class CollectionWithValidatePublisherEnum(metaclass=PublisherCollectionMeta): pub = publisher_enum_with_news_map return CollectionWithValidatePublisherEnum + + +@pytest.fixture +def collection_with_two_valid_publisher_enum(parser_proxy_with_version): + class PubEnumNews(PublisherEnum): + news = PublisherSpec( + name="test_pub", + domain="https://test.com/", + sources=[NewsMap("https://test.com/test_newsmap")], + parser=parser_proxy_with_version, + ) + + class PubEnumSitemap(PublisherEnum): + sitemap = PublisherSpec( + name="test_pub", + domain="https://test.com/", + sources=[Sitemap("https://test.com/test_sitemap")], + parser=parser_proxy_with_version, + ) + + class CollectionWithTwoValidatePublisherEnum(metaclass=PublisherCollectionMeta): + enum_news = PubEnumNews + enum_sitemap = PubEnumSitemap + + return CollectionWithTwoValidatePublisherEnum diff --git a/tests/test_pipeline.py b/tests/test_crawler.py similarity index 66% rename from tests/test_pipeline.py rename to tests/test_crawler.py index 0cbfafd5b..53587a951 100644 --- a/tests/test_pipeline.py +++ b/tests/test_crawler.py @@ -17,6 +17,24 @@ def test_crawler_with_collection(self, collection_with_valid_publisher_enum): publisher = collection_with_valid_publisher_enum.pub.value assert len(crawler.publishers) == 1 + def test_crawler_with_two_collections( + self, + collection_with_valid_publisher_enum, + collection_with_empty_publisher_enum, + collection_with_two_valid_publisher_enum, + ): + crawler = Crawler(collection_with_empty_publisher_enum, collection_with_valid_publisher_enum) + assert len(crawler.publishers) == 1 + + crawler = Crawler(collection_with_valid_publisher_enum, collection_with_valid_publisher_enum) + assert len(crawler.publishers) == 1 + + crawler = Crawler(collection_with_two_valid_publisher_enum) + assert len(crawler.publishers) == 2 + + crawler = Crawler(collection_with_valid_publisher_enum, collection_with_two_valid_publisher_enum) + assert len(crawler.publishers) == 3 + def test_crawler_with_publisher_enum(self, publisher_enum_with_rss_feeds, publisher_enum_with_news_map): crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map) assert len(crawler.publishers) == 2 From 7a392723fad5eb4b2409d1d8b18bc41386703609 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sat, 17 Feb 2024 17:32:59 +0100 Subject: [PATCH 11/36] remove duplicate error catch --- src/fundus/scraping/html.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 9f04d869c..d70bb5c8d 100644 --- a/src/fundus/scraping/html.py +++ 
b/src/fundus/scraping/html.py @@ -111,9 +111,6 @@ def filter_url(u: str) -> bool: return continue - except ConnectionError as error: - basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") - except Exception as error: basic_logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}") continue From 082b1d81ba0110791b6dd5452465b495aefa8968 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Fri, 8 Mar 2024 19:38:45 +0100 Subject: [PATCH 12/36] limit queue size --- src/fundus/scraping/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 9e6d8dd9e..327e027e4 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -294,7 +294,7 @@ def _single_crawl( def _threaded_crawl( publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] ) -> Iterator[Article]: - article_queue: Queue[Article] = Queue() + article_queue: Queue[Article] = Queue(len(publishers)) wrapped_article_task = queue_wrapper(article_queue, article_task) with ThreadPool(processes=len(publishers) or None) as pool: @@ -373,7 +373,7 @@ def _parallel_crawl( # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: - article_queue: Queue[Article] = manager.Queue() + article_queue: Queue[Article] = manager.Queue(maxsize=1000) # Because multiprocessing.Pool does not support iterators as targets, # we wrap the article_task to write the articles to a queue instead of returning them directly. 
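For readers skimming the two hunks above: the bounded queues only make sense together with the `queue_wrapper`/`pool_queue_iter` helpers that `crawler.py` already defines. The following self-contained sketch (illustrative names only, not the Fundus implementation) shows the underlying pattern: each worker's generator is wrapped so it pushes results into a shared, bounded queue, and the main thread drains that queue while `map_async` is still running. The `maxsize` given to the queue is what provides back-pressure on the producers, which is why these hunks cap the queue sizes.

````python
from multiprocessing.pool import ThreadPool
from queue import Empty, Queue
from typing import Callable, Iterator, List, TypeVar

_T = TypeVar("_T")


def queue_wrapper(queue: "Queue[_T]", task: Callable[[int], Iterator[_T]]) -> Callable[[int], None]:
    """Wrap a generator task so it writes its items into <queue> instead of returning them."""

    def wrapper(arg: int) -> None:
        for item in task(arg):
            queue.put(item)  # blocks once the queue is full -> back-pressure on the producer

    return wrapper


def pool_queue_iter(handle, queue: "Queue[_T]") -> Iterator[_T]:
    """Yield from <queue> until every worker has finished and the queue is drained."""
    while True:
        try:
            yield queue.get(timeout=0.1)
        except Empty:
            if handle.ready():  # all workers returned
                while not queue.empty():  # drain anything added after the last timeout
                    yield queue.get_nowait()
                break


def fake_article_source(publisher_id: int) -> Iterator[str]:
    # stand-in for fetching and parsing the articles of one publisher
    for index in range(3):
        yield f"publisher-{publisher_id} article-{index}"


if __name__ == "__main__":
    publishers: List[int] = [0, 1, 2]
    article_queue: "Queue[str]" = Queue(maxsize=len(publishers))  # bounded, as in the patch
    wrapped_task = queue_wrapper(article_queue, fake_article_source)

    with ThreadPool(processes=len(publishers)) as pool:
        handle = pool.map_async(wrapped_task, publishers)
        for article in pool_queue_iter(handle, article_queue):
            print(article)
````

The `CCNewsCrawler` applies the same idea with a `multiprocessing.Manager().Queue(maxsize=1000)`, so the streaming WARC workers cannot run arbitrarily far ahead of the consuming iterator.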
From afdda0a0f9d8a7e4df4f339cdcf557e9df8cd8b3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 25 Mar 2024 14:35:49 +0100 Subject: [PATCH 13/36] finish merge --- scripts/generate_parser_test_files.py | 16 ++++++++-------- src/fundus/scraping/crawler.py | 10 ++-------- src/fundus/scraping/scraper.py | 10 ++++++++-- tests/utility.py | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 83ab0e0ca..8c24b9453 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -5,25 +5,25 @@ from tqdm import tqdm -from fundus import BaseCrawler, Crawler, PublisherCollection +from fundus import Crawler, PublisherCollection from fundus.logging import basic_logger from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.html import FundusSource -from fundus.scraping.scraper import Scraper +from fundus.scraping.filter import RequiresAll +from fundus.scraping.html import WebSource +from fundus.scraping.scraper import BaseScraper, WebScraper from tests.test_parser import attributes_required_to_cover from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]: - crawler: BaseCrawler if url is None: crawler = Crawler(enum) + return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) else: - source = FundusSource([url], publisher=enum.publisher_name) - scraper = Scraper(source, parser=enum.parser) - crawler = BaseCrawler(scraper) - return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) + source = WebSource([url], publisher=enum.publisher_name) + scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) + return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll())) def parse_arguments() -> Namespace: diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 327e027e4..a0fb75aaa 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -38,7 +38,7 @@ from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter +from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource from fundus.scraping.scraper import CCNewsScraper, WebScraper from fundus.scraping.url import URLSource @@ -168,13 +168,7 @@ def crawl( def build_extraction_filter() -> Optional[ExtractionFilter]: if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) + return None if only_complete is False else RequiresAll() else: return only_complete diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 84965e437..57ceb3860 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -7,7 +7,11 @@ from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay -from fundus.scraping.filter import ExtractionFilter, 
FilterResultWithMissingAttributes, URLFilter +from fundus.scraping.filter import ( + ExtractionFilter, + FilterResultWithMissingAttributes, + URLFilter, +) from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource from fundus.scraping.url import URLSource @@ -51,7 +55,9 @@ def scrape( f"{', '.join(filter_result.missing_attributes)!r} is(are) missing" ) else: - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") + basic_logger.debug( + f"Skipped article at '{html.requested_url}' because of extraction filter" + ) else: article = Article.from_extracted(html=html, extracted=extraction) yield article diff --git a/tests/utility.py b/tests/utility.py index 91f343b43..22378c4b2 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -11,7 +11,7 @@ from fundus.parser import BaseParser from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.html import HTML, HTMLSource +from fundus.scraping.html import HTML, SourceInfo from scripts.generate_tables import supported_publishers_markdown_path from tests.resources.parser.test_data import __module_path__ as test_resource_path @@ -28,7 +28,7 @@ def get_test_articles(publisher: PublisherEnum) -> List[Article]: crawl_date=html_test_file.crawl_date, requested_url=html_test_file.url, responded_url=html_test_file.url, - source=HTMLSource(publisher.publisher_name), + source=SourceInfo(publisher.publisher_name), ) article = Article.from_extracted(extracted=extraction, html=html) articles.append(article) From 6574ac3bfbe9c2ca8bd7ab0db89805faea960c4a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:00:13 +0200 Subject: [PATCH 14/36] Apply suggestions from code review Co-authored-by: Adrian Breiding --- src/fundus/scraping/crawler.py | 7 ++++--- src/fundus/scraping/scraper.py | 2 +- src/fundus/scraping/session.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index a0fb75aaa..add008291 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -224,12 +224,12 @@ def __init__( Examples: >>> from fundus import PublisherCollection, Crawler >>> crawler = Crawler(*PublisherCollection) - >>> # Crawler(PublisherCollection.us) to crawl only english news + >>> # Crawler(PublisherCollection.us) to crawl only american news >>> for article in crawler.crawl(): >>> print(article) Args: - *publishers (Union[PublisherEnum, Type[PublisherEnum]]): The publishers to crawl. + *publishers (Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta]): The publishers to crawl. restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher specs. If set, only articles from given source types will be yielded. 
@@ -257,9 +257,10 @@ def _fetch_articles( ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: if isinstance(self.delay, float): + delay = self.delay def constant_delay() -> float: - return self.delay # type: ignore[return-value] + return delay return constant_delay diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 57ceb3860..f8753280d 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -86,7 +86,7 @@ def __init__( ) for url_source in url_sources ] - parser_mapping = {publisher.publisher_name: publisher.parser} + parser_mapping: Dict[str, ParserProxy] = {publisher.publisher_name: publisher.parser} super().__init__(*html_sources, parser_mapping=parser_mapping) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 8479fbc12..8df837994 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -33,7 +33,7 @@ def _session_factory(self) -> requests.Session: - hooks = {'request': lambda request:} Returns: - An new ClientSession + A new requests.Session """ session = requests.Session() From 10d1a560228cc3f7379ee2ad956a825acc48f52d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:06:48 +0200 Subject: [PATCH 15/36] fix indentation --- src/fundus/scraping/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index add008291..d64f58445 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -257,7 +257,7 @@ def _fetch_articles( ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: if isinstance(self.delay, float): - delay = self.delay + delay = self.delay def constant_delay() -> float: return delay From 4cfd4313531f175f414ea6d13397ca8552b3442d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:09:08 +0200 Subject: [PATCH 16/36] clean imports in documentation --- docs/4_how_to_filter_articles.md | 3 +-- docs/5_how_to_search_for_publishers.md | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/4_how_to_filter_articles.md b/docs/4_how_to_filter_articles.md index 786e6a387..d6233d18f 100644 --- a/docs/4_how_to_filter_articles.md +++ b/docs/4_how_to_filter_articles.md @@ -184,8 +184,7 @@ You can preselect the source for your articles when initializing a new `Crawler` Let's initiate a crawler who only crawls from `NewsMaps`'s. ````python -from fundus import Crawler, PublisherCollection -from fundus.scraping.url import NewsMap +from fundus import Crawler, PublisherCollection, NewsMap crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap]) ```` diff --git a/docs/5_how_to_search_for_publishers.md b/docs/5_how_to_search_for_publishers.md index 3d09bb5f5..bbbbf79ee 100644 --- a/docs/5_how_to_search_for_publishers.md +++ b/docs/5_how_to_search_for_publishers.md @@ -15,8 +15,7 @@ You can search through the collection to get only publishers fitting your use ca Let's get some publishers based in the US, supporting an attribute called `topics` and `NewsMap` as a source, and use them to initialize a crawler afterward. 
````python -from fundus import Crawler, PublisherCollection -from fundus.scraping.url import NewsMap +from fundus import Crawler, PublisherCollection, NewsMap fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap]) crawler = Crawler(fitting_publishers) From 0c57fc1f7f060fe88624728005665a57fbb85049 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:12:43 +0200 Subject: [PATCH 17/36] add `None` as default to `next` for test file generation --- scripts/generate_parser_test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 8c24b9453..adbf2e773 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -23,7 +23,7 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional else: source = WebSource([url], publisher=enum.publisher_name) scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) - return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll())) + return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None) def parse_arguments() -> Namespace: From 53550fd1739279d8b4e3963062622be73bd3d0f3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:22:44 +0200 Subject: [PATCH 18/36] add `tmp` variable for secure iteration --- src/fundus/scraping/crawler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index d64f58445..e483a6c19 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -199,15 +199,15 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: ) return - article_idx = 0 + article_count = 0 for article in self._build_article_iterator( tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) - article_idx += 1 + article_count += 1 yield article - if article_idx == max_articles: + if article_count == max_articles: break @@ -237,6 +237,9 @@ def __init__( downloads. You can set a delay directly using float or any callable satisfying the Delay protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. + threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False, + the crawler will use a single thread for a publishers and load articles succesively. This will greatly + influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. 
""" if not publishers: @@ -279,7 +282,8 @@ def _single_crawl( ) -> Iterator[Article]: article_iterators = [article_task(publisher) for publisher in publishers] while article_iterators: - for iterator in article_iterators: + tmp = article_iterators + for iterator in tmp: try: yield next(iterator) except StopIteration: From 0dba3632849435c18105552ca4674d9f54ee1a6f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:30:55 +0200 Subject: [PATCH 19/36] adjust docstrings --- src/fundus/scraping/session.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 8df837994..327ed01b8 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -8,14 +8,14 @@ class SessionHandler: - """Object for handling project global aiohttp.ClientSessions + """Object for handling project global request.Session The session life cycle consists of three steps which can be repeated indefinitely: Build, Supply, Teardown. Initially there is no session build within the session handler. When a session is requested with get_session() either a new one is created with _session_factory() or the session handler's existing one returned. Every subsequent call to get_session() will return the same - aiohttp.ClientSession object. If close_current_session() is called, the current session will be + response.Session object. If close_current_session() is called, the current session will be tear-downed and the next call to get_session() will build a new session. """ @@ -30,12 +30,13 @@ def _session_factory(self) -> requests.Session: This returns a new client session build from pre-defined configurations: - pool_connections: 50 - pool_maxsize: 50 - - hooks = {'request': lambda request:} + - hooks = {'response': raise_for_status(), _response_log():} Returns: A new requests.Session """ + basic_logger.debug("Creating new session") session = requests.Session() def _response_log(response: requests.Response, *args, **kwargs) -> None: From 54974a8a9a3f23f609ba9d903bf0281d79ce7fbd Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:31:26 +0200 Subject: [PATCH 20/36] close session after crawler is being used --- src/fundus/scraping/crawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index e483a6c19..5f9253426 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -41,6 +41,7 @@ from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource from fundus.scraping.scraper import CCNewsScraper, WebScraper +from fundus.scraping.session import session_handler from fundus.scraping.url import URLSource _T = TypeVar("_T") @@ -210,6 +211,8 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: if article_count == max_articles: break + session_handler.close_current_session() + class Crawler(CrawlerBase): def __init__( From 06a61d85961dcad8016011d139402fa37684d665 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:33:43 +0200 Subject: [PATCH 21/36] apply 4857a1c to branch --- src/fundus/scraping/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index ea830a27b..6b9231af2 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -45,7 +45,7 @@ def __post_init__(self): if not self._request_header: self._request_header 
= _default_header if not validators.url(self.url): - raise ValueError(f"Invalid url '{self.url}'") + basic_logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}") def set_header(self, request_header: Dict[str, str]) -> None: self._request_header = request_header From 940bc976d496a91c69975c841e6ab64b6feea10d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:41:55 +0200 Subject: [PATCH 22/36] add log message if skipping entire publisher due to server errors --- src/fundus/scraping/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index d70bb5c8d..028ad3e72 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -108,7 +108,7 @@ def filter_url(u: str) -> bool: except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") if isinstance(error, HTTPError) and error.response.status_code >= 500: - return + basic_logger.info(f"Skipped {self.publisher} due to server errors: '{error}'") continue except Exception as error: From 6353352aa68b315343dd6b7682b92e2e01858c63 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Fri, 5 Apr 2024 15:31:04 +0200 Subject: [PATCH 23/36] change log level for error message in test case script to error --- scripts/generate_parser_test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index adbf2e773..de8c10783 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -106,7 +106,7 @@ def main() -> None: if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version): if not (article := get_test_article(publisher, url)): - basic_logger.warning(f"Couldn't get article for {publisher.name}. Skipping") + basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping") continue html = HTMLTestFile( url=article.html.responded_url, From afa4c462f663d0925ae298594dd8d2e0b02cf34f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 13:58:41 +0200 Subject: [PATCH 24/36] Apply suggestions from code review Co-authored-by: Conrad Dobberstein --- src/fundus/scraping/crawler.py | 4 ++-- src/fundus/scraping/session.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 5f9253426..078a62cea 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -195,7 +195,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: if not fitting_publisher: basic_logger.error( - f"Could not find any fitting publisher for required attributes " + f"Could not find any fitting publishers for required attributes " f"`{', '.join(extraction_filter.required_attributes)}`" ) return @@ -241,7 +241,7 @@ def __init__( protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False, - the crawler will use a single thread for a publishers and load articles succesively. This will greatly + the crawler will use a single thread for all publishers and load articles successively. This will greatly influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. 
""" diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 327ed01b8..440574cde 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -8,7 +8,7 @@ class SessionHandler: - """Object for handling project global request.Session + """Object for handling project global request.Session The session life cycle consists of three steps which can be repeated indefinitely: Build, Supply, Teardown. From 153c0ebf27ec3065c657d100ad172256e523526c Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:56:39 +0200 Subject: [PATCH 25/36] fix README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 687495d1a..a092f0f74 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article ```python from fundus import PublisherCollection, Crawler -# initialize the crawler for Washington Times +# initialize the crawler for The New Yorker crawler = Crawler(PublisherCollection.us.TheNewYorker) # crawl 2 articles and print From f90c52e7496e27dd0994a0bf378dbcd63ccf026b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:56:58 +0200 Subject: [PATCH 26/36] remove leftover in `HTMLSource` --- src/fundus/scraping/html.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 028ad3e72..52497bbf2 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -59,9 +59,7 @@ class WebSourceInfo(SourceInfo): class HTMLSource(Protocol): - @abstractmethod - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: - ... + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ... class WebSource: From f9ba37f89a09b73be188f5fd347a8224aa63a649 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:57:47 +0200 Subject: [PATCH 27/36] add contextmanager to session handler and some more minor fixes --- src/fundus/scraping/crawler.py | 30 ++++++++++++------------- src/fundus/scraping/session.py | 41 ++++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 078a62cea..18e952710 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -31,6 +31,7 @@ import more_itertools import requests from dateutil.rrule import MONTHLY, rrule +from more_itertools import roundrobin from tqdm import tqdm from typing_extensions import ParamSpec, TypeAlias @@ -116,7 +117,11 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): def __init__(self, *publishers: Publisher): - self.publishers = tuple(set(more_itertools.collapse(publishers))) + + if not publishers: + raise ValueError("param of has to be non empty") + + self.publishers: List[PublisherEnum] = list(set(more_itertools.collapse(publishers))) @abstractmethod def _build_article_iterator( @@ -176,7 +181,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() extraction_filter = build_extraction_filter() - fitting_publisher: List[PublisherEnum] = [] + fitting_publishers: List[PublisherEnum] = [] if isinstance(extraction_filter, Requires): for publisher in self.publishers: @@ -191,18 +196,20 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: f"is(are) not supported by {publisher.publisher_name}. 
Skipping publisher" ) else: - fitting_publisher.append(publisher) + fitting_publishers.append(publisher) - if not fitting_publisher: + if not fitting_publishers: basic_logger.error( f"Could not find any fitting publishers for required attributes " f"`{', '.join(extraction_filter.required_attributes)}`" ) return + else: + fitting_publishers = self.publishers article_count = 0 for article in self._build_article_iterator( - tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter + tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) @@ -245,9 +252,6 @@ def __init__( influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. """ - if not publishers: - raise ValueError("param of has to be non empty") - super().__init__(*publishers) self.restrict_sources_to = restrict_sources_to @@ -284,13 +288,7 @@ def _single_crawl( publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] ) -> Iterator[Article]: article_iterators = [article_task(publisher) for publisher in publishers] - while article_iterators: - tmp = article_iterators - for iterator in tmp: - try: - yield next(iterator) - except StopIteration: - article_iterators.remove(iterator) + yield from roundrobin(*article_iterators) @staticmethod def _threaded_crawl( @@ -299,7 +297,7 @@ def _threaded_crawl( article_queue: Queue[Article] = Queue(len(publishers)) wrapped_article_task = queue_wrapper(article_queue, article_task) - with ThreadPool(processes=len(publishers) or None) as pool: + with ThreadPool(processes=len(publishers) or None) as pool, session_handler.context(len(publishers), 1): yield from pool_queue_iter(pool.map_async(wrapped_article_task, publishers), article_queue) def _build_article_iterator( diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 440574cde..d746cfe67 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,6 +1,8 @@ -from typing import Optional +from contextlib import contextmanager +from typing import Iterator, Optional import requests.adapters +from typing_extensions import Self from fundus.logging import basic_logger @@ -19,7 +21,7 @@ class SessionHandler: tear-downed and the next call to get_session() will build a new session. 
""" - def __init__(self, pool_connections: int = 50, pool_maxsize: int = 50): + def __init__(self, pool_connections: int = 50, pool_maxsize: int = 1): self.session: Optional[requests.Session] = None self.pool_connections = pool_connections self.pool_maxsize = pool_maxsize @@ -30,7 +32,7 @@ def _session_factory(self) -> requests.Session: This returns a new client session build from pre-defined configurations: - pool_connections: 50 - pool_maxsize: 50 - - hooks = {'response': raise_for_status(), _response_log():} + - hooks = {'response': raise_for_status(), _response_log()} Returns: A new requests.Session @@ -84,10 +86,35 @@ def close_current_session(self) -> None: Returns: None """ - session = self.get_session() - basic_logger.debug(f"Close session {session}") - session.close() - self.session = None + if self.session is not None: + session = self.get_session() + basic_logger.debug(f"Close session {session}") + session.close() + self.session = None + + @contextmanager + def context(self, pool_connections: int, pool_maxsize: int) -> Self: + """Context manager to temporarily overwrite parameter and build new session. + + Args: + pool_connections: see requests.Session documentation. + pool_maxsize: see requests.Session documentation. + + Returns: + SessionHandler: The session handler instance. + """ + previous_pool_connections = self.pool_connections + previous_pool_maxsize = self.pool_maxsize + + self.close_current_session() + + try: + self.pool_connections = pool_connections + self.pool_maxsize = pool_maxsize + yield self + finally: + self.pool_connections = previous_pool_connections + self.pool_maxsize = previous_pool_maxsize session_handler = SessionHandler() From 23269fa1f2c297669e596c7a42a4da47faf83fe9 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:58:56 +0200 Subject: [PATCH 28/36] rename `HTML.source` -> `HTML.source_info` --- src/fundus/scraping/article.py | 2 +- src/fundus/scraping/html.py | 8 ++++---- src/fundus/scraping/scraper.py | 2 +- tests/utility.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py index e8e6a2629..77eb9d770 100644 --- a/src/fundus/scraping/article.py +++ b/src/fundus/scraping/article.py @@ -83,7 +83,7 @@ def __str__(self): f'\n- Title: "{wrapped_title}"' f'\n- Text: "{wrapped_plaintext}"' f"\n- URL: {self.html.requested_url}" - f"\n- From: {self.html.source.publisher}" + f"\n- From: {self.html.source_info.publisher}" f'{" (" + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}' ) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 52497bbf2..40ec3510f 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -37,7 +37,7 @@ class HTML: responded_url: str content: str crawl_date: datetime - source: "SourceInfo" + source_info: "SourceInfo" @dataclass(frozen=True) @@ -122,7 +122,7 @@ def filter_url(u: str) -> bool: if response.history: basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") - source = ( + source_info = ( WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) if isinstance(self.url_source, URLSource) else SourceInfo(self.publisher) @@ -133,7 +133,7 @@ def filter_url(u: str) -> bool: responded_url=str(response.url), content=html, crawl_date=datetime.now(), - source=source, + source_info=source_info, ) if self.delay: @@ -213,7 +213,7 @@ def extract_content(record: WarcRecord) -> Optional[str]: 
responded_url=target_url, content=content, crawl_date=warc_record.record_date, - source=WarcSourceInfo( + source_info=WarcSourceInfo( publisher=publisher.publisher_name, warc_path=self.warc_path, warc_headers=dict(warc_record.headers), diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index f8753280d..b7f8f77ea 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -29,7 +29,7 @@ def scrape( ) -> Iterator[Article]: for source in self.sources: for html in source.fetch(url_filter=url_filter): - parser = self.parser_mapping[html.source.publisher] + parser = self.parser_mapping[html.source_info.publisher] try: extraction = parser(html.crawl_date).parse(html.content, error_handling) diff --git a/tests/utility.py b/tests/utility.py index 22378c4b2..42e9097b7 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -28,7 +28,7 @@ def get_test_articles(publisher: PublisherEnum) -> List[Article]: crawl_date=html_test_file.crawl_date, requested_url=html_test_file.url, responded_url=html_test_file.url, - source=SourceInfo(publisher.publisher_name), + source_info=SourceInfo(publisher.publisher_name), ) article = Article.from_extracted(extracted=extraction, html=html) articles.append(article) From aaf0f57288b6e18147560afcaf0e04930813d018 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 20:00:52 +0200 Subject: [PATCH 29/36] fix types + lint --- src/fundus/scraping/crawler.py | 3 +-- src/fundus/scraping/html.py | 3 ++- src/fundus/scraping/session.py | 2 +- tests/test_crawler.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 18e952710..890c0bfc5 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -117,7 +117,6 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): def __init__(self, *publishers: Publisher): - if not publishers: raise ValueError("param of has to be non empty") @@ -209,7 +208,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: article_count = 0 for article in self._build_article_iterator( - tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter + tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 40ec3510f..245135302 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -59,7 +59,8 @@ class WebSourceInfo(SourceInfo): class HTMLSource(Protocol): - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ... + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + ... class WebSource: diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index d746cfe67..36630cf4f 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -93,7 +93,7 @@ def close_current_session(self) -> None: self.session = None @contextmanager - def context(self, pool_connections: int, pool_maxsize: int) -> Self: + def context(self, pool_connections: int, pool_maxsize: int) -> Iterator[Self]: """Context manager to temporarily overwrite parameter and build new session. 
Args: diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 53587a951..20e8e1fe3 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -6,7 +6,7 @@ class TestPipeline: def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum): crawler = Crawler(collection_with_empty_publisher_enum) - assert crawler.publishers == tuple() + assert crawler.publishers == list() assert next(crawler.crawl(), None) is None with pytest.raises(ValueError): From f1b9a082681e9b3cb9a7b151a8d2aa645ba5cdc4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 20:01:54 +0200 Subject: [PATCH 30/36] fix imports after merge --- src/fundus/publishers/lt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/lt/__init__.py b/src/fundus/publishers/lt/__init__.py index 3fc3c7a2a..3fe649e38 100644 --- a/src/fundus/publishers/lt/__init__.py +++ b/src/fundus/publishers/lt/__init__.py @@ -1,5 +1,5 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec -from fundus.scraping.html import RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap from .lrt import LRTParser From 053c9fef1bbc05b6dc4f24895faf158dd28d45e5 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:37:15 +0200 Subject: [PATCH 31/36] fix bug leading to a potential `KeyError` in `Sitemap` --- src/fundus/scraping/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 6b9231af2..3db8e750a 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -106,7 +106,7 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]: basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") return content = response.content - if (content_type := response.headers["content-type"]) in self._decompressor.supported_file_formats: + if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats: content = self._decompressor.decompress(content, content_type) if not content: basic_logger.warning(f"Warning! 
Empty sitemap at '{sitemap_url}'") From 484d0c03fcb0fde4af130f8e99abc3cb7afc623f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:37:41 +0200 Subject: [PATCH 32/36] fix crawler delay for `WebSource` --- src/fundus/scraping/html.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 245135302..98cdb5927 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -85,12 +85,16 @@ def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: [url_filter] if url_filter else [] ) - timestamp = time.time() + timestamp = time.time() + self.delay() if self.delay is not None else time.time() def filter_url(u: str) -> bool: return any(f(u) for f in combined_filters) for url in self.url_source: + if self.delay: + time.sleep(max(0.0, self.delay() - time.time() + timestamp)) + timestamp = time.time() + if not validators.url(url): basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed") continue @@ -137,10 +141,6 @@ def filter_url(u: str) -> bool: source_info=source_info, ) - if self.delay: - time.sleep(max(0.0, self.delay() - time.time() + timestamp)) - timestamp = time.time() - class CCNewsSource: def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): From e5b865b8a7c7c6e8a51f3fcf5e96481359a11278 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:46:13 +0200 Subject: [PATCH 33/36] use a thread lock for `SessionHandler.get_session` --- src/fundus/scraping/session.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 36630cf4f..abcc2057f 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,3 +1,4 @@ +import threading from contextlib import contextmanager from typing import Iterator, Optional @@ -25,6 +26,7 @@ def __init__(self, pool_connections: int = 50, pool_maxsize: int = 1): self.session: Optional[requests.Session] = None self.pool_connections = pool_connections self.pool_maxsize = pool_maxsize + self.lock = threading.Lock() def _session_factory(self) -> requests.Session: """Builds a new Session @@ -76,9 +78,11 @@ def get_session(self) -> requests.Session: Returns: requests.Session: The current build session """ - if not self.session: - self.session = self._session_factory() - return self.session + + with self.lock: + if not self.session: + self.session = self._session_factory() + return self.session def close_current_session(self) -> None: """Tears down the current build session From 8e9f3d7f348bd769fc80ed791117f5bbfcc14e03 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 14:00:36 +0200 Subject: [PATCH 34/36] remove leftover imports --- src/fundus/parser/data.py | 2 -- src/fundus/publishers/fr/le_monde.py | 1 - src/fundus/publishers/na/the_namibian.py | 1 - src/fundus/scraping/delay.py | 1 - src/fundus/scraping/html.py | 1 - 5 files changed, 6 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index 1168e7793..3a6013c25 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -16,8 +16,6 @@ from typing_extensions import TypeAlias -from fundus.logging import basic_logger - LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]] diff --git a/src/fundus/publishers/fr/le_monde.py b/src/fundus/publishers/fr/le_monde.py index dfef8b6b2..6d4757ae3 100644 --- 
a/src/fundus/publishers/fr/le_monde.py +++ b/src/fundus/publishers/fr/le_monde.py @@ -9,7 +9,6 @@ extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, - generic_topic_parsing, ) diff --git a/src/fundus/publishers/na/the_namibian.py b/src/fundus/publishers/na/the_namibian.py index d711620fd..c52b96183 100644 --- a/src/fundus/publishers/na/the_namibian.py +++ b/src/fundus/publishers/na/the_namibian.py @@ -2,7 +2,6 @@ from datetime import datetime from typing import List, Optional, Pattern -from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute diff --git a/src/fundus/scraping/delay.py b/src/fundus/scraping/delay.py index c83cd6918..ee2a70830 100644 --- a/src/fundus/scraping/delay.py +++ b/src/fundus/scraping/delay.py @@ -1,6 +1,5 @@ from __future__ import annotations -import random from typing import Protocol, runtime_checkable diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 98cdb5927..d393d0aa3 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -1,5 +1,4 @@ import time -from abc import abstractmethod from dataclasses import dataclass from datetime import datetime from typing import Dict, Iterable, Iterator, List, Optional, Protocol From 4facafecac3c700a5236425be2ed68b2035f8ebb Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Wed, 17 Apr 2024 13:07:02 +0200 Subject: [PATCH 35/36] rearrange code --- scripts/generate_parser_test_files.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index b9da059c8..5a3afcf89 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -17,14 +17,14 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]: - if url is None: - crawler = Crawler(enum) - return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) - else: + if url is not None: source = WebSource([url], publisher=enum.publisher_name) scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None) + crawler = Crawler(enum) + return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) + def parse_arguments() -> Namespace: parser = ArgumentParser( From 6d12cad3f9b6d5911f38c215db44946607ec2d6b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Wed, 17 Apr 2024 13:07:23 +0200 Subject: [PATCH 36/36] adjust some docstrings --- src/fundus/scraping/session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index abcc2057f..4a804fbe0 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -32,9 +32,9 @@ def _session_factory(self) -> requests.Session: """Builds a new Session This returns a new client session build from pre-defined configurations: - - pool_connections: 50 - - pool_maxsize: 50 - - hooks = {'response': raise_for_status(), _response_log()} + - pool_connections: + - pool_maxsize: + - hooks: (1) Hook to raise an `HTTPError` if one occurred. (2) Hook to log the request responses. 
Returns: A new requests.Session @@ -98,7 +98,7 @@ def close_current_session(self) -> None: @contextmanager def context(self, pool_connections: int, pool_maxsize: int) -> Iterator[Self]: - """Context manager to temporarily overwrite parameter and build new session. + """Context manager to temporarily overwrite parameter and build a new session. Args: pool_connections: see requests.Session documentation.
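Taken together, the session-related commits in this series leave `SessionHandler` with a small life cycle API: `get_session()` lazily builds one shared `requests.Session` (now guarded by a thread lock and returned on every subsequent call), `close_current_session()` tears it down, and `context()` temporarily swaps the connection-pool parameters, as `Crawler` does with `session_handler.context(len(publishers), 1)`. A rough usage sketch based on the diffs above (the import path and exact behaviour are taken from this patch series and may differ in a released version):

````python
from fundus.scraping.session import session_handler

# One lazily built session is shared by all callers; repeated calls return the same object.
session = session_handler.get_session()
response = session.get("https://example.org")  # hooks raise an HTTPError on bad status codes and log the response

# Temporarily use a different pool configuration, e.g. one connection slot per publisher.
with session_handler.context(pool_connections=10, pool_maxsize=1):
    pooled_session = session_handler.get_session()  # a fresh session built with the temporary parameters
    pooled_session.get("https://example.org")

# Tear the session down when crawling is done; the crawler's crawl() now does this itself.
session_handler.close_current_session()
````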