From e88ee2bb943aa73331fb845107107f080ec4dde4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 13:43:43 +0100 Subject: [PATCH 01/36] first running draft --- docs/4_how_to_filter_articles.md | 3 +- docs/5_how_to_search_for_publishers.md | 3 +- docs/how_to_add_a_publisher.md | 10 +- src/fundus/__init__.py | 8 +- src/fundus/publishers/at/__init__.py | 2 +- src/fundus/publishers/base_objects.py | 13 +- src/fundus/publishers/de/__init__.py | 2 +- src/fundus/publishers/fr/__init__.py | 3 +- src/fundus/publishers/na/__init__.py | 2 +- src/fundus/publishers/uk/__init__.py | 2 +- src/fundus/publishers/us/__init__.py | 2 +- src/fundus/scraping/common_crawl/__init__.py | 3 - src/fundus/scraping/common_crawl/html.py | 92 ---- src/fundus/scraping/common_crawl/pipeline.py | 300 ------------- src/fundus/scraping/common_crawl/scraper.py | 48 --- src/fundus/scraping/delay.py | 26 ++ src/fundus/scraping/html.py | 419 ++++++------------ src/fundus/scraping/pipeline.py | 428 ++++++++++++------- src/fundus/scraping/scraper.py | 100 +++-- src/fundus/scraping/session.py | 108 +++++ src/fundus/scraping/url.py | 139 ++++++ 21 files changed, 764 insertions(+), 949 deletions(-) delete mode 100644 src/fundus/scraping/common_crawl/__init__.py delete mode 100644 src/fundus/scraping/common_crawl/html.py delete mode 100644 src/fundus/scraping/common_crawl/pipeline.py delete mode 100644 src/fundus/scraping/common_crawl/scraper.py create mode 100644 src/fundus/scraping/delay.py create mode 100644 src/fundus/scraping/session.py create mode 100644 src/fundus/scraping/url.py diff --git a/docs/4_how_to_filter_articles.md b/docs/4_how_to_filter_articles.md index d6233d18f..786e6a387 100644 --- a/docs/4_how_to_filter_articles.md +++ b/docs/4_how_to_filter_articles.md @@ -184,7 +184,8 @@ You can preselect the source for your articles when initializing a new `Crawler` Let's initiate a crawler who only crawls from `NewsMaps`'s. ````python -from fundus import Crawler, PublisherCollection, NewsMap +from fundus import Crawler, PublisherCollection +from fundus.scraping.url import NewsMap crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap]) ```` diff --git a/docs/5_how_to_search_for_publishers.md b/docs/5_how_to_search_for_publishers.md index bbbbf79ee..3d09bb5f5 100644 --- a/docs/5_how_to_search_for_publishers.md +++ b/docs/5_how_to_search_for_publishers.md @@ -15,7 +15,8 @@ You can search through the collection to get only publishers fitting your use ca Let's get some publishers based in the US, supporting an attribute called `topics` and `NewsMap` as a source, and use them to initialize a crawler afterward. ````python -from fundus import Crawler, PublisherCollection, NewsMap +from fundus import Crawler, PublisherCollection +from fundus.scraping.url import NewsMap fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap]) crawler = Crawler(fitting_publishers) diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md index 0c9cf14cb..d7c97fa1d 100644 --- a/docs/how_to_add_a_publisher.md +++ b/docs/how_to_add_a_publisher.md @@ -106,8 +106,11 @@ To instantiate an object inheriting from URLSource like `RSSFeed` or `Sitemap`, Getting links for RSS feeds can vary from publisher to publisher. Most of the time, you can find them through a quick browser search. 
Building an `RSSFeed` looks like this: + ````python -from fundus.scraping.html import RSSFeed + +from fundus import RSSFeed + RSSFeed("https://theintercept.com/feed/?rss") ```` @@ -159,8 +162,11 @@ You can alter this behavior or reverse the order in which sitemaps are processed **_NOTE:_** If you wonder why you should reverse your sources from time to time, `URLSource`'s should, if possible, yield URLs in descending order by publishing date. Now building a new `URLSource` for a `NewsMap` covering the LA Times looks like this: + ````python -from fundus.scraping.html import NewsMap + +from fundus import NewsMap + NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True) ```` diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 07fa55e96..fb5e46405 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -2,10 +2,9 @@ import sys from fundus.publishers import PublisherCollection -from fundus.scraping.common_crawl import CCNewsCrawler from fundus.scraping.filter import Requires -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap -from fundus.scraping.pipeline import BaseCrawler, Crawler +from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] @@ -16,9 +15,6 @@ "CCNewsCrawler", "PublisherCollection", "Requires", - "RSSFeed", - "Sitemap", - "NewsMap", ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, diff --git a/src/fundus/publishers/at/__init__.py b/src/fundus/publishers/at/__init__.py index 07e7a35ee..970da6295 100644 --- a/src/fundus/publishers/at/__init__.py +++ b/src/fundus/publishers/at/__init__.py @@ -1,5 +1,5 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec -from fundus.scraping.html import RSSFeed +from fundus.scraping.url import RSSFeed from .orf import OrfParser diff --git a/src/fundus/publishers/base_objects.py b/src/fundus/publishers/base_objects.py index 198464bdd..c1a0505eb 100644 --- a/src/fundus/publishers/base_objects.py +++ b/src/fundus/publishers/base_objects.py @@ -5,7 +5,7 @@ from fundus.parser.base_parser import ParserProxy from fundus.scraping.filter import URLFilter -from fundus.scraping.html import FundusSource, NewsMap, RSSFeed, Sitemap, URLSource +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource from fundus.utils.iteration import iterate_all_subclasses @@ -34,10 +34,11 @@ def __init__(self, spec: PublisherSpec): self.parser = spec.parser() self.publisher_name = spec.name self.url_filter = spec.url_filter + self.request_header = spec.request_header # we define the dict here manually instead of using default dict so that we can control # the order in which sources are proceeded. - source_mapping: Dict[Type[URLSource], List[FundusSource]] = { + source_mapping: Dict[Type[URLSource], List[URLSource]] = { RSSFeed: [], NewsMap: [], Sitemap: [], @@ -49,13 +50,7 @@ def __init__(self, spec: PublisherSpec): f"Unexpected type '{type(url_source).__name__}' as source for {self.name}. 
" f"Allowed are '{', '.join(cls.__name__ for cls in iterate_all_subclasses(URLSource))}'" ) - source: FundusSource = FundusSource( - url_source=url_source, - publisher=self.publisher_name, - url_filter=spec.url_filter, - request_header=spec.request_header, - ) - source_mapping[type(url_source)].append(source) + source_mapping[type(url_source)].append(url_source) self.source_mapping = source_mapping diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 14192bde1..112805fd9 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -2,7 +2,7 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import regex_filter -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap from .berliner_zeitung import BerlinerZeitungParser from .bild import BildParser diff --git a/src/fundus/publishers/fr/__init__.py b/src/fundus/publishers/fr/__init__.py index 2c6f3e868..71445369d 100644 --- a/src/fundus/publishers/fr/__init__.py +++ b/src/fundus/publishers/fr/__init__.py @@ -1,7 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.publishers.fr.le_monde import LeMondeParser -from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, Sitemap +from fundus.scraping.url import NewsMap, Sitemap class FR(PublisherEnum): diff --git a/src/fundus/publishers/na/__init__.py b/src/fundus/publishers/na/__init__.py index 8a7ee1b7f..bfb8e354f 100644 --- a/src/fundus/publishers/na/__init__.py +++ b/src/fundus/publishers/na/__init__.py @@ -1,6 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap from .the_namibian import TheNamibianParser diff --git a/src/fundus/publishers/uk/__init__.py b/src/fundus/publishers/uk/__init__.py index 2f19868f3..e4e14051d 100644 --- a/src/fundus/publishers/uk/__init__.py +++ b/src/fundus/publishers/uk/__init__.py @@ -2,7 +2,7 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, Sitemap +from fundus.scraping.url import NewsMap, Sitemap from .i_news import INewsParser from .the_guardian import TheGuardianParser diff --git a/src/fundus/publishers/us/__init__.py b/src/fundus/publishers/us/__init__.py index 8f52892ae..a6ce67906 100644 --- a/src/fundus/publishers/us/__init__.py +++ b/src/fundus/publishers/us/__init__.py @@ -1,6 +1,6 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec from fundus.scraping.filter import inverse, regex_filter -from fundus.scraping.html import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap from .ap_news import APNewsParser from .cnbc import CNBCParser diff --git a/src/fundus/scraping/common_crawl/__init__.py b/src/fundus/scraping/common_crawl/__init__.py deleted file mode 100644 index cf839eecf..000000000 --- a/src/fundus/scraping/common_crawl/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .pipeline import CCNewsCrawler - -__all__ = ["CCNewsCrawler"] diff --git a/src/fundus/scraping/common_crawl/html.py b/src/fundus/scraping/common_crawl/html.py deleted file mode 100644 index 2bca7b893..000000000 --- a/src/fundus/scraping/common_crawl/html.py 
+++ /dev/null @@ -1,92 +0,0 @@ -from typing import Dict, Iterator, Optional -from urllib.parse import urlparse - -import chardet -import requests -from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType - -from fundus.logging import basic_logger -from fundus.publishers.base_objects import PublisherEnum -from fundus.scraping.filter import URLFilter -from fundus.scraping.html import HTML, WarcSource, _default_header - - -class CCNewsSource: - def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): - self.publishers = publishers - self.warc_path = warc_path - self.headers = headers or _default_header - - self._publisher_mapping: Dict[str, PublisherEnum] = { - urlparse(publisher.domain).netloc: publisher for publisher in publishers - } - - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: - def extract_content(record: WarcRecord) -> Optional[str]: - warc_body: bytes = record.reader.read() - - try: - return str(warc_body, encoding=record.http_charset) - except (UnicodeDecodeError, TypeError): - encoding: Optional[str] = chardet.detect(warc_body)["encoding"] - - if encoding is not None: - basic_logger.debug( - f"Trying to decode record {record.record_id!r} from {target_url!r} " - f"using detected encoding {encoding}." - ) - - try: - return str(warc_body, encoding=encoding) - except UnicodeDecodeError: - basic_logger.warning( - f"Couldn't decode record {record.record_id!r} from {target_url!r} with " - f"original charset {record.http_charset!r} using detected charset {encoding!r}." - ) - else: - basic_logger.warning( - f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} " - f"with invalid original charset {record.http_charset!r}." - ) - - return None - - with requests.Session() as session: - stream = session.get(self.warc_path, stream=True, headers=self.headers).raw - - for warc_record in ArchiveIterator(stream, record_types=WarcRecordType.response, verify_digests=True): - target_url = str(warc_record.headers["WARC-Target-URI"]) - - if url_filter is not None and url_filter(target_url): - basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter") - continue - - publisher_domain: str = urlparse(target_url).netloc - - if publisher_domain not in self._publisher_mapping: - continue - - publisher = self._publisher_mapping[publisher_domain] - - if publisher.url_filter is not None and publisher.url_filter(target_url): - basic_logger.debug( - f"Skipped WARC record with target URI {target_url!r} because of " - f"publisher specific URL filter" - ) - continue - - if (content := extract_content(warc_record)) is None: - continue - - yield HTML( - requested_url=target_url, - responded_url=target_url, - content=content, - crawl_date=warc_record.record_date, - source=WarcSource( - publisher=publisher.publisher_name, - warc_path=self.warc_path, - warc_headers=dict(warc_record.headers), - http_headers=dict(warc_record.http_headers), - ), - ) diff --git a/src/fundus/scraping/common_crawl/pipeline.py b/src/fundus/scraping/common_crawl/pipeline.py deleted file mode 100644 index 4aa66070c..000000000 --- a/src/fundus/scraping/common_crawl/pipeline.py +++ /dev/null @@ -1,300 +0,0 @@ -from __future__ import annotations - -import gzip -import os -import re -from datetime import datetime -from functools import lru_cache, partial, wraps -from multiprocessing import Manager -from multiprocessing.context import TimeoutError -from multiprocessing.pool import MapResult, Pool, 
ThreadPool -from queue import Empty, Queue -from typing import ( - Any, - Callable, - Generic, - Iterator, - List, - Literal, - Optional, - Pattern, - Set, - Tuple, - TypeVar, - Union, - cast, -) - -import dill -import more_itertools -import requests -from dateutil.rrule import MONTHLY, rrule -from tqdm import tqdm -from typing_extensions import ParamSpec - -from fundus.publishers.base_objects import PublisherEnum -from fundus.scraping.article import Article -from fundus.scraping.common_crawl.html import CCNewsSource -from fundus.scraping.common_crawl.scraper import CCNewsScraper -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter - -_T = TypeVar("_T") -_P = ParamSpec("_P") - - -# noinspection PyPep8Naming -class dill_wrapper(Generic[_P, _T]): - def __init__(self, target: Callable[_P, _T]): - """Wraps function in dill serialization. - - This is in order to use unpickable functions within multiprocessing. - - Args: - target: The function to wrap. - """ - self._serialized_target: bytes = dill.dumps(target) - - @lru_cache - def _deserialize(self) -> Callable[_P, _T]: - return cast(Callable[_P, _T], dill.loads(self._serialized_target)) - - def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: - return self._deserialize()(*args, **kwargs) - - -def queue_wrapper(queue: Queue[_T], target: Callable[_P, Iterator[_T]]) -> Callable[_P, None]: - """Wraps the target callable to add its results to the queue instead of returning them directly. - - Args: - queue: The buffer queue. - target: A target callable. - - Returns: - (Callable[_P, None]) The wrapped target. - """ - - @wraps(target) - def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: - for obj in target(*args, **kwargs): - queue.put(obj) - - return wrapper - - -def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: - """Utility function to iterate exhaustively over a pool queue. - - The underlying iterator of this function repeatedly exhausts the given queue. - Then, if the queue is empty only if all the pool's jobs have finished, the iterator reruns. - Otherwise, it waits for the queue to be populated with the next result from the pool. - - Args: - handle (MapResult[Any]): A handle o the MappedResult of the underling multiprocessing pool. - queue (Queue[_T]): The pool queue. - - Returns: - Iterator[_T]: The iterator over the queue as it is populated. - """ - while True: - try: - yield queue.get(timeout=0.1) - except Empty: - try: - handle.get(timeout=0.1) - except TimeoutError: - continue - return - - -class CCNewsCrawler: - def __init__( - self, - *publishers: PublisherEnum, - processes: int = -1, - server_address: str = "https://data.commoncrawl.org/", - ): - """Initializes a crawler for the CC-NEWS dataset. - - Args: - *publishers: The publishers to crawl. - processes: Number of additional process to use for crawling. - If -1, the number of processes is set to `os.cpu_count()`. - If `os.cpu_count()` is not available, the number of processes is set to 0. - If 0, only the main process is used. Defaults to -1. - server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
- """ - self.publishers = tuple(more_itertools.collapse(publishers)) - self.processes = os.cpu_count() or 0 if processes == -1 else processes - self.server_address = server_address - - def _get_warc_paths(self, start: datetime, end: datetime) -> List[str]: - # Date regex examples: https://regex101.com/r/yDX3G6/1 - date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") - - if start >= end: - raise ValueError("Start date has to be < end date.") - - if start < datetime(2016, 8, 1): - raise ValueError("The default, and earliest possible, start date is 2016/08/01.") - - if end > datetime.now(): - raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") - - date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) - urls: List[str] = [ - f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence - ] - - with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: - - def load_paths(url: str) -> List[str]: - with requests.Session() as session: - paths = gzip.decompress(session.get(url).content).decode("utf-8").split() - bar.update() - return paths - - if self.processes == 0: - nested_warc_paths = [load_paths(url) for url in urls] - else: - # use two threads per process, default two threads per core - max_number_of_threads = self.processes * 2 - - with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: - nested_warc_paths = pool.map(load_paths, urls) - - warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) - - start_strf = start.strftime("%Y%m%d%H%M%S") - end_strf = end.strftime("%Y%m%d%H%M%S") - - def filter_warc_path_by_date(path: str) -> bool: - match: Optional[re.Match[str]] = date_pattern.search(path) - if match is None: - raise AssertionError(f"Invalid WARC path {path!r}") - return start_strf <= match["date"] <= end_strf - - return sorted( - (f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), - reverse=True, - ) - - @staticmethod - def _fetch_articles( - warc_path: str, - publishers: Tuple[PublisherEnum, ...], - error_handling: Literal["suppress", "catch", "raise"], - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, - ) -> Iterator[Article]: - source = CCNewsSource(*publishers, warc_path=warc_path) - scraper = CCNewsScraper(source) - yield from scraper.scrape(error_handling, extraction_filter, url_filter) - - @staticmethod - def _single_crawl(warc_paths: List[str], article_task: Callable[[str], Iterator[Article]]) -> Iterator[Article]: - for warc_path in warc_paths: - yield from article_task(warc_path) - - def _parallel_crawl( - self, warc_paths: List[str], article_task: Callable[[str], Iterator[Article]] - ) -> Iterator[Article]: - # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually - # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all - # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. - with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: - article_queue: Queue[Article] = manager.Queue() - - # Because multiprocessing.Pool does not support iterators as targets, - # we wrap the article_task to write the articles to a queue instead of returning them directly. 
- wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - - # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. - serialized_article_task = dill_wrapper(wrapped_article_task) - - # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. - yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), article_queue) - - def crawl( - self, - start: datetime = datetime(2016, 8, 1), - end: datetime = datetime.now(), - max_articles: Optional[int] = None, - error_handling: Literal["suppress", "catch", "raise"] = "suppress", - only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - url_filter: Optional[URLFilter] = None, - only_unique: bool = True, - ) -> Iterator[Article]: - """Yields articles crawled from the CC-NEWS server. - - This method provides the same functionality as the fundus standard crawler, - except this one fetches articles from the CC-News corpus. - Specify a date range from to to fetch only articles crawled in this range. - The default range is 2016/8/1 -> datetime.now(). - These dates correspond to the crawl date of the CC-News crawler, not the publishing date. - To filter on publishing dates, use the parameter and refer to the docs about filtering articles. - - Args: - start: (datetime): Earliest possible crawl date for retrieved articles. Defaults to 2016/8/1. - end: (datetime): Latest possible crawl date for retrieved articles. Defaults to datetime.now(). - max_articles (Optional[int]): Number of articles to crawl. If there are fewer articles - than max_articles the Iterator will stop before max_articles. If None, all retrievable - articles are returned. Defaults to None. - error_handling (Literal["suppress", "catch", "raise"]): Define how to handle errors - encountered during extraction. If set to "suppress", all errors will be skipped, either - with None values for respective attributes in the extraction or by skipping entire articles. - If set to "catch", errors will be caught as attribute values or, if an entire article fails, - through Article.exception. If set to "raise", all errors encountered during extraction will - be raised. Defaults to "suppress". - only_complete (Union[bool, ExtractionFilter]): Set a callable satisfying the ExtractionFilter - protocol as an extraction filter or use a boolean. If False, all articles will be yielded, - if True, only those with all attributes extracted. Defaults to ExtractionFilter letting - through all articles with at least title, body, and publishing_date set. - url_filter (Optional[URLFilter]): A callable object satisfying the URLFilter protocol to skip - URLs before download. This filter applies on both requested and responded URL. Defaults to None. - only_unique (bool): If set to True, articles yielded will be unique on the responded URL. - Always returns the first encountered article. Defaults to True. - - Returns: - Iterator[Article]: An iterator yielding objects of type Article. 
- """ - - if max_articles == 0: - return - - if max_articles is None: - max_articles = -1 - - def build_extraction_filter() -> Optional[ExtractionFilter]: - if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) - else: - return only_complete - - warc_paths = self._get_warc_paths(start, end) - response_cache: Set[str] = set() - - article_task: Callable[[str], Iterator[Article]] = partial( - self._fetch_articles, - publishers=self.publishers, - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=url_filter, - ) - - if self.processes == 0: - article_iter = self._single_crawl(warc_paths, article_task) - else: - article_iter = self._parallel_crawl(warc_paths, article_task) - - for article_idx, article in enumerate(article_iter, start=1): - if not only_unique or article.html.responded_url not in response_cache: - response_cache.add(article.html.responded_url) - yield article - if article_idx == max_articles: - break diff --git a/src/fundus/scraping/common_crawl/scraper.py b/src/fundus/scraping/common_crawl/scraper.py deleted file mode 100644 index ecae0a559..000000000 --- a/src/fundus/scraping/common_crawl/scraper.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Dict, Iterator, Literal, Optional - -from fundus.logging import basic_logger -from fundus.parser import ParserProxy -from fundus.scraping.article import Article -from fundus.scraping.common_crawl.html import CCNewsSource -from fundus.scraping.filter import ExtractionFilter, URLFilter - - -class CCNewsScraper: - def __init__(self, source: CCNewsSource): - self.source = source - self._parser_mapping: Dict[str, ParserProxy] = { - publisher.publisher_name: publisher.parser for publisher in source.publishers - } - - def scrape( - self, - error_handling: Literal["suppress", "catch", "raise"], - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, - ) -> Iterator[Article]: - # TODO: Once we decided on weather to continue fundus with async functionality or not, refactor this to - # be suitable for a BaseScraper class - for html in self.source.fetch(url_filter): - parser = self._parser_mapping[html.source.publisher] - try: - extracted = parser(html.crawl_date).parse(html.content, error_handling) - - except Exception as err: - if error_handling == "raise": - error_message = f"Run into an error processing article '{html.requested_url}'" - basic_logger.error(error_message) - err.args = (f"{err}\n\n{error_message}",) - raise err - elif error_handling == "catch": - yield Article(html=html, exception=err) - elif error_handling == "suppress": - basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}") - else: - raise ValueError(f"Unknown value '{error_handling}' for parameter '") - - else: - if extraction_filter is not None and extraction_filter(extracted): - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") - else: - article = Article.from_extracted(html=html, extracted=extracted) - yield article diff --git a/src/fundus/scraping/delay.py b/src/fundus/scraping/delay.py new file mode 100644 index 000000000..c83cd6918 --- /dev/null +++ b/src/fundus/scraping/delay.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import random +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class Delay(Protocol): + """Protocol to 
define crawl delays between batches.""" + + def __call__(self) -> float: + """Yields a float specifying the minimum crawler delay for the current article batch in seconds. + + The effective delay does include crawling execution time between batches, + i.e. the effective delay is max(execution_time, delay). + + Examples: + >>> import random + >>> delay: Delay = lambda: random.random() + Will use a random delay in [0, 1) seconds. + + Returns: + float: The delay time in seconds. + + """ + ... diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 607a88d90..ee587f503 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -1,271 +1,34 @@ -import gzip import time -import types -from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from abc import abstractmethod +from dataclasses import dataclass from datetime import datetime -from functools import cached_property -from typing import ( - AsyncIterable, - AsyncIterator, - Callable, - ClassVar, - Dict, - Iterable, - Iterator, - List, - Optional, - Union, -) - -import aiohttp -import feedparser -import lxml.html +from typing import Dict, Iterable, Iterator, List, Optional, Protocol, Union +from urllib.parse import urlparse + +import chardet +import requests import validators -from aiohttp.client_exceptions import ClientError -from aiohttp.http_exceptions import HttpProcessingError -from aiohttp.web_exceptions import HTTPError -from lxml.cssselect import CSSSelector -from lxml.etree import XPath +from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType +from requests import ConnectionError, HTTPError from fundus.logging import basic_logger -from fundus.scraping.filter import URLFilter, inverse -from fundus.utils.more_async import ManagedEventLoop, async_next, make_iterable_async +from fundus.publishers.base_objects import PublisherEnum +from fundus.scraping.delay import Delay +from fundus.scraping.filter import URLFilter +from fundus.scraping.session import _default_header __all__ = [ - "URLSource", - "RSSFeed", - "Sitemap", - "NewsMap", "HTML", + "SourceInfo", + "WarcSourceInfo", + "WebSourceInfo", "HTMLSource", - "WarcSource", "WebSource", - "FundusSource", + "CCNewsSource", ] -_default_header = {"user-agent": "Fundus"} - - -class SessionHandler: - """Object for handling project global aiohttp.ClientSessions - - The session life cycle consists of three steps which can be repeated indefinitely: - Build, Supply, Teardown. - Initially there is no session build within the session handler. When a session is requested - with get_session() either a new one is created with _session_factory() or the session handler's - existing one returned. Every subsequent call to get_session() will return the same - aiohttp.ClientSession object. If close_current_session() is called, the current session will be - tear-downed and the next call to get_session() will build a new session. - """ - - def __init__(self): - self._session: Optional[aiohttp.ClientSession] = None - - @staticmethod - async def _session_factory() -> aiohttp.ClientSession: - """Builds a new ClientSession - - This returns a new client session build from pre-defined configurations - and trace configs set. 
These trace configs are: on_request_start, on_request_end - - Returns: - An new ClientSession - """ - timings: Dict[Optional[str], float] = dict() - - async def on_request_start( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams - ): - timings[params.url.host] = time.time() - - async def on_request_end( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams - ): - assert params.url.host - history = params.response.history - previous_status_codes = [f"({response.status})" for response in history] if history else [] - status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) - basic_logger.debug( - f"{status_code_chain} <{params.method} {params.url!r}> " - f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" - ) - - async def on_request_exception( - session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams - ): - basic_logger.debug( - f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" - ) - - trace_config = aiohttp.TraceConfig() - trace_config.on_request_start.append(on_request_start) - trace_config.on_request_end.append(on_request_end) - trace_config.on_request_exception.append(on_request_exception) - - _connector = aiohttp.TCPConnector(limit=50) - async_session = aiohttp.ClientSession( - connector=_connector, trace_configs=[trace_config], timeout=aiohttp.ClientTimeout(total=30) - ) - return async_session - - async def get_session(self) -> aiohttp.ClientSession: - """Requests the current build session - - If called for the first time or after close_current_session was called, - this function will build a new session. Every subsequent call will return - the same session object until the session is closed with close_current_session(). 
- - Returns: - aiohttp.ClientSession: The current build session - """ - if not self._session: - self._session = await self._session_factory() - return self._session - - async def close_current_session(self) -> None: - """Tears down the current build session - - Returns: - None - """ - session = await self.get_session() - basic_logger.debug(f"Close session {session}") - await session.close() - self._session = None - - -session_handler = SessionHandler() - - -class _ArchiveDecompressor: - def __init__(self): - self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = {"application/x-gzip": self._decompress_gzip} - - @staticmethod - def _decompress_gzip(compressed_content: bytes) -> bytes: - decompressed_content = gzip.decompress(compressed_content) - return decompressed_content - - def decompress(self, content: bytes, file_format: "str") -> bytes: - decompress_function = self.archive_mapping[file_format] - return decompress_function(content) - - @cached_property - def supported_file_formats(self) -> List[str]: - return list(self.archive_mapping.keys()) - - -@dataclass -class URLSource(AsyncIterable[str], ABC): - url: str - - _request_header: Dict[str, str] = field(default_factory=dict) - - def __post_init__(self): - if not self._request_header: - self._request_header = _default_header - if not validators.url(self.url): - raise ValueError(f"Invalid url '{self.url}'") - - def set_header(self, request_header: Dict[str, str]) -> None: - self._request_header = request_header - - @abstractmethod - def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - pass - - async def __aiter__(self) -> AsyncIterator[str]: - async for url in self._get_pre_filtered_urls(): - yield url - - def get_urls(self, max_urls: int = -1) -> Iterator[str]: - """Returns a generator yielding up to URLs from . - - - Args: - max_urls (int): Number of max URLs to return. Set value is - an upper bound and not necessarily the actual number of - URLs. If set < 0, the source will be exhausted until - StopAsyncIteration is hit. Defaults to -1. - - Yields: - str: The next URL. - """ - async_url_gen = self.__aiter__() - counter = 0 - with ManagedEventLoop() as runner: - while True: - if counter == max_urls: - break - try: - yield runner.run_until_complete(async_next(async_url_gen)) - except StopAsyncIteration: - break - counter += 1 - - -@dataclass -class RSSFeed(URLSource): - async def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - session = await session_handler.get_session() - async with session.get(self.url, headers=self._request_header) as response: - html = await response.text() - rss_feed = feedparser.parse(html) - if exception := rss_feed.get("bozo_exception"): - basic_logger.warning(f"Warning! 
Couldn't parse rss feed '{self.url}' because of {exception}") - return - else: - for url in (entry["link"] for entry in rss_feed["entries"]): - yield url - - -@dataclass -class Sitemap(URLSource): - recursive: bool = True - reverse: bool = False - sitemap_filter: URLFilter = lambda url: not bool(url) - - _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor() - _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") - _url_selector: ClassVar[XPath] = CSSSelector("url > loc") - - async def _get_pre_filtered_urls(self) -> AsyncIterator[str]: - async def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: - session = await session_handler.get_session() - if not validators.url(sitemap_url): - basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") - async with session.get(url=sitemap_url, headers=self._request_header) as response: - try: - response.raise_for_status() - except (HTTPError, ClientError, HttpProcessingError) as error: - basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") - return - content = await response.content.read() - if response.content_type in self._decompressor.supported_file_formats: - content = self._decompressor.decompress(content, response.content_type) - if not content: - basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'") - return - tree = lxml.html.fromstring(content) - urls = [node.text_content() for node in self._url_selector(tree)] - if urls: - for new_url in reversed(urls) if self.reverse else urls: - yield new_url - elif self.recursive: - sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)] - filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs)) - for loc in reversed(filtered_locs) if self.reverse else filtered_locs: - async for new_url in yield_recursive(loc): - yield new_url - - async for url in yield_recursive(self.url): - yield url - - -@dataclass -class NewsMap(Sitemap): - pass +from fundus.scraping.session import session_handler +from fundus.scraping.url import URLSource @dataclass(frozen=True) @@ -274,93 +37,94 @@ class HTML: responded_url: str content: str crawl_date: datetime - source: "HTMLSource" + source: "SourceInfo" @dataclass(frozen=True) -class HTMLSource: +class SourceInfo: publisher: str @dataclass(frozen=True) -class WarcSource(HTMLSource): +class WarcSourceInfo(SourceInfo): warc_path: str warc_headers: Dict[str, str] http_headers: Dict[str, str] @dataclass(frozen=True) -class WebSource(HTMLSource): +class WebSourceInfo(SourceInfo): type: str url: str -class FundusSource: +class HTMLSource(Protocol): + @abstractmethod + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + ... 
+ + +class WebSource: def __init__( self, - url_source: Union[URLSource, Iterable[str]], + url_source: Iterable[str], publisher: str, url_filter: Optional[URLFilter] = None, request_header: Optional[Dict[str, str]] = None, + delay: Optional[Delay] = None, ): - self.url_source: Union[URLSource, AsyncIterator[str]] - if isinstance(url_source, URLSource): - self.url_source = url_source - else: - self.url_source = make_iterable_async(url_source) + self.url_source = url_source self.publisher = publisher self.url_filter = url_filter self.request_header = request_header or _default_header if isinstance(url_source, URLSource): url_source.set_header(self.request_header) + self.delay = delay - async def fetch(self, url_filter: Optional[URLFilter] = None) -> AsyncIterator[Optional[HTML]]: + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: combined_filters: List[URLFilter] = ([self.url_filter] if self.url_filter else []) + ( [url_filter] if url_filter else [] ) + timestamp = time.time() + def filter_url(u: str) -> bool: return any(f(u) for f in combined_filters) - async for url in self.url_source: + for url in self.url_source: if not validators.url(url): basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed") - yield None continue if filter_url(url): basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter") - yield None continue - session = await session_handler.get_session() + session = session_handler.get_session() try: - async with session.get(url, headers=self.request_header) as response: - if filter_url(str(response.url)): - basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") - yield None - continue - html = await response.text() - response.raise_for_status() - - except (HTTPError, ClientError, HttpProcessingError, UnicodeError) as error: + response = session.get(url, headers=self.request_header) + + except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") - yield None continue except Exception as error: basic_logger.warning(f"Warning! 
Skipped requested URL '{url}' because of an unexpected error {error}") - yield None continue + if filter_url(str(response.url)): + basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") + continue + html = response.text + if response.history: basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") source = ( - WebSource(self.publisher, type(self.url_source).__name__, self.url_source.url) + WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) if isinstance(self.url_source, URLSource) - else HTMLSource(self.publisher) + else SourceInfo(self.publisher) ) yield HTML( @@ -370,3 +134,88 @@ def filter_url(u: str) -> bool: crawl_date=datetime.now(), source=source, ) + + if self.delay: + time.sleep(max(0.0, self.delay() - time.time() + timestamp)) + timestamp = time.time() + + +class CCNewsSource: + def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): + self.publishers = publishers + self.warc_path = warc_path + self.headers = headers or _default_header + + self._publisher_mapping: Dict[str, PublisherEnum] = { + urlparse(publisher.domain).netloc: publisher for publisher in publishers + } + + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + def extract_content(record: WarcRecord) -> Optional[str]: + warc_body: bytes = record.reader.read() + + try: + return str(warc_body, encoding=record.http_charset) + except (UnicodeDecodeError, TypeError): + encoding: Optional[str] = chardet.detect(warc_body)["encoding"] + + if encoding is not None: + basic_logger.debug( + f"Trying to decode record {record.record_id!r} from {target_url!r} " + f"using detected encoding {encoding}." + ) + + try: + return str(warc_body, encoding=encoding) + except UnicodeDecodeError: + basic_logger.warning( + f"Couldn't decode record {record.record_id!r} from {target_url!r} with " + f"original charset {record.http_charset!r} using detected charset {encoding!r}." + ) + else: + basic_logger.warning( + f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} " + f"with invalid original charset {record.http_charset!r}." 
+ ) + + return None + + with requests.Session() as session: + stream = session.get(self.warc_path, stream=True, headers=self.headers).raw + + for warc_record in ArchiveIterator(stream, record_types=WarcRecordType.response, verify_digests=True): + target_url = str(warc_record.headers["WARC-Target-URI"]) + + if url_filter is not None and url_filter(target_url): + basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter") + continue + + publisher_domain: str = urlparse(target_url).netloc + + if publisher_domain not in self._publisher_mapping: + continue + + publisher = self._publisher_mapping[publisher_domain] + + if publisher.url_filter is not None and publisher.url_filter(target_url): + basic_logger.debug( + f"Skipped WARC record with target URI {target_url!r} because of " + f"publisher specific URL filter" + ) + continue + + if (content := extract_content(warc_record)) is None: + continue + + yield HTML( + requested_url=target_url, + responded_url=target_url, + content=content, + crawl_date=warc_record.record_date, + source=WarcSourceInfo( + publisher=publisher.publisher_name, + warc_path=self.warc_path, + warc_headers=dict(warc_record.headers), + http_headers=dict(warc_record.http_headers), + ), + ) diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index 568fb97a9..c71fe86a7 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -1,171 +1,163 @@ -import asyncio -import time +from __future__ import annotations + +import gzip +import os +import re +from abc import abstractmethod +from datetime import datetime +from functools import lru_cache, partial, wraps +from multiprocessing import Manager +from multiprocessing.context import TimeoutError +from multiprocessing.pool import MapResult, Pool, ThreadPool +from queue import Empty, Queue from typing import ( - AsyncIterator, + Any, + Callable, + Dict, + Generic, Iterator, List, Literal, Optional, - Protocol, + Pattern, Set, Tuple, Type, + TypeVar, Union, - runtime_checkable, + cast, ) -import aioitertools +import dill import more_itertools +import requests +from dateutil.rrule import MONTHLY, rrule +from tqdm import tqdm +from typing_extensions import ParamSpec from fundus import PublisherCollection -from fundus.logging import basic_logger from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article +from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter -from fundus.scraping.html import URLSource, session_handler -from fundus.scraping.scraper import Scraper -from fundus.utils.more_async import ManagedEventLoop, async_next +from fundus.scraping.html import CCNewsSource +from fundus.scraping.scraper import CCNewsScraper, Scraper +from fundus.scraping.url import URLSource +_T = TypeVar("_T") +_P = ParamSpec("_P") -@runtime_checkable -class Delay(Protocol): - """Protocol to define crawl delays between batches.""" - def __call__(self) -> float: - """Yields a float specifying the minimum crawler delay for the current article batch in seconds. +# noinspection PyPep8Naming +class dill_wrapper(Generic[_P, _T]): + def __init__(self, target: Callable[_P, _T]): + """Wraps function in dill serialization. - The effective delay does include crawling execution time between batches, - i.e. the effective delay is max(execution_time, delay). - - Examples: - >>> import random - >>> delay: Delay = lambda: random.random() - Will use a random delay in [0, 1) seconds. 
- - Returns: - float: The delay time in seconds. + This is in order to use unpickable functions within multiprocessing. + Args: + target: The function to wrap. """ - ... + self._serialized_target: bytes = dill.dumps(target) + @lru_cache + def _deserialize(self) -> Callable[_P, _T]: + return cast(Callable[_P, _T], dill.loads(self._serialized_target)) -class BaseCrawler: - def __init__(self, *scrapers: Scraper): - """Basic crawler to utilize scrapers. + def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: + return self._deserialize()(*args, **kwargs) - Because scrapers are implemented asynchronously, this class handles the necessary event loops - and program logic to download articles in batches asynchronously. - Args: - *scrapers (Scraper): The scrapers which should be used. - """ - self.scrapers: Tuple[Scraper, ...] = scrapers +def queue_wrapper(queue: Queue[_T], target: Callable[_P, Iterator[_T]]) -> Callable[_P, None]: + """Wraps the target callable to add its results to the queue instead of returning them directly. - async def crawl_async( - self, - max_articles: Optional[int] = None, - error_handling: Literal["suppress", "catch", "raise"] = "suppress", - only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - delay: Optional[Union[float, Delay]] = None, - url_filter: Optional[URLFilter] = None, - only_unique: bool = True, - ) -> AsyncIterator[Article]: - """Async variant of the crawl() method. + Args: + queue: The buffer queue. + target: A target callable. - See docstring for crawl(). for detailed information about the parameters. + Returns: + (Callable[_P, None]) The wrapped target. + """ - Args: - max_articles (Optional[int]): Number of articles to crawl. Defaults to None. - error_handling (Literal["suppress", "catch", "raise"]): Set error handling. Defaults to "suppress". - only_complete (Union[bool, ExtractionFilter]): Set extraction filters. Defaults to - Requires("title", "body", "publishing_date"). - delay (Optional[Union[float, Delay]]): Set delay time between article batches. Defaults to None. - url_filter (Optional[URLFilter]): Set URLFilter. Defaults to None. - only_unique (bool): If true return only unique responses. Defaults to True. + @wraps(target) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + for obj in target(*args, **kwargs): + queue.put(obj) - Returns: - AsyncIterator[Article]: An iterator yielding objects of type Article. - """ + return wrapper - response_cache: Set[str] = set() - def build_extraction_filter() -> Optional[ExtractionFilter]: - if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) - else: - return only_complete +def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: + """Utility function to iterate exhaustively over a pool queue. - def build_delay() -> Optional[Delay]: - if isinstance(delay, float): + The underlying iterator of this function repeatedly exhausts the given queue. + Then, if the queue is empty only if all the pool's jobs have finished, the iterator reruns. + Otherwise, it waits for the queue to be populated with the next result from the pool. - def constant_delay() -> float: - return delay # type: ignore[return-value] - - return constant_delay - else: - return delay + Args: + handle (MapResult[Any]): A handle o the MappedResult of the underling multiprocessing pool. 
+ queue (Queue[_T]): The pool queue. - def build_url_filter() -> URLFilter: - def _filter(url: str) -> bool: - return (url_filter is not None and url_filter(url)) or (only_unique and url in response_cache) + Returns: + Iterator[_T]: The iterator over the queue as it is populated. + """ + while True: + try: + yield queue.get(timeout=0.1) + except Empty: + try: + handle.get(timeout=0.1) + except TimeoutError: + continue + return - return _filter - final_delay = build_delay() +class BaseCrawler: + def __init__( + self, + pool_factory: Type[Pool], + processes: int, + args: Tuple[_T, ...], + kwargs: Dict[str, Any], + ): + self._pool_factory = pool_factory + self._args = args + self._kwargs = kwargs + self.processes = os.cpu_count() or 0 if processes == -1 else processes + + @abstractmethod + def _fetch_articles(self, *args, **kwargs) -> Iterator[Article]: + raise NotImplementedError + + @staticmethod + def _single_crawl(args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]]) -> Iterator[Article]: + for arg in args: + yield from article_task(arg) + + def _parallel_crawl( + self, args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]] + ) -> Iterator[Article]: + # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually + # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all + # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. + with Manager() as manager, self._pool_factory(processes=min(self.processes, len(args))) as pool: + article_queue: Queue[Article] = manager.Queue() - async_article_iterators: List[AsyncIterator[Optional[Article]]] = [ - scraper.scrape( - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=build_url_filter(), - ) - for scraper in self.scrapers - ] + # Because multiprocessing.Pool does not support iterators as targets, + # we wrap the article_task to write the articles to a queue instead of returning them directly. + wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - # we use this custom variant of interleave_longest in order to be able - # to delay the program flow between batches - async def _async_article_interleave_longest() -> AsyncIterator[Article]: - batches: AsyncIterator[Tuple[Optional[Article], ...]] = aioitertools.itertools.zip_longest( - *async_article_iterators - ) - start_time = time.time() - async for batch in batches: - basic_logger.debug(f"Batch took {time.time() - start_time} seconds") - for next_article in batch: - if next_article is not None: - response_cache.add(next_article.html.responded_url) - yield next_article - if final_delay: - await asyncio.sleep(max(0.0, final_delay() - time.time() + start_time)) - start_time = time.time() + # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. + serialized_article_task = dill_wrapper(wrapped_article_task) - if max_articles is None: - max_articles = -1 - elif max_articles == 0: - return - - try: - async for article_index, article in aioitertools.builtins.enumerate( - _async_article_interleave_longest(), start=1 - ): - yield article - if article_index == max_articles: - break - finally: - await session_handler.close_current_session() + # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. 
+ yield from pool_queue_iter(pool.map_async(serialized_article_task, args), article_queue) def crawl( self, max_articles: Optional[int] = None, error_handling: Literal["suppress", "catch", "raise"] = "suppress", only_complete: Union[bool, ExtractionFilter] = Requires("title", "body", "publishing_date"), - delay: Optional[Union[float, Delay]] = 0.1, url_filter: Optional[URLFilter] = None, only_unique: bool = True, ) -> Iterator[Article]: @@ -185,7 +177,7 @@ def crawl( protocol as an extraction filter or use a boolean. If False, all articles will be yielded, if True, only those with all attributes extracted. Defaults to ExtractionFilter letting through all articles with at least title, body, and publishing_date set. - delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article + delay (Optional[Union[float, fundus.scraping.delay.Delay]]): Set a delay time in seconds to be used between article batches. You can set a delay directly using float or any callable satisfying the Delay protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. @@ -198,21 +190,45 @@ def crawl( Iterator[Article]: An iterator yielding objects of type Article. """ - async_article_iter = self.crawl_async( - max_articles=max_articles, + if max_articles == 0: + return + + if max_articles is None: + max_articles = -1 + + def build_extraction_filter() -> Optional[ExtractionFilter]: + if isinstance(only_complete, bool): + return ( + None + if only_complete is False + else lambda extracted: not all( + bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() + ) + ) + else: + return only_complete + + response_cache: Set[str] = set() + + article_task: Callable[[str], Iterator[Article]] = partial( + self._fetch_articles, error_handling=error_handling, - only_complete=only_complete, - delay=delay, + extraction_filter=build_extraction_filter(), url_filter=url_filter, - only_unique=only_unique, + **self._kwargs, ) - with ManagedEventLoop() as runner: - while True: - try: - yield runner.run_until_complete(async_next(async_article_iter)) - except StopAsyncIteration: - break + if self.processes == 0: + article_iter = self._single_crawl(self._args, article_task) + else: + article_iter = self._parallel_crawl(self._args, article_task) + + for article_idx, article in enumerate(article_iter, start=1): + if not only_unique or article.html.responded_url not in response_cache: + response_cache.add(article.html.responded_url) + yield article + if article_idx == max_articles: + break class Crawler(BaseCrawler): @@ -220,6 +236,7 @@ def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, + delay: Optional[Union[float, Delay]] = 0.1, ): """Fundus base class for crawling articles from the web. @@ -235,28 +252,137 @@ def __init__( restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher specs. If set, only articles from given source types will be yielded. + delay (Optional[Union[float, Delay]]): Set a delay time in seconds to be used between article + downloads. You can set a delay directly using float or any callable satisfying the Delay + protocol. If set to None, no delay will be used between batches. See Delay for more + information. Defaults to None. 
""" if not publishers: raise ValueError("param of has to be non empty") - collapsed_publishers = more_itertools.collapse(publishers) - - # build scraper - scrapers: List[Scraper] = [] - for spec in collapsed_publishers: - if restrict_sources_to: - sources = tuple( - more_itertools.flatten(spec.source_mapping[source_type] for source_type in restrict_sources_to) - ) + collapsed_publishers = tuple(more_itertools.collapse(publishers)) + + def build_delay() -> Optional[Delay]: + if isinstance(delay, float): + + def constant_delay() -> float: + return delay # type: ignore[return-value] + + return constant_delay else: - sources = tuple(more_itertools.flatten(spec.source_mapping.values())) + return delay - if sources: - scrapers.append( - Scraper( - *sources, - parser=spec.parser, - ) - ) + super().__init__( + pool_factory=ThreadPool, + processes=len(collapsed_publishers), + args=collapsed_publishers, + kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, + ) + + @staticmethod + def _fetch_articles( + publisher: PublisherEnum, + error_handling: Literal["suppress", "catch", "raise"], + delay: Optional[Delay] = None, + restrict_sources_to: Optional[List[Type[URLSource]]] = None, + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: + scraper = Scraper(publisher, restrict_sources_to, delay) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) + + +class CCNewsCrawler(BaseCrawler): + def __init__( + self, + *publishers: PublisherEnum, + start: datetime = datetime(2016, 8, 1), + end: datetime = datetime.now(), + processes: int = -1, + server_address: str = "https://data.commoncrawl.org/", + ): + """Initializes a crawler for the CC-NEWS dataset. + + Args: + *publishers: The publishers to crawl. + processes: Number of additional process to use for crawling. + If -1, the number of processes is set to `os.cpu_count()`. + If `os.cpu_count()` is not available, the number of processes is set to 0. + If 0, only the main process is used. Defaults to -1. + server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
+ """ + + collapsed_publishers = tuple(more_itertools.collapse(publishers)) + processes = os.cpu_count() or 0 if processes == -1 else processes + warc_paths = tuple( + self._get_warc_paths(start=start, end=end, processes=processes, server_address=server_address) + ) + + super().__init__( + pool_factory=Pool, processes=processes, args=warc_paths, kwargs={"publishers": collapsed_publishers} + ) + + @staticmethod + def _fetch_articles( + warc_path: str, + publishers: Tuple[PublisherEnum, ...], + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: + source = CCNewsSource(*publishers, warc_path=warc_path) + scraper = CCNewsScraper(source) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) - super().__init__(*scrapers) + def _get_warc_paths( + self, start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" + ) -> List[str]: + # Date regex examples: https://regex101.com/r/yDX3G6/1 + date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") + + if start >= end: + raise ValueError("Start date has to be < end date.") + + if start < datetime(2016, 8, 1): + raise ValueError("The default, and earliest possible, start date is 2016/08/01.") + + if end > datetime.now(): + raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") + + date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) + urls: List[str] = [ + f"{server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence + ] + + with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: + + def load_paths(url: str) -> List[str]: + with requests.Session() as session: + paths = gzip.decompress(session.get(url).content).decode("utf-8").split() + bar.update() + return paths + + if processes == 0: + nested_warc_paths = [load_paths(url) for url in urls] + else: + # use two threads per process, default two threads per core + max_number_of_threads = processes * 2 + + with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: + nested_warc_paths = pool.map(load_paths, urls) + + warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) + + start_strf = start.strftime("%Y%m%d%H%M%S") + end_strf = end.strftime("%Y%m%d%H%M%S") + + def filter_warc_path_by_date(path: str) -> bool: + match: Optional[re.Match[str]] = date_pattern.search(path) + if match is None: + raise AssertionError(f"Invalid WARC path {path!r}") + return start_strf <= match["date"] <= end_strf + + return sorted( + (f"{server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), + reverse=True, + ) diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 004f447ce..8cde4603a 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -1,55 +1,34 @@ -from typing import AsyncIterator, Literal, Optional +from typing import Dict, Iterator, List, Literal, Optional, Type import more_itertools from fundus.logging import basic_logger from fundus.parser import ParserProxy +from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter -from fundus.scraping.html import FundusSource +from fundus.scraping.delay import Delay +from fundus.scraping.filter import ExtractionFilter, 
URLFilter +from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource +from fundus.scraping.url import URLSource -class Scraper: - def __init__(self, *sources: FundusSource, parser: ParserProxy): - self.sources = list(sources) +class BaseScraper: + def __init__(self, *sources: HTMLSource, parser_mapping: Dict[str, ParserProxy]): + self.sources = sources + self.parser_mapping = parser_mapping - if not parser: - raise ValueError(f"the given parser {type(parser).__name__} is empty") - - self.parser = parser - - async def scrape( + def scrape( self, error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter] = None, url_filter: Optional[URLFilter] = None, - ) -> AsyncIterator[Optional[Article]]: - # TODO: add docstring; especially explain why returned Article is Optional - if isinstance(extraction_filter, Requires): - supported_attributes = set( - more_itertools.flatten(collection.names for collection in self.parser.attribute_mapping.values()) - ) - if missing_attributes := extraction_filter.required_attributes - supported_attributes: - if len(missing_attributes) == 1: - basic_logger.warning( - f"The required attribute `{missing_attributes}` " - f"is not supported by {type(self.parser).__name__}. Skipping Scraper" - ) - else: - basic_logger.warning( - f"The required attributes `{', '.join(missing_attributes)}` " - f"are not supported by {type(self.parser).__name__}. Skipping Scraper" - ) + ) -> Iterator[Article]: + for source in self.sources: + for html in source.fetch(url_filter=url_filter): + parser = self.parser_mapping[html.source.publisher] - return - - for html_source in self.sources: - async for html in html_source.fetch(url_filter=url_filter): - if html is None: - yield None - continue try: - extraction = self.parser(html.crawl_date).parse(html.content, error_handling) + extraction = parser(html.crawl_date).parse(html.content, error_handling) except Exception as err: if error_handling == "raise": @@ -59,16 +38,49 @@ async def scrape( raise err elif error_handling == "catch": yield Article(html=html, exception=err) - continue elif error_handling == "suppress": basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}") - yield None else: raise ValueError(f"Unknown value '{error_handling}' for parameter '") - if extraction_filter and extraction_filter(extraction): - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") - yield None else: - article = Article.from_extracted(html=html, extracted=extraction) - yield article + if extraction_filter and extraction_filter(extraction): + basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") + else: + article = Article.from_extracted(html=html, extracted=extraction) + yield article + + +class Scraper(BaseScraper): + def __init__( + self, + publisher: PublisherEnum, + restrict_sources_to: Optional[List[Type[URLSource]]] = None, + delay: Optional[Delay] = None, + ): + if restrict_sources_to: + url_sources = tuple( + more_itertools.flatten(publisher.source_mapping[source_type] for source_type in restrict_sources_to) + ) + else: + url_sources = tuple(more_itertools.flatten(publisher.source_mapping.values())) + + html_sources = [ + WebSource( + url_source=url_source, + publisher=publisher.publisher_name, + request_header=publisher.request_header, + delay=delay, + ) + for url_source in url_sources + ] + parser_mapping = {publisher.publisher_name: publisher.parser} + super().__init__(*html_sources, 
parser_mapping=parser_mapping) + + +class CCNewsScraper(BaseScraper): + def __init__(self, source: CCNewsSource): + parser_mapping: Dict[str, ParserProxy] = { + publisher.publisher_name: publisher.parser for publisher in source.publishers + } + super().__init__(source, parser_mapping=parser_mapping) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py new file mode 100644 index 000000000..e1f010c82 --- /dev/null +++ b/src/fundus/scraping/session.py @@ -0,0 +1,108 @@ +from typing import Optional + +import requests + +from fundus.logging import basic_logger + +_default_header = {"user-agent": "Fundus"} + + +class SessionHandler: + """Object for handling project global aiohttp.ClientSessions + + The session life cycle consists of three steps which can be repeated indefinitely: + Build, Supply, Teardown. + Initially there is no session build within the session handler. When a session is requested + with get_session() either a new one is created with _session_factory() or the session handler's + existing one returned. Every subsequent call to get_session() will return the same + aiohttp.ClientSession object. If close_current_session() is called, the current session will be + tear-downed and the next call to get_session() will build a new session. + """ + + def __init__(self): + self._session: Optional[requests.Session] = None + + @staticmethod + def _session_factory() -> requests.Session: + """Builds a new Session + + This returns a new client session build from pre-defined configurations: + - pool_connections: 50 + - pool_maxsize: 50 + - hooks = {'request': lambda request:} + + Returns: + An new ClientSession + """ + + # timings: Dict[Optional[str], float] = dict() + # + # async def on_request_start( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams + # ): + # timings[params.url.host] = time.time() + # + # async def on_request_end( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams + # ): + # assert params.url.host + # history = params.response.history + # previous_status_codes = [f"({response.status})" for response in history] if history else [] + # status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) + # basic_logger.debug( + # f"{status_code_chain} <{params.method} {params.url!r}> " + # f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" + # ) + # + # async def on_request_exception( + # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams + # ): + # basic_logger.debug( + # f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" + # ) + # + # trace_config = aiohttp.TraceConfig() + # trace_config.on_request_start.append(on_request_start) + # trace_config.on_request_end.append(on_request_end) + # trace_config.on_request_exception.append(on_request_exception) + + session = requests.Session() + + # hooks + hooks = {"response": lambda response, *args, **kwargs: response.raise_for_status()} + session.hooks = hooks + + # adapters + adapter_kwargs = {"pool_connections": 50, "pool_maxsize": 50} + session.mount("http://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + session.mount("https://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + + return session + + def get_session(self) -> requests.Session: + """Requests the current build session + + If called for the first time or after 
close_current_session was called, + this function will build a new session. Every subsequent call will return + the same session object until the session is closed with close_current_session(). + + Returns: + requests.Session: The current build session + """ + if not self._session: + self._session = self._session_factory() + return self._session + + def close_current_session(self) -> None: + """Tears down the current build session + + Returns: + None + """ + session = self.get_session() + basic_logger.debug(f"Close session {session}") + session.close() + self._session = None + + +session_handler = SessionHandler() diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py new file mode 100644 index 000000000..5eb414ea7 --- /dev/null +++ b/src/fundus/scraping/url.py @@ -0,0 +1,139 @@ +import gzip +import itertools +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import cached_property +from typing import ( + AsyncIterator, + Callable, + ClassVar, + Dict, + Iterable, + Iterator, + List, + Optional, +) + +import feedparser +import lxml.html +import validators +from lxml.cssselect import CSSSelector +from lxml.etree import XPath +from requests import ConnectionError, HTTPError + +from fundus.logging import basic_logger +from fundus.scraping.filter import URLFilter, inverse +from fundus.scraping.session import _default_header, session_handler + + +class _ArchiveDecompressor: + def __init__(self): + self.archive_mapping: Dict[str, Callable[[bytes], bytes]] = {"application/x-gzip": self._decompress_gzip} + + @staticmethod + def _decompress_gzip(compressed_content: bytes) -> bytes: + decompressed_content = gzip.decompress(compressed_content) + return decompressed_content + + def decompress(self, content: bytes, file_format: "str") -> bytes: + decompress_function = self.archive_mapping[file_format] + return decompress_function(content) + + @cached_property + def supported_file_formats(self) -> List[str]: + return list(self.archive_mapping.keys()) + + +@dataclass +class URLSource(Iterable[str], ABC): + url: str + + _request_header: Dict[str, str] = field(default_factory=dict) + + def __post_init__(self): + if not self._request_header: + self._request_header = _default_header + if not validators.url(self.url): + raise ValueError(f"Invalid url '{self.url}'") + + def set_header(self, request_header: Dict[str, str]) -> None: + self._request_header = request_header + + @abstractmethod + def __iter__(self) -> Iterator[str]: + raise NotImplemented + + def get_urls(self, max_urls: Optional[int] = None) -> Iterator[str]: + """Returns a generator yielding up to URLs from . + + + Args: + max_urls (int): Number of max URLs to return. Set value is + an upper bound and not necessarily the actual number of + URLs. If set to None, the source will be exhausted until + StopIteration is hit. Defaults to None. + + Yields: + str: The next URL. + """ + return itertools.islice(self, max_urls) + + +@dataclass +class RSSFeed(URLSource): + def __iter__(self) -> Iterator[str]: + session = session_handler.get_session() + response = session.get(self.url, headers=self._request_header) + html = response.text + rss_feed = feedparser.parse(html) + if exception := rss_feed.get("bozo_exception"): + basic_logger.warning(f"Warning! 
Couldn't parse rss feed '{self.url}' because of {exception}") + return + else: + for url in (entry["link"] for entry in rss_feed["entries"]): + yield url + + +@dataclass +class Sitemap(URLSource): + recursive: bool = True + reverse: bool = False + sitemap_filter: URLFilter = lambda url: not bool(url) + + _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor() + _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") + _url_selector: ClassVar[XPath] = CSSSelector("url > loc") + + def __iter__(self) -> AsyncIterator[str]: + def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: + session = session_handler.get_session() + if not validators.url(sitemap_url): + basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") + try: + response = session.get(url=sitemap_url, headers=self._request_header) + except (HTTPError, ConnectionError) as error: + basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") + return + content = response.content + if (content_type := response.headers["content-type"]) in self._decompressor.supported_file_formats: + content = self._decompressor.decompress(content, content_type) + if not content: + basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'") + return + tree = lxml.html.fromstring(content) + urls = [node.text_content() for node in self._url_selector(tree)] + if urls: + for new_url in reversed(urls) if self.reverse else urls: + yield new_url + elif self.recursive: + sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)] + filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs)) + for loc in reversed(filtered_locs) if self.reverse else filtered_locs: + yield from yield_recursive(loc) + + yield from yield_recursive(self.url) + + +@dataclass +class NewsMap(Sitemap): + pass From 60737fc9dcba2e867a5fc2fb6fbaaeb0a2fbf20a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 16:18:02 +0100 Subject: [PATCH 02/36] bug fixes --- src/fundus/scraping/html.py | 48 ++++++++++++++++++--------------- src/fundus/scraping/pipeline.py | 18 +++++++++---- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index ee587f503..00725d12f 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -107,33 +107,39 @@ def filter_url(u: str) -> bool: except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") + if isinstance(error, HTTPError) and error.response.status_code >= 500: + return continue + except ConnectionError as error: + basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") + except Exception as error: basic_logger.warning(f"Warning! 
Skipped requested URL '{url}' because of an unexpected error {error}") continue - if filter_url(str(response.url)): - basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") - continue - html = response.text - - if response.history: - basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") - - source = ( - WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) - if isinstance(self.url_source, URLSource) - else SourceInfo(self.publisher) - ) - - yield HTML( - requested_url=url, - responded_url=str(response.url), - content=html, - crawl_date=datetime.now(), - source=source, - ) + else: + if filter_url(str(response.url)): + basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter") + continue + html = response.text + + if response.history: + basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") + + source = ( + WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) + if isinstance(self.url_source, URLSource) + else SourceInfo(self.publisher) + ) + + yield HTML( + requested_url=url, + responded_url=str(response.url), + content=html, + crawl_date=datetime.now(), + source=source, + ) if self.delay: time.sleep(max(0.0, self.delay() - time.time() + timestamp)) diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index c71fe86a7..71e3e4ee0 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -223,9 +223,11 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: else: article_iter = self._parallel_crawl(self._args, article_task) - for article_idx, article in enumerate(article_iter, start=1): + article_idx = 0 + for article in article_iter: if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) + article_idx += 1 yield article if article_idx == max_articles: break @@ -236,7 +238,8 @@ def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, - delay: Optional[Union[float, Delay]] = 0.1, + delay: Optional[Union[float, Delay]] = 1., + threading: bool = True, ): """Fundus base class for crawling articles from the web. 
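For reference, a minimal sketch of how the revised constructor is meant to be called once this change lands; the publisher selection, source restriction, and article limit are arbitrary example values, while `delay` and `threading` mirror the defaults introduced in this hunk.

````python
from fundus import Crawler, PublisherCollection
from fundus.scraping.url import NewsMap

# One thread per publisher (threading=True) with a one-second pause between
# article downloads of the same publisher (delay=1.0).
crawler = Crawler(
    PublisherCollection.us,
    restrict_sources_to=[NewsMap],
    delay=1.0,
    threading=True,
)

for article in crawler.crawl(max_articles=5):
    print(article)
````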
@@ -269,12 +272,16 @@ def constant_delay() -> float: return delay # type: ignore[return-value] return constant_delay - else: + + elif isinstance(delay, Delay): return delay + else: + raise TypeError("param of ") + super().__init__( pool_factory=ThreadPool, - processes=len(collapsed_publishers), + processes=len(collapsed_publishers) if threading else 0, args=collapsed_publishers, kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, ) @@ -334,8 +341,9 @@ def _fetch_articles( scraper = CCNewsScraper(source) yield from scraper.scrape(error_handling, extraction_filter, url_filter) + @staticmethod def _get_warc_paths( - self, start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" + start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" ) -> List[str]: # Date regex examples: https://regex101.com/r/yDX3G6/1 date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") From a5be52276a8c4ee8bff89b055ff77eaaa1ebda77 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sun, 4 Feb 2024 18:51:52 +0100 Subject: [PATCH 03/36] add request logging --- src/fundus/scraping/session.py | 44 +++++++++------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index e1f010c82..f6c43a68c 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,6 +1,6 @@ from typing import Optional -import requests +import requests.adapters from fundus.logging import basic_logger @@ -35,41 +35,19 @@ def _session_factory() -> requests.Session: An new ClientSession """ - # timings: Dict[Optional[str], float] = dict() - # - # async def on_request_start( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestStartParams - # ): - # timings[params.url.host] = time.time() - # - # async def on_request_end( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestEndParams - # ): - # assert params.url.host - # history = params.response.history - # previous_status_codes = [f"({response.status})" for response in history] if history else [] - # status_code_chain = " -> ".join(previous_status_codes + [f"({params.response.status})"]) - # basic_logger.debug( - # f"{status_code_chain} <{params.method} {params.url!r}> " - # f"took {time.time() - timings[params.url.host if not history else history[0].url.host]} second(s)" - # ) - # - # async def on_request_exception( - # session: aiohttp.ClientSession, context: types.SimpleNamespace, params: aiohttp.TraceRequestExceptionParams - # ): - # basic_logger.debug( - # f"FAILED: <{params.method} {params.url}> with {str(params.exception) or type(params.exception)}" - # ) - # - # trace_config = aiohttp.TraceConfig() - # trace_config.on_request_start.append(on_request_start) - # trace_config.on_request_end.append(on_request_end) - # trace_config.on_request_exception.append(on_request_exception) - session = requests.Session() + def _response_log(response: requests.Response, *args, **kwargs) -> None: + history = response.history + previous_status_codes = [f"({response.status_code})" for response in history] if history else [] + status_code_chain = " -> ".join(previous_status_codes + [f"({response.status_code})"]) + basic_logger.debug( + f"{status_code_chain} <{response.request.method} {response.url!r}> " + f"took {response.elapsed.total_seconds()} second(s)" + ) + # hooks - hooks = {"response": 
lambda response, *args, **kwargs: response.raise_for_status()} + hooks = {"response": [lambda response, *args, **kwargs: response.raise_for_status(), _response_log]} session.hooks = hooks # adapters From 6fc5f90468786d97656f101e901669508ff781a3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 5 Feb 2024 14:18:58 +0100 Subject: [PATCH 04/36] fix imports --- src/fundus/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index fb5e46405..7c5cd5bbe 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -4,7 +4,7 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler -from fundus.scraping.url import NewsMap, RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap, NewsMap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] @@ -15,6 +15,9 @@ "CCNewsCrawler", "PublisherCollection", "Requires", + "RSSFeed", + "Sitemap", + "NewsMap" ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, From c70b36565294b7e7507627cac1642ce11eab312d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:04:26 +0100 Subject: [PATCH 05/36] finish Pool based crawler implementation --- src/fundus/__init__.py | 8 +- src/fundus/scraping/pipeline.py | 232 +++++++++++++++------------ src/fundus/scraping/scraper.py | 2 +- src/fundus/scraping/session.py | 32 ++-- src/fundus/scraping/url.py | 4 +- tests/fixtures/fixture_collection.py | 2 +- tests/test_collection.py | 4 +- tests/test_pipeline.py | 26 ++- 8 files changed, 168 insertions(+), 142 deletions(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 7c5cd5bbe..f5e5e45ff 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,21 +3,21 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires -from fundus.scraping.pipeline import BaseCrawler, CCNewsCrawler, Crawler -from fundus.scraping.url import RSSFeed, Sitemap, NewsMap +from fundus.scraping.pipeline import CrawlerBase, Crawler, CCNewsCrawler +from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent __development_base_path__ = __module_path__.parents[1] __all__ = [ + "CrawlerBase", "Crawler", - "BaseCrawler", "CCNewsCrawler", "PublisherCollection", "Requires", "RSSFeed", "Sitemap", - "NewsMap" + "NewsMap", ] # On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/pipeline.py index 71e3e4ee0..bbb6a7bff 100644 --- a/src/fundus/scraping/pipeline.py +++ b/src/fundus/scraping/pipeline.py @@ -3,7 +3,7 @@ import gzip import os import re -from abc import abstractmethod +from abc import abstractmethod, ABC from datetime import datetime from functools import lru_cache, partial, wraps from multiprocessing import Manager @@ -13,7 +13,6 @@ from typing import ( Any, Callable, - Dict, Generic, Iterator, List, @@ -41,7 +40,7 @@ from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter from fundus.scraping.html import CCNewsSource -from fundus.scraping.scraper import CCNewsScraper, Scraper +from fundus.scraping.scraper import CCNewsScraper, WebScraper from fundus.scraping.url import URLSource _T = TypeVar("_T") @@ -112,46 +111,15 @@ def 
pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: return -class BaseCrawler: - def __init__( - self, - pool_factory: Type[Pool], - processes: int, - args: Tuple[_T, ...], - kwargs: Dict[str, Any], - ): - self._pool_factory = pool_factory - self._args = args - self._kwargs = kwargs - self.processes = os.cpu_count() or 0 if processes == -1 else processes - +class CrawlerBase(ABC): @abstractmethod - def _fetch_articles(self, *args, **kwargs) -> Iterator[Article]: - raise NotImplementedError - - @staticmethod - def _single_crawl(args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]]) -> Iterator[Article]: - for arg in args: - yield from article_task(arg) - - def _parallel_crawl( - self, args: Tuple[_T, ...], article_task: Callable[[_T], Iterator[Article]] + def _build_article_iterator( + self, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], ) -> Iterator[Article]: - # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually - # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all - # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. - with Manager() as manager, self._pool_factory(processes=min(self.processes, len(args))) as pool: - article_queue: Queue[Article] = manager.Queue() - - # Because multiprocessing.Pool does not support iterators as targets, - # we wrap the article_task to write the articles to a queue instead of returning them directly. - wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) - - # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. - serialized_article_task = dill_wrapper(wrapped_article_task) - - # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. - yield from pool_queue_iter(pool.map_async(serialized_article_task, args), article_queue) + raise NotImplementedError def crawl( self, @@ -177,10 +145,6 @@ def crawl( protocol as an extraction filter or use a boolean. If False, all articles will be yielded, if True, only those with all attributes extracted. Defaults to ExtractionFilter letting through all articles with at least title, body, and publishing_date set. - delay (Optional[Union[float, fundus.scraping.delay.Delay]]): Set a delay time in seconds to be used between article - batches. You can set a delay directly using float or any callable satisfying the Delay - protocol. If set to None, no delay will be used between batches. See Delay for more - information. Defaults to None. url_filter (Optional[URLFilter]): A callable object satisfying the URLFilter protocol to skip URLs before download. This filter applies on both requested and responded URL. Defaults to None. only_unique (bool): If set to True, articles yielded will be unique on the responded URL. 
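Taken together, the keyword arguments documented above combine as in the short sketch below; `skip_live_pages` is only an illustrative stand-in for any callable satisfying the `URLFilter` protocol, which returns `True` for URLs that should be skipped, and the publisher choice is arbitrary.

````python
from fundus import Crawler, PublisherCollection, Requires


def skip_live_pages(url: str) -> bool:
    # URLFilter protocol: True means the URL is dropped before download.
    return "live" in url


crawler = Crawler(PublisherCollection.de)

for article in crawler.crawl(
    max_articles=10,
    error_handling="suppress",
    only_complete=Requires("title", "body", "publishing_date"),
    url_filter=skip_live_pages,
    only_unique=True,
):
    print(article)
````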
@@ -210,21 +174,8 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() - article_task: Callable[[str], Iterator[Article]] = partial( - self._fetch_articles, - error_handling=error_handling, - extraction_filter=build_extraction_filter(), - url_filter=url_filter, - **self._kwargs, - ) - - if self.processes == 0: - article_iter = self._single_crawl(self._args, article_task) - else: - article_iter = self._parallel_crawl(self._args, article_task) - article_idx = 0 - for article in article_iter: + for article in self._build_article_iterator(error_handling, build_extraction_filter(), url_filter): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) article_idx += 1 @@ -233,12 +184,12 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: break -class Crawler(BaseCrawler): +class Crawler(CrawlerBase): def __init__( self, *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], restrict_sources_to: Optional[List[Type[URLSource]]] = None, - delay: Optional[Union[float, Delay]] = 1., + delay: Optional[Union[float, Delay]] = 1.0, threading: bool = True, ): """Fundus base class for crawling articles from the web. @@ -263,43 +214,78 @@ def __init__( if not publishers: raise ValueError("param of has to be non empty") - collapsed_publishers = tuple(more_itertools.collapse(publishers)) + self.publishers = tuple(more_itertools.collapse(publishers)) + self.restrict_sources_to = restrict_sources_to + self.delay = delay + self.threading = threading + + def _fetch_articles( + self, + publisher: PublisherEnum, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter] = None, + url_filter: Optional[URLFilter] = None, + ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: - if isinstance(delay, float): + if isinstance(self.delay, float): def constant_delay() -> float: - return delay # type: ignore[return-value] + return self.delay # type: ignore[return-value] return constant_delay - elif isinstance(delay, Delay): - return delay + elif isinstance(self.delay, Delay): + return self.delay else: raise TypeError("param of ") - super().__init__( - pool_factory=ThreadPool, - processes=len(collapsed_publishers) if threading else 0, - args=collapsed_publishers, - kwargs={"delay": build_delay(), "restrict_sources_to": restrict_sources_to}, - ) + scraper = WebScraper(publisher, self.restrict_sources_to, build_delay()) + yield from scraper.scrape(error_handling, extraction_filter, url_filter) @staticmethod - def _fetch_articles( - publisher: PublisherEnum, + def _single_crawl( + publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] + ) -> Iterator[Article]: + article_iterators = [article_task(publisher) for publisher in publishers] + while article_iterators: + for iterator in article_iterators: + try: + yield next(iterator) + except StopIteration: + article_iterators.remove(iterator) + + @staticmethod + def _threaded_crawl( + publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] + ) -> Iterator[Article]: + article_queue: Queue[Article] = Queue() + wrapped_article_task = queue_wrapper(article_queue, article_task) + + with ThreadPool(processes=len(publishers) or None) as pool: + yield from pool_queue_iter(pool.map_async(wrapped_article_task, publishers), article_queue) + + def _build_article_iterator( + self, error_handling: Literal["suppress", 
"catch", "raise"], - delay: Optional[Delay] = None, - restrict_sources_to: Optional[List[Type[URLSource]]] = None, - extraction_filter: Optional[ExtractionFilter] = None, - url_filter: Optional[URLFilter] = None, + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], ) -> Iterator[Article]: - scraper = Scraper(publisher, restrict_sources_to, delay) - yield from scraper.scrape(error_handling, extraction_filter, url_filter) + article_task = partial( + self._fetch_articles, + error_handling=error_handling, + extraction_filter=extraction_filter, + url_filter=url_filter, + ) + + if self.threading: + yield from self._threaded_crawl(self.publishers, article_task) + else: + yield from self._single_crawl(self.publishers, article_task) -class CCNewsCrawler(BaseCrawler): +class CCNewsCrawler(CrawlerBase): def __init__( self, *publishers: PublisherEnum, @@ -319,15 +305,11 @@ def __init__( server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. """ - collapsed_publishers = tuple(more_itertools.collapse(publishers)) - processes = os.cpu_count() or 0 if processes == -1 else processes - warc_paths = tuple( - self._get_warc_paths(start=start, end=end, processes=processes, server_address=server_address) - ) - - super().__init__( - pool_factory=Pool, processes=processes, args=warc_paths, kwargs={"publishers": collapsed_publishers} - ) + self.publishers = tuple(more_itertools.collapse(publishers)) + self.start = start + self.end = end + self.processes = os.cpu_count() or 0 if processes == -1 else processes + self.server_address = server_address @staticmethod def _fetch_articles( @@ -342,24 +324,47 @@ def _fetch_articles( yield from scraper.scrape(error_handling, extraction_filter, url_filter) @staticmethod - def _get_warc_paths( - start: datetime, end: datetime, processes: int, server_address: str = "https://data.commoncrawl.org/" - ) -> List[str]: + def _single_crawl( + warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] + ) -> Iterator[Article]: + for warc_path in warc_paths: + yield from article_task(warc_path) + + def _parallel_crawl( + self, warc_paths: Tuple[str, ...], article_task: Callable[[str], Iterator[Article]] + ) -> Iterator[Article]: + # As one could think, because we're downloading a bunch of files, this task is IO-bound, but it is actually + # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all + # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. + with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: + article_queue: Queue[Article] = manager.Queue() + + # Because multiprocessing.Pool does not support iterators as targets, + # we wrap the article_task to write the articles to a queue instead of returning them directly. + wrapped_article_task: Callable[[str], None] = queue_wrapper(article_queue, article_task) + + # To avoid restricting the article_task to use only pickleable objects, we serialize it using dill. + serialized_article_task = dill_wrapper(wrapped_article_task) + + # Finally, we build an iterator around the queue, exhausting the queue until the pool is finished. 
+ yield from pool_queue_iter(pool.map_async(serialized_article_task, warc_paths), article_queue) + + def _get_warc_paths(self) -> List[str]: # Date regex examples: https://regex101.com/r/yDX3G6/1 date_pattern: Pattern[str] = re.compile(r"CC-NEWS-(?P\d{14})-") - if start >= end: + if self.start >= self.end: raise ValueError("Start date has to be < end date.") - if start < datetime(2016, 8, 1): + if self.start < datetime(2016, 8, 1): raise ValueError("The default, and earliest possible, start date is 2016/08/01.") - if end > datetime.now(): + if self.end > datetime.now(): raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?") - date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=start, until=end)) + date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=self.start, until=self.end)) urls: List[str] = [ - f"{server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence + f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence ] with tqdm(total=len(urls), desc="Loading WARC Paths", leave=False) as bar: @@ -370,19 +375,19 @@ def load_paths(url: str) -> List[str]: bar.update() return paths - if processes == 0: + if self.processes == 0: nested_warc_paths = [load_paths(url) for url in urls] else: # use two threads per process, default two threads per core - max_number_of_threads = processes * 2 + max_number_of_threads = self.processes * 2 with ThreadPool(processes=min(len(urls), max_number_of_threads)) as pool: nested_warc_paths = pool.map(load_paths, urls) warc_paths: Iterator[str] = more_itertools.flatten(nested_warc_paths) - start_strf = start.strftime("%Y%m%d%H%M%S") - end_strf = end.strftime("%Y%m%d%H%M%S") + start_strf = self.start.strftime("%Y%m%d%H%M%S") + end_strf = self.end.strftime("%Y%m%d%H%M%S") def filter_warc_path_by_date(path: str) -> bool: match: Optional[re.Match[str]] = date_pattern.search(path) @@ -391,6 +396,27 @@ def filter_warc_path_by_date(path: str) -> bool: return start_strf <= match["date"] <= end_strf return sorted( - (f"{server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), + (f"{self.server_address}{warc_path}" for warc_path in filter(filter_warc_path_by_date, warc_paths)), reverse=True, ) + + def _build_article_iterator( + self, + error_handling: Literal["suppress", "catch", "raise"], + extraction_filter: Optional[ExtractionFilter], + url_filter: Optional[URLFilter], + ) -> Iterator[Article]: + warc_paths = tuple(self._get_warc_paths()) + + article_task = partial( + self._fetch_articles, + publishers=self.publishers, + error_handling=error_handling, + extraction_filter=extraction_filter, + url_filter=url_filter, + ) + + if self.processes == 0: + yield from self._single_crawl(warc_paths, article_task) + else: + yield from self._parallel_crawl(warc_paths, article_task) diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 8cde4603a..8f181ba2d 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -51,7 +51,7 @@ def scrape( yield article -class Scraper(BaseScraper): +class WebScraper(BaseScraper): def __init__( self, publisher: PublisherEnum, diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index f6c43a68c..8479fbc12 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -19,11 +19,12 @@ class SessionHandler: tear-downed and the next call to get_session() will build a new 
session. """ - def __init__(self): - self._session: Optional[requests.Session] = None + def __init__(self, pool_connections: int = 50, pool_maxsize: int = 50): + self.session: Optional[requests.Session] = None + self.pool_connections = pool_connections + self.pool_maxsize = pool_maxsize - @staticmethod - def _session_factory() -> requests.Session: + def _session_factory(self) -> requests.Session: """Builds a new Session This returns a new client session build from pre-defined configurations: @@ -47,13 +48,18 @@ def _response_log(response: requests.Response, *args, **kwargs) -> None: ) # hooks - hooks = {"response": [lambda response, *args, **kwargs: response.raise_for_status(), _response_log]} - session.hooks = hooks + response_hooks = [lambda response, *args, **kwargs: response.raise_for_status(), _response_log] + session.hooks["response"].extend(response_hooks) # adapters - adapter_kwargs = {"pool_connections": 50, "pool_maxsize": 50} - session.mount("http://", requests.adapters.HTTPAdapter(**adapter_kwargs)) - session.mount("https://", requests.adapters.HTTPAdapter(**adapter_kwargs)) + session.mount( + "http://", + requests.adapters.HTTPAdapter(pool_connections=self.pool_connections, pool_maxsize=self.pool_maxsize), + ) + session.mount( + "https://", + requests.adapters.HTTPAdapter(pool_connections=self.pool_connections, pool_maxsize=self.pool_maxsize), + ) return session @@ -67,9 +73,9 @@ def get_session(self) -> requests.Session: Returns: requests.Session: The current build session """ - if not self._session: - self._session = self._session_factory() - return self._session + if not self.session: + self.session = self._session_factory() + return self.session def close_current_session(self) -> None: """Tears down the current build session @@ -80,7 +86,7 @@ def close_current_session(self) -> None: session = self.get_session() basic_logger.debug(f"Close session {session}") session.close() - self._session = None + self.session = None session_handler = SessionHandler() diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 5eb414ea7..3d883b2db 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -104,8 +104,8 @@ class Sitemap(URLSource): _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") _url_selector: ClassVar[XPath] = CSSSelector("url > loc") - def __iter__(self) -> AsyncIterator[str]: - def yield_recursive(sitemap_url: str) -> AsyncIterator[str]: + def __iter__(self) -> Iterator[str]: + def yield_recursive(sitemap_url: str) -> Iterator[str]: session = session_handler.get_session() if not validators.url(sitemap_url): basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed") diff --git a/tests/fixtures/fixture_collection.py b/tests/fixtures/fixture_collection.py index d4553fd71..94598285b 100644 --- a/tests/fixtures/fixture_collection.py +++ b/tests/fixtures/fixture_collection.py @@ -72,7 +72,7 @@ class PubEnum(PublisherEnum): @pytest.fixture -def collection_with_validate_publisher_enum(publisher_enum_with_news_map): +def collection_with_valid_publisher_enum(publisher_enum_with_news_map): class CollectionWithValidatePublisherEnum(metaclass=PublisherCollectionMeta): pub = publisher_enum_with_news_map diff --git a/tests/test_collection.py b/tests/test_collection.py index a307efe34..1c2fa308a 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -11,8 +11,8 @@ def test_iter_empty_collection(self, empty_collection): def test_iter_collection_with_empty_publisher_enum(self, 
collection_with_empty_publisher_enum): assert list(collection_with_empty_publisher_enum) == [] - def test_iter_collection_with_publisher_enum(self, collection_with_validate_publisher_enum): - assert list(collection_with_validate_publisher_enum) == [collection_with_validate_publisher_enum.pub.value] + def test_iter_collection_with_publisher_enum(self, collection_with_valid_publisher_enum): + assert list(collection_with_valid_publisher_enum) == [collection_with_valid_publisher_enum.pub.value] def test_publisher_enum_with_wrong_enum_value(self): with pytest.raises(ValueError): diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 6bae9ec23..0cbfafd5b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -6,34 +6,28 @@ class TestPipeline: def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum): crawler = Crawler(collection_with_empty_publisher_enum) - assert crawler.scrapers == tuple() + assert crawler.publishers == tuple() assert next(crawler.crawl(), None) is None with pytest.raises(ValueError): Crawler(*collection_with_empty_publisher_enum) - def test_crawler_with_collection(self, collection_with_validate_publisher_enum): - crawler = Crawler(*collection_with_validate_publisher_enum) - publisher = collection_with_validate_publisher_enum.pub.value - print(crawler.scrapers) - assert len(crawler.scrapers) == 1 - assert len(crawler.scrapers[0].sources) == len( - list(value for value in publisher.source_mapping.values() if value) - ) + def test_crawler_with_collection(self, collection_with_valid_publisher_enum): + crawler = Crawler(*collection_with_valid_publisher_enum) + publisher = collection_with_valid_publisher_enum.pub.value + assert len(crawler.publishers) == 1 def test_crawler_with_publisher_enum(self, publisher_enum_with_rss_feeds, publisher_enum_with_news_map): crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map) - assert len(crawler.scrapers) == 2 + assert len(crawler.publishers) == 2 crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map, restrict_sources_to=[RSSFeed]) - assert len(crawler.scrapers) == 1 - assert crawler.scrapers[0].sources == publisher_enum_with_rss_feeds.value.source_mapping[RSSFeed] + assert len(crawler.publishers) == 2 crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map, restrict_sources_to=[NewsMap]) - assert len(crawler.scrapers) == 1 - assert crawler.scrapers[0].sources == publisher_enum_with_news_map.value.source_mapping[NewsMap] + assert len(crawler.publishers) == 2 - def test_consecutive_calls_to_crawl(self, collection_with_validate_publisher_enum): - crawler = Crawler(collection_with_validate_publisher_enum) + def test_consecutive_calls_to_crawl(self, collection_with_valid_publisher_enum): + crawler = Crawler(collection_with_valid_publisher_enum) next(crawler.crawl(max_articles=0), None) next(crawler.crawl(max_articles=0), None) From 90139b51df5522b302a478ca3cbd583e12e60657 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:10:24 +0100 Subject: [PATCH 06/36] remove async code --- pyproject.toml | 2 -- src/fundus/__init__.py | 19 ----------- src/fundus/scraping/url.py | 1 - src/fundus/utils/more_async.py | 61 ---------------------------------- 4 files changed, 83 deletions(-) delete mode 100644 src/fundus/utils/more_async.py diff --git a/pyproject.toml b/pyproject.toml index eb47ff248..a1b42a894 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,6 @@ dependencies = [ "colorama>=0.4, <1", 
"typing-extensions>=4.0, <5", "langdetect>=1.0, <2", - "aiohttp>=3.8, <4", - "aioitertools>=0.11, <1", "validators>=0.20, <1", "requests>=2.28, <3", "tqdm>=4.66, <5", diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index f5e5e45ff..f95f8b9d1 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -19,22 +19,3 @@ "Sitemap", "NewsMap", ] - -# On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times, -# Python throws an `RuntimeError: Event loop is closed exception` during Python's clean-up phase. - -# To reproduce the error run the following code: -# from fundus import Crawler, PublisherCollection -# crawler = Crawler(PublisherCollection.de.DieWelt) -# for article in crawler.crawl(max_articles=1): -# pass -# for article in crawler.crawl(max_articles=1): -# pass - -# A workaround involves to modify the event loop policy of asyncio on Windows machines. -# Unfortunately, this is a global modification. For further information see: -# https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop -if sys.platform == "win32": - import asyncio - - asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 3d883b2db..207b0b721 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -4,7 +4,6 @@ from dataclasses import dataclass, field from functools import cached_property from typing import ( - AsyncIterator, Callable, ClassVar, Dict, diff --git a/src/fundus/utils/more_async.py b/src/fundus/utils/more_async.py deleted file mode 100644 index 306fb4699..000000000 --- a/src/fundus/utils/more_async.py +++ /dev/null @@ -1,61 +0,0 @@ -import asyncio -from asyncio import AbstractEventLoop -from typing import AsyncIterator, Iterable, TypeVar, Union, overload - -_T = TypeVar("_T") -_VT = TypeVar("_VT") - - -class _Sentinel: - pass - - -__sentinel = _Sentinel() - - -@overload -async def async_next(iterator: AsyncIterator[_T]) -> _T: - ... - - -@overload -async def async_next(iterator: AsyncIterator[_T], default: Union[_VT, _Sentinel]) -> Union[_T, _VT]: - ... - - -async def async_next(iterator: AsyncIterator[_T], default: Union[_VT, _Sentinel] = __sentinel) -> Union[_T, _VT]: - task = iterator.__anext__() - try: - return await task - except StopAsyncIteration: - if not isinstance(default, _Sentinel): - return default - else: - raise StopAsyncIteration - - -async def make_iterable_async(iterable: Iterable[_T]) -> AsyncIterator[_T]: - for nxt in iterable: - yield nxt - - -class ManagedEventLoop: - def __init__(self) -> None: - self.event_loop: AbstractEventLoop - - def __enter__(self) -> AbstractEventLoop: - try: - asyncio.get_running_loop() - raise AssertionError() - except RuntimeError: - self.event_loop = asyncio.new_event_loop() - except AssertionError: - raise RuntimeError( - "There is already an event loop running. If you want to crawl articles inside an " - "async environment use crawl_async() instead." 
- ) - return self.event_loop - - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self.event_loop.run_until_complete(self.event_loop.shutdown_asyncgens()) - self.event_loop.close() From f8436c74a3148c65d16d0ce05cc90b18ef582d4e Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:11:17 +0100 Subject: [PATCH 07/36] rename pipeline.py -> crawler.py --- src/fundus/__init__.py | 2 +- src/fundus/scraping/{pipeline.py => crawler.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/fundus/scraping/{pipeline.py => crawler.py} (100%) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index f95f8b9d1..63f964a5e 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -3,7 +3,7 @@ from fundus.publishers import PublisherCollection from fundus.scraping.filter import Requires -from fundus.scraping.pipeline import CrawlerBase, Crawler, CCNewsCrawler +from fundus.scraping.crawler import CrawlerBase, Crawler, CCNewsCrawler from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent diff --git a/src/fundus/scraping/pipeline.py b/src/fundus/scraping/crawler.py similarity index 100% rename from src/fundus/scraping/pipeline.py rename to src/fundus/scraping/crawler.py From f1be26b61e213177677f9ef76ad3232f93e9e668 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:29:18 +0100 Subject: [PATCH 08/36] update documentation --- README.md | 2 +- docs/1_getting_started.md | 2 -- docs/2_crawl_from_cc_news.md | 10 +++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 15c02b605..687495d1a 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article from fundus import PublisherCollection, Crawler # initialize the crawler for Washington Times -crawler = Crawler(PublisherCollection.us.WashingtonTimes) +crawler = Crawler(PublisherCollection.us.TheNewYorker) # crawl 2 articles and print for article in crawler.crawl(max_articles=2): diff --git a/docs/1_getting_started.md b/docs/1_getting_started.md index b39ba9f05..f5abbb209 100644 --- a/docs/1_getting_started.md +++ b/docs/1_getting_started.md @@ -46,8 +46,6 @@ You can also initialize a crawler for the entire publisher collection crawler = Crawler(PublisherCollection) ```` -**_NOTE:_** To build a pipeline from low-level `Scraper` objects make use of the `BaseCrawler` class. - # How to crawl articles Now to crawl articles make use of the `crawl()` method of the initialized crawler class. diff --git a/docs/2_crawl_from_cc_news.md b/docs/2_crawl_from_cc_news.md index 0c36a17c0..a43298c65 100644 --- a/docs/2_crawl_from_cc_news.md +++ b/docs/2_crawl_from_cc_news.md @@ -1,12 +1,12 @@ # Table of Contents -* [Crawl articles from CC-NEWS](#crawl-articles-from-cc-news) +* [How to crawl articles from CC-NEWS](#how-to-crawl-articles-from-cc-news) * [The crawler](#the-crawler) * [OS start method](#os-start-method) * [Date range](#date-range) * [Multiprocessing](#multiprocessing) -# Crawl articles from CC-NEWS +# How to crawl articles from CC-NEWS This tutorial explains how to crawl articles from the [CC-NEWS](https://paperswithcode.com/dataset/cc-news) dataset using Fundus. 
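Written out against the signature this patch introduces, the workflow the following hunks document looks roughly like this; the date range, process count, and article limit are example values.

````python
from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

# The date range is now fixed at construction time; processes=4 distributes the
# WARC files over four worker processes, while processes=0 keeps everything in
# the main process (the _single_crawl branch above).
crawler = CCNewsCrawler(
    *PublisherCollection,
    start=datetime(2020, 1, 1),
    end=datetime(2020, 3, 1),
    processes=4,
)

for article in crawler.crawl(max_articles=100):
    print(article)
````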
@@ -48,8 +48,8 @@ from datetime import datetime from fundus import CCNewsCrawler, PublisherCollection -crawler = CCNewsCrawler(*PublisherCollection) -for article in crawler.crawl(start=datetime(2020, 1, 1), end=datetime(2020, 3, 1), max_articles=100): +crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1)) +for article in crawler.crawl(max_articles=100): print(article) ```` @@ -66,7 +66,7 @@ from fundus import CCNewsCrawler, PublisherCollection crawler = CCNewsCrawler(*PublisherCollection, processes=4) ```` -To omit multiprocessing, pass `0` to the `processes` parameter. +To omit multiprocessing, pass `-1` to the `processes` parameter. In the [next section](3_the_article_class.md) we will introduce you to the `Article` class. From 67fafc69375a6a598081bc37ff1777e53f2850b4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 14:32:46 +0100 Subject: [PATCH 09/36] code cleanup --- src/fundus/__init__.py | 3 +-- src/fundus/scraping/crawler.py | 2 +- src/fundus/scraping/html.py | 2 +- src/fundus/scraping/url.py | 10 +--------- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/fundus/__init__.py b/src/fundus/__init__.py index 63f964a5e..d1a4d2482 100644 --- a/src/fundus/__init__.py +++ b/src/fundus/__init__.py @@ -1,9 +1,8 @@ import pathlib -import sys from fundus.publishers import PublisherCollection +from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase from fundus.scraping.filter import Requires -from fundus.scraping.crawler import CrawlerBase, Crawler, CCNewsCrawler from fundus.scraping.url import NewsMap, RSSFeed, Sitemap __module_path__ = pathlib.Path(__file__).parent diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index bbb6a7bff..4356803c4 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -3,7 +3,7 @@ import gzip import os import re -from abc import abstractmethod, ABC +from abc import ABC, abstractmethod from datetime import datetime from functools import lru_cache, partial, wraps from multiprocessing import Manager diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 00725d12f..9f04d869c 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass from datetime import datetime -from typing import Dict, Iterable, Iterator, List, Optional, Protocol, Union +from typing import Dict, Iterable, Iterator, List, Optional, Protocol from urllib.parse import urlparse import chardet diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 207b0b721..ea830a27b 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -3,15 +3,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from functools import cached_property -from typing import ( - Callable, - ClassVar, - Dict, - Iterable, - Iterator, - List, - Optional, -) +from typing import Callable, ClassVar, Dict, Iterable, Iterator, List, Optional import feedparser import lxml.html From 8c36a71a75de78c595778558aa95eb8d38d221ad Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 15 Feb 2024 15:41:05 +0100 Subject: [PATCH 10/36] add logic to filter publisher not fulfilling extraction requirements --- src/fundus/scraping/crawler.py | 61 +++++++++++++++++---- tests/fixtures/fixture_collection.py | 25 +++++++++ tests/{test_pipeline.py => test_crawler.py} | 18 ++++++ 3 files changed, 92 insertions(+), 12 deletions(-) 
rename tests/{test_pipeline.py => test_crawler.py} (66%) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 4356803c4..9e6d8dd9e 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -32,10 +32,10 @@ import requests from dateutil.rrule import MONTHLY, rrule from tqdm import tqdm -from typing_extensions import ParamSpec +from typing_extensions import ParamSpec, TypeAlias -from fundus import PublisherCollection -from fundus.publishers.base_objects import PublisherEnum +from fundus.logging import basic_logger +from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter @@ -46,6 +46,8 @@ _T = TypeVar("_T") _P = ParamSpec("_P") +Publisher: TypeAlias = Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta] + # noinspection PyPep8Naming class dill_wrapper(Generic[_P, _T]): @@ -112,9 +114,13 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): + def __init__(self, *publishers: Publisher): + self.publishers = tuple(set(more_itertools.collapse(publishers))) + @abstractmethod def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -174,8 +180,35 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() + extraction_filter = build_extraction_filter() + fitting_publisher: List[PublisherEnum] = [] + + if isinstance(extraction_filter, Requires): + for publisher in self.publishers: + supported_attributes = set( + more_itertools.flatten( + collection.names for collection in publisher.parser.attribute_mapping.values() + ) + ) + if missing_attributes := extraction_filter.required_attributes - supported_attributes: + basic_logger.warning( + f"The required attribute(s) `{', '.join(missing_attributes)}` " + f"is(are) not supported by {publisher.publisher_name}. 
Skipping publisher" + ) + else: + fitting_publisher.append(publisher) + + if not fitting_publisher: + basic_logger.error( + f"Could not find any fitting publisher for required attributes " + f"`{', '.join(extraction_filter.required_attributes)}`" + ) + return + article_idx = 0 - for article in self._build_article_iterator(error_handling, build_extraction_filter(), url_filter): + for article in self._build_article_iterator( + tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter + ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) article_idx += 1 @@ -187,7 +220,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: class Crawler(CrawlerBase): def __init__( self, - *publishers: Union[PublisherEnum, Type[PublisherEnum], Type[PublisherCollection]], + *publishers: Publisher, restrict_sources_to: Optional[List[Type[URLSource]]] = None, delay: Optional[Union[float, Delay]] = 1.0, threading: bool = True, @@ -196,7 +229,7 @@ def __init__( Examples: >>> from fundus import PublisherCollection, Crawler - >>> crawler = Crawler(PublisherCollection) + >>> crawler = Crawler(*PublisherCollection) >>> # Crawler(PublisherCollection.us) to crawl only english news >>> for article in crawler.crawl(): >>> print(article) @@ -215,7 +248,8 @@ def __init__( if not publishers: raise ValueError("param of has to be non empty") - self.publishers = tuple(more_itertools.collapse(publishers)) + super().__init__(*publishers) + self.restrict_sources_to = restrict_sources_to self.delay = delay self.threading = threading @@ -268,6 +302,7 @@ def _threaded_crawl( def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -280,15 +315,15 @@ def _build_article_iterator( ) if self.threading: - yield from self._threaded_crawl(self.publishers, article_task) + yield from self._threaded_crawl(publishers, article_task) else: - yield from self._single_crawl(self.publishers, article_task) + yield from self._single_crawl(publishers, article_task) class CCNewsCrawler(CrawlerBase): def __init__( self, - *publishers: PublisherEnum, + *publishers: Publisher, start: datetime = datetime(2016, 8, 1), end: datetime = datetime.now(), processes: int = -1, @@ -305,7 +340,8 @@ def __init__( server_address: The CC-NEWS dataset server address. Defaults to 'https://data.commoncrawl.org/'. 
""" - self.publishers = tuple(more_itertools.collapse(publishers)) + super().__init__(*publishers) + self.start = start self.end = end self.processes = os.cpu_count() or 0 if processes == -1 else processes @@ -402,6 +438,7 @@ def filter_warc_path_by_date(path: str) -> bool: def _build_article_iterator( self, + publishers: Tuple[PublisherEnum, ...], error_handling: Literal["suppress", "catch", "raise"], extraction_filter: Optional[ExtractionFilter], url_filter: Optional[URLFilter], @@ -410,7 +447,7 @@ def _build_article_iterator( article_task = partial( self._fetch_articles, - publishers=self.publishers, + publishers=publishers, error_handling=error_handling, extraction_filter=extraction_filter, url_filter=url_filter, diff --git a/tests/fixtures/fixture_collection.py b/tests/fixtures/fixture_collection.py index 94598285b..b18879bb2 100644 --- a/tests/fixtures/fixture_collection.py +++ b/tests/fixtures/fixture_collection.py @@ -77,3 +77,28 @@ class CollectionWithValidatePublisherEnum(metaclass=PublisherCollectionMeta): pub = publisher_enum_with_news_map return CollectionWithValidatePublisherEnum + + +@pytest.fixture +def collection_with_two_valid_publisher_enum(parser_proxy_with_version): + class PubEnumNews(PublisherEnum): + news = PublisherSpec( + name="test_pub", + domain="https://test.com/", + sources=[NewsMap("https://test.com/test_newsmap")], + parser=parser_proxy_with_version, + ) + + class PubEnumSitemap(PublisherEnum): + sitemap = PublisherSpec( + name="test_pub", + domain="https://test.com/", + sources=[Sitemap("https://test.com/test_sitemap")], + parser=parser_proxy_with_version, + ) + + class CollectionWithTwoValidatePublisherEnum(metaclass=PublisherCollectionMeta): + enum_news = PubEnumNews + enum_sitemap = PubEnumSitemap + + return CollectionWithTwoValidatePublisherEnum diff --git a/tests/test_pipeline.py b/tests/test_crawler.py similarity index 66% rename from tests/test_pipeline.py rename to tests/test_crawler.py index 0cbfafd5b..53587a951 100644 --- a/tests/test_pipeline.py +++ b/tests/test_crawler.py @@ -17,6 +17,24 @@ def test_crawler_with_collection(self, collection_with_valid_publisher_enum): publisher = collection_with_valid_publisher_enum.pub.value assert len(crawler.publishers) == 1 + def test_crawler_with_two_collections( + self, + collection_with_valid_publisher_enum, + collection_with_empty_publisher_enum, + collection_with_two_valid_publisher_enum, + ): + crawler = Crawler(collection_with_empty_publisher_enum, collection_with_valid_publisher_enum) + assert len(crawler.publishers) == 1 + + crawler = Crawler(collection_with_valid_publisher_enum, collection_with_valid_publisher_enum) + assert len(crawler.publishers) == 1 + + crawler = Crawler(collection_with_two_valid_publisher_enum) + assert len(crawler.publishers) == 2 + + crawler = Crawler(collection_with_valid_publisher_enum, collection_with_two_valid_publisher_enum) + assert len(crawler.publishers) == 3 + def test_crawler_with_publisher_enum(self, publisher_enum_with_rss_feeds, publisher_enum_with_news_map): crawler = Crawler(publisher_enum_with_rss_feeds, publisher_enum_with_news_map) assert len(crawler.publishers) == 2 From 7a392723fad5eb4b2409d1d8b18bc41386703609 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Sat, 17 Feb 2024 17:32:59 +0100 Subject: [PATCH 11/36] remove duplicate error catch --- src/fundus/scraping/html.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 9f04d869c..d70bb5c8d 100644 --- a/src/fundus/scraping/html.py +++ 
b/src/fundus/scraping/html.py @@ -111,9 +111,6 @@ def filter_url(u: str) -> bool: return continue - except ConnectionError as error: - basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") - except Exception as error: basic_logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}") continue From 082b1d81ba0110791b6dd5452465b495aefa8968 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Fri, 8 Mar 2024 19:38:45 +0100 Subject: [PATCH 12/36] limit queue size --- src/fundus/scraping/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 9e6d8dd9e..327e027e4 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -294,7 +294,7 @@ def _single_crawl( def _threaded_crawl( publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] ) -> Iterator[Article]: - article_queue: Queue[Article] = Queue() + article_queue: Queue[Article] = Queue(len(publishers)) wrapped_article_task = queue_wrapper(article_queue, article_task) with ThreadPool(processes=len(publishers) or None) as pool: @@ -373,7 +373,7 @@ def _parallel_crawl( # process-bound. The reason is that we stream the data and process it on the fly rather than downloading all # files and processing them afterward. Therefore, we utilize multiprocessing here instead of multithreading. with Manager() as manager, Pool(processes=min(self.processes, len(warc_paths))) as pool: - article_queue: Queue[Article] = manager.Queue() + article_queue: Queue[Article] = manager.Queue(maxsize=1000) # Because multiprocessing.Pool does not support iterators as targets, # we wrap the article_task to write the articles to a queue instead of returning them directly. 
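For readers skimming the two hunks above: the bounded queues only make sense together with the `queue_wrapper`/`pool_queue_iter` helpers that `crawler.py` already defines. The following self-contained sketch (illustrative names only, not the Fundus implementation) shows the underlying pattern: each worker's generator is wrapped so it pushes results into a shared, bounded queue, and the main thread drains that queue while `map_async` is still running. The `maxsize` given to the queue is what provides back-pressure on the producers, which is why these hunks cap the queue sizes.

````python
from multiprocessing.pool import ThreadPool
from queue import Empty, Queue
from typing import Callable, Iterator, List, TypeVar

_T = TypeVar("_T")


def queue_wrapper(queue: "Queue[_T]", task: Callable[[int], Iterator[_T]]) -> Callable[[int], None]:
    """Wrap a generator task so it writes its items into <queue> instead of returning them."""

    def wrapper(arg: int) -> None:
        for item in task(arg):
            queue.put(item)  # blocks once the queue is full -> back-pressure on the producer

    return wrapper


def pool_queue_iter(handle, queue: "Queue[_T]") -> Iterator[_T]:
    """Yield from <queue> until every worker has finished and the queue is drained."""
    while True:
        try:
            yield queue.get(timeout=0.1)
        except Empty:
            if handle.ready():  # all workers returned
                while not queue.empty():  # drain anything added after the last timeout
                    yield queue.get_nowait()
                break


def fake_article_source(publisher_id: int) -> Iterator[str]:
    # stand-in for fetching and parsing the articles of one publisher
    for index in range(3):
        yield f"publisher-{publisher_id} article-{index}"


if __name__ == "__main__":
    publishers: List[int] = [0, 1, 2]
    article_queue: "Queue[str]" = Queue(maxsize=len(publishers))  # bounded, as in the patch
    wrapped_task = queue_wrapper(article_queue, fake_article_source)

    with ThreadPool(processes=len(publishers)) as pool:
        handle = pool.map_async(wrapped_task, publishers)
        for article in pool_queue_iter(handle, article_queue):
            print(article)
````

The `CCNewsCrawler` applies the same idea with a `multiprocessing.Manager().Queue(maxsize=1000)`, so the streaming WARC workers cannot run arbitrarily far ahead of the consuming iterator.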
From afdda0a0f9d8a7e4df4f339cdcf557e9df8cd8b3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 25 Mar 2024 14:35:49 +0100 Subject: [PATCH 13/36] finish merge --- scripts/generate_parser_test_files.py | 16 ++++++++-------- src/fundus/scraping/crawler.py | 10 ++-------- src/fundus/scraping/scraper.py | 10 ++++++++-- tests/utility.py | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 83ab0e0ca..8c24b9453 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -5,25 +5,25 @@ from tqdm import tqdm -from fundus import BaseCrawler, Crawler, PublisherCollection +from fundus import Crawler, PublisherCollection from fundus.logging import basic_logger from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.html import FundusSource -from fundus.scraping.scraper import Scraper +from fundus.scraping.filter import RequiresAll +from fundus.scraping.html import WebSource +from fundus.scraping.scraper import BaseScraper, WebScraper from tests.test_parser import attributes_required_to_cover from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]: - crawler: BaseCrawler if url is None: crawler = Crawler(enum) + return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) else: - source = FundusSource([url], publisher=enum.publisher_name) - scraper = Scraper(source, parser=enum.parser) - crawler = BaseCrawler(scraper) - return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) + source = WebSource([url], publisher=enum.publisher_name) + scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) + return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll())) def parse_arguments() -> Namespace: diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 327e027e4..a0fb75aaa 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -38,7 +38,7 @@ from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay -from fundus.scraping.filter import ExtractionFilter, Requires, URLFilter +from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource from fundus.scraping.scraper import CCNewsScraper, WebScraper from fundus.scraping.url import URLSource @@ -168,13 +168,7 @@ def crawl( def build_extraction_filter() -> Optional[ExtractionFilter]: if isinstance(only_complete, bool): - return ( - None - if only_complete is False - else lambda extracted: not all( - bool(v) if not isinstance(v, Exception) else False for _, v in extracted.items() - ) - ) + return None if only_complete is False else RequiresAll() else: return only_complete diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 84965e437..57ceb3860 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -7,7 +7,11 @@ from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article from fundus.scraping.delay import Delay -from fundus.scraping.filter import ExtractionFilter, 
FilterResultWithMissingAttributes, URLFilter +from fundus.scraping.filter import ( + ExtractionFilter, + FilterResultWithMissingAttributes, + URLFilter, +) from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource from fundus.scraping.url import URLSource @@ -51,7 +55,9 @@ def scrape( f"{', '.join(filter_result.missing_attributes)!r} is(are) missing" ) else: - basic_logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter") + basic_logger.debug( + f"Skipped article at '{html.requested_url}' because of extraction filter" + ) else: article = Article.from_extracted(html=html, extracted=extraction) yield article diff --git a/tests/utility.py b/tests/utility.py index 91f343b43..22378c4b2 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -11,7 +11,7 @@ from fundus.parser import BaseParser from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article -from fundus.scraping.html import HTML, HTMLSource +from fundus.scraping.html import HTML, SourceInfo from scripts.generate_tables import supported_publishers_markdown_path from tests.resources.parser.test_data import __module_path__ as test_resource_path @@ -28,7 +28,7 @@ def get_test_articles(publisher: PublisherEnum) -> List[Article]: crawl_date=html_test_file.crawl_date, requested_url=html_test_file.url, responded_url=html_test_file.url, - source=HTMLSource(publisher.publisher_name), + source=SourceInfo(publisher.publisher_name), ) article = Article.from_extracted(extracted=extraction, html=html) articles.append(article) From 6574ac3bfbe9c2ca8bd7ab0db89805faea960c4a Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:00:13 +0200 Subject: [PATCH 14/36] Apply suggestions from code review Co-authored-by: Adrian Breiding --- src/fundus/scraping/crawler.py | 7 ++++--- src/fundus/scraping/scraper.py | 2 +- src/fundus/scraping/session.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index a0fb75aaa..add008291 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -224,12 +224,12 @@ def __init__( Examples: >>> from fundus import PublisherCollection, Crawler >>> crawler = Crawler(*PublisherCollection) - >>> # Crawler(PublisherCollection.us) to crawl only english news + >>> # Crawler(PublisherCollection.us) to crawl only american news >>> for article in crawler.crawl(): >>> print(article) Args: - *publishers (Union[PublisherEnum, Type[PublisherEnum]]): The publishers to crawl. + *publishers (Union[PublisherEnum, Type[PublisherEnum], PublisherCollectionMeta]): The publishers to crawl. restrict_sources_to (Optional[List[Type[URLSource]]]): Lets you restrict sources defined in the publisher specs. If set, only articles from given source types will be yielded. 
@@ -257,9 +257,10 @@ def _fetch_articles( ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: if isinstance(self.delay, float): + delay = self.delay def constant_delay() -> float: - return self.delay # type: ignore[return-value] + return delay return constant_delay diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index 57ceb3860..f8753280d 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -86,7 +86,7 @@ def __init__( ) for url_source in url_sources ] - parser_mapping = {publisher.publisher_name: publisher.parser} + parser_mapping: Dict[str, ParserProxy] = {publisher.publisher_name: publisher.parser} super().__init__(*html_sources, parser_mapping=parser_mapping) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 8479fbc12..8df837994 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -33,7 +33,7 @@ def _session_factory(self) -> requests.Session: - hooks = {'request': lambda request:} Returns: - An new ClientSession + A new requests.Session """ session = requests.Session() From 10d1a560228cc3f7379ee2ad956a825acc48f52d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:06:48 +0200 Subject: [PATCH 15/36] fix indentation --- src/fundus/scraping/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index add008291..d64f58445 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -257,7 +257,7 @@ def _fetch_articles( ) -> Iterator[Article]: def build_delay() -> Optional[Delay]: if isinstance(self.delay, float): - delay = self.delay + delay = self.delay def constant_delay() -> float: return delay From 4cfd4313531f175f414ea6d13397ca8552b3442d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:09:08 +0200 Subject: [PATCH 16/36] clean imports in documentation --- docs/4_how_to_filter_articles.md | 3 +-- docs/5_how_to_search_for_publishers.md | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/4_how_to_filter_articles.md b/docs/4_how_to_filter_articles.md index 786e6a387..d6233d18f 100644 --- a/docs/4_how_to_filter_articles.md +++ b/docs/4_how_to_filter_articles.md @@ -184,8 +184,7 @@ You can preselect the source for your articles when initializing a new `Crawler` Let's initiate a crawler who only crawls from `NewsMaps`'s. ````python -from fundus import Crawler, PublisherCollection -from fundus.scraping.url import NewsMap +from fundus import Crawler, PublisherCollection, NewsMap crawler = Crawler(PublisherCollection.us, restrict_sources_to=[NewsMap]) ```` diff --git a/docs/5_how_to_search_for_publishers.md b/docs/5_how_to_search_for_publishers.md index 3d09bb5f5..bbbbf79ee 100644 --- a/docs/5_how_to_search_for_publishers.md +++ b/docs/5_how_to_search_for_publishers.md @@ -15,8 +15,7 @@ You can search through the collection to get only publishers fitting your use ca Let's get some publishers based in the US, supporting an attribute called `topics` and `NewsMap` as a source, and use them to initialize a crawler afterward. 
````python -from fundus import Crawler, PublisherCollection -from fundus.scraping.url import NewsMap +from fundus import Crawler, PublisherCollection, NewsMap fitting_publishers = PublisherCollection.us.search(attributes=["topics"], source_types=[NewsMap]) crawler = Crawler(fitting_publishers) From 0c57fc1f7f060fe88624728005665a57fbb85049 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:12:43 +0200 Subject: [PATCH 17/36] add `None` as default to `next` for test file generation --- scripts/generate_parser_test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 8c24b9453..adbf2e773 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -23,7 +23,7 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional else: source = WebSource([url], publisher=enum.publisher_name) scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) - return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll())) + return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None) def parse_arguments() -> Namespace: From 53550fd1739279d8b4e3963062622be73bd3d0f3 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:22:44 +0200 Subject: [PATCH 18/36] add `tmp` variable for secure iteration --- src/fundus/scraping/crawler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index d64f58445..e483a6c19 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -199,15 +199,15 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: ) return - article_idx = 0 + article_count = 0 for article in self._build_article_iterator( tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) - article_idx += 1 + article_count += 1 yield article - if article_idx == max_articles: + if article_count == max_articles: break @@ -237,6 +237,9 @@ def __init__( downloads. You can set a delay directly using float or any callable satisfying the Delay protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. + threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False, + the crawler will use a single thread for a publishers and load articles succesively. This will greatly + influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. 
""" if not publishers: @@ -279,7 +282,8 @@ def _single_crawl( ) -> Iterator[Article]: article_iterators = [article_task(publisher) for publisher in publishers] while article_iterators: - for iterator in article_iterators: + tmp = article_iterators + for iterator in tmp: try: yield next(iterator) except StopIteration: From 0dba3632849435c18105552ca4674d9f54ee1a6f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:30:55 +0200 Subject: [PATCH 19/36] adjust docstrings --- src/fundus/scraping/session.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 8df837994..327ed01b8 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -8,14 +8,14 @@ class SessionHandler: - """Object for handling project global aiohttp.ClientSessions + """Object for handling project global request.Session The session life cycle consists of three steps which can be repeated indefinitely: Build, Supply, Teardown. Initially there is no session build within the session handler. When a session is requested with get_session() either a new one is created with _session_factory() or the session handler's existing one returned. Every subsequent call to get_session() will return the same - aiohttp.ClientSession object. If close_current_session() is called, the current session will be + response.Session object. If close_current_session() is called, the current session will be tear-downed and the next call to get_session() will build a new session. """ @@ -30,12 +30,13 @@ def _session_factory(self) -> requests.Session: This returns a new client session build from pre-defined configurations: - pool_connections: 50 - pool_maxsize: 50 - - hooks = {'request': lambda request:} + - hooks = {'response': raise_for_status(), _response_log():} Returns: A new requests.Session """ + basic_logger.debug("Creating new session") session = requests.Session() def _response_log(response: requests.Response, *args, **kwargs) -> None: From 54974a8a9a3f23f609ba9d903bf0281d79ce7fbd Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:31:26 +0200 Subject: [PATCH 20/36] close session after crawler is being used --- src/fundus/scraping/crawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index e483a6c19..5f9253426 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -41,6 +41,7 @@ from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter from fundus.scraping.html import CCNewsSource from fundus.scraping.scraper import CCNewsScraper, WebScraper +from fundus.scraping.session import session_handler from fundus.scraping.url import URLSource _T = TypeVar("_T") @@ -210,6 +211,8 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: if article_count == max_articles: break + session_handler.close_current_session() + class Crawler(CrawlerBase): def __init__( From 06a61d85961dcad8016011d139402fa37684d665 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:33:43 +0200 Subject: [PATCH 21/36] apply 4857a1c to branch --- src/fundus/scraping/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index ea830a27b..6b9231af2 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -45,7 +45,7 @@ def __post_init__(self): if not self._request_header: self._request_header 
= _default_header if not validators.url(self.url): - raise ValueError(f"Invalid url '{self.url}'") + basic_logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}") def set_header(self, request_header: Dict[str, str]) -> None: self._request_header = request_header From 940bc976d496a91c69975c841e6ab64b6feea10d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 4 Apr 2024 14:41:55 +0200 Subject: [PATCH 22/36] add log message if skipping entire publisher due to server errors --- src/fundus/scraping/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index d70bb5c8d..028ad3e72 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -108,7 +108,7 @@ def filter_url(u: str) -> bool: except (HTTPError, ConnectionError) as error: basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'") if isinstance(error, HTTPError) and error.response.status_code >= 500: - return + basic_logger.info(f"Skipped {self.publisher} due to server errors: '{error}'") continue except Exception as error: From 6353352aa68b315343dd6b7682b92e2e01858c63 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Fri, 5 Apr 2024 15:31:04 +0200 Subject: [PATCH 23/36] change log level for error message in test case script to error --- scripts/generate_parser_test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index adbf2e773..de8c10783 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -106,7 +106,7 @@ def main() -> None: if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version): if not (article := get_test_article(publisher, url)): - basic_logger.warning(f"Couldn't get article for {publisher.name}. Skipping") + basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping") continue html = HTMLTestFile( url=article.html.responded_url, From afa4c462f663d0925ae298594dd8d2e0b02cf34f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 13:58:41 +0200 Subject: [PATCH 24/36] Apply suggestions from code review Co-authored-by: Conrad Dobberstein --- src/fundus/scraping/crawler.py | 4 ++-- src/fundus/scraping/session.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 5f9253426..078a62cea 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -195,7 +195,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: if not fitting_publisher: basic_logger.error( - f"Could not find any fitting publisher for required attributes " + f"Could not find any fitting publishers for required attributes " f"`{', '.join(extraction_filter.required_attributes)}`" ) return @@ -241,7 +241,7 @@ def __init__( protocol. If set to None, no delay will be used between batches. See Delay for more information. Defaults to None. threading (bool): If True, the crawler will use a dedicated thread per publisher, if set to False, - the crawler will use a single thread for a publishers and load articles succesively. This will greatly + the crawler will use a single thread for all publishers and load articles successively. This will greatly influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. 
""" diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 327ed01b8..440574cde 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -8,7 +8,7 @@ class SessionHandler: - """Object for handling project global request.Session + """Object for handling project global request.Session The session life cycle consists of three steps which can be repeated indefinitely: Build, Supply, Teardown. From 153c0ebf27ec3065c657d100ad172256e523526c Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:56:39 +0200 Subject: [PATCH 25/36] fix README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 687495d1a..a092f0f74 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article ```python from fundus import PublisherCollection, Crawler -# initialize the crawler for Washington Times +# initialize the crawler for The New Yorker crawler = Crawler(PublisherCollection.us.TheNewYorker) # crawl 2 articles and print From f90c52e7496e27dd0994a0bf378dbcd63ccf026b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:56:58 +0200 Subject: [PATCH 26/36] remove leftover in `HTMLSource` --- src/fundus/scraping/html.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 028ad3e72..52497bbf2 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -59,9 +59,7 @@ class WebSourceInfo(SourceInfo): class HTMLSource(Protocol): - @abstractmethod - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: - ... + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ... class WebSource: From f9ba37f89a09b73be188f5fd347a8224aa63a649 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:57:47 +0200 Subject: [PATCH 27/36] add contextmanager to session handler and some more minor fixes --- src/fundus/scraping/crawler.py | 30 ++++++++++++------------- src/fundus/scraping/session.py | 41 ++++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 078a62cea..18e952710 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -31,6 +31,7 @@ import more_itertools import requests from dateutil.rrule import MONTHLY, rrule +from more_itertools import roundrobin from tqdm import tqdm from typing_extensions import ParamSpec, TypeAlias @@ -116,7 +117,11 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): def __init__(self, *publishers: Publisher): - self.publishers = tuple(set(more_itertools.collapse(publishers))) + + if not publishers: + raise ValueError("param of has to be non empty") + + self.publishers: List[PublisherEnum] = list(set(more_itertools.collapse(publishers))) @abstractmethod def _build_article_iterator( @@ -176,7 +181,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: response_cache: Set[str] = set() extraction_filter = build_extraction_filter() - fitting_publisher: List[PublisherEnum] = [] + fitting_publishers: List[PublisherEnum] = [] if isinstance(extraction_filter, Requires): for publisher in self.publishers: @@ -191,18 +196,20 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: f"is(are) not supported by {publisher.publisher_name}. 
Skipping publisher" ) else: - fitting_publisher.append(publisher) + fitting_publishers.append(publisher) - if not fitting_publisher: + if not fitting_publishers: basic_logger.error( f"Could not find any fitting publishers for required attributes " f"`{', '.join(extraction_filter.required_attributes)}`" ) return + else: + fitting_publishers = self.publishers article_count = 0 for article in self._build_article_iterator( - tuple(fitting_publisher or self.publishers), error_handling, build_extraction_filter(), url_filter + tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) @@ -245,9 +252,6 @@ def __init__( influence performance, and it is highly recommended to use a threaded crawler. Deafults to True. """ - if not publishers: - raise ValueError("param of has to be non empty") - super().__init__(*publishers) self.restrict_sources_to = restrict_sources_to @@ -284,13 +288,7 @@ def _single_crawl( publishers: Tuple[PublisherEnum, ...], article_task: Callable[[PublisherEnum], Iterator[Article]] ) -> Iterator[Article]: article_iterators = [article_task(publisher) for publisher in publishers] - while article_iterators: - tmp = article_iterators - for iterator in tmp: - try: - yield next(iterator) - except StopIteration: - article_iterators.remove(iterator) + yield from roundrobin(*article_iterators) @staticmethod def _threaded_crawl( @@ -299,7 +297,7 @@ def _threaded_crawl( article_queue: Queue[Article] = Queue(len(publishers)) wrapped_article_task = queue_wrapper(article_queue, article_task) - with ThreadPool(processes=len(publishers) or None) as pool: + with ThreadPool(processes=len(publishers) or None) as pool, session_handler.context(len(publishers), 1): yield from pool_queue_iter(pool.map_async(wrapped_article_task, publishers), article_queue) def _build_article_iterator( diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 440574cde..d746cfe67 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,6 +1,8 @@ -from typing import Optional +from contextlib import contextmanager +from typing import Iterator, Optional import requests.adapters +from typing_extensions import Self from fundus.logging import basic_logger @@ -19,7 +21,7 @@ class SessionHandler: tear-downed and the next call to get_session() will build a new session. 
""" - def __init__(self, pool_connections: int = 50, pool_maxsize: int = 50): + def __init__(self, pool_connections: int = 50, pool_maxsize: int = 1): self.session: Optional[requests.Session] = None self.pool_connections = pool_connections self.pool_maxsize = pool_maxsize @@ -30,7 +32,7 @@ def _session_factory(self) -> requests.Session: This returns a new client session build from pre-defined configurations: - pool_connections: 50 - pool_maxsize: 50 - - hooks = {'response': raise_for_status(), _response_log():} + - hooks = {'response': raise_for_status(), _response_log()} Returns: A new requests.Session @@ -84,10 +86,35 @@ def close_current_session(self) -> None: Returns: None """ - session = self.get_session() - basic_logger.debug(f"Close session {session}") - session.close() - self.session = None + if self.session is not None: + session = self.get_session() + basic_logger.debug(f"Close session {session}") + session.close() + self.session = None + + @contextmanager + def context(self, pool_connections: int, pool_maxsize: int) -> Self: + """Context manager to temporarily overwrite parameter and build new session. + + Args: + pool_connections: see requests.Session documentation. + pool_maxsize: see requests.Session documentation. + + Returns: + SessionHandler: The session handler instance. + """ + previous_pool_connections = self.pool_connections + previous_pool_maxsize = self.pool_maxsize + + self.close_current_session() + + try: + self.pool_connections = pool_connections + self.pool_maxsize = pool_maxsize + yield self + finally: + self.pool_connections = previous_pool_connections + self.pool_maxsize = previous_pool_maxsize session_handler = SessionHandler() From 23269fa1f2c297669e596c7a42a4da47faf83fe9 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 19:58:56 +0200 Subject: [PATCH 28/36] rename `HTML.source` -> `HTML.source_info` --- src/fundus/scraping/article.py | 2 +- src/fundus/scraping/html.py | 8 ++++---- src/fundus/scraping/scraper.py | 2 +- tests/utility.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/fundus/scraping/article.py b/src/fundus/scraping/article.py index e8e6a2629..77eb9d770 100644 --- a/src/fundus/scraping/article.py +++ b/src/fundus/scraping/article.py @@ -83,7 +83,7 @@ def __str__(self): f'\n- Title: "{wrapped_title}"' f'\n- Text: "{wrapped_plaintext}"' f"\n- URL: {self.html.requested_url}" - f"\n- From: {self.html.source.publisher}" + f"\n- From: {self.html.source_info.publisher}" f'{" (" + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}' ) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 52497bbf2..40ec3510f 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -37,7 +37,7 @@ class HTML: responded_url: str content: str crawl_date: datetime - source: "SourceInfo" + source_info: "SourceInfo" @dataclass(frozen=True) @@ -122,7 +122,7 @@ def filter_url(u: str) -> bool: if response.history: basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}") - source = ( + source_info = ( WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url) if isinstance(self.url_source, URLSource) else SourceInfo(self.publisher) @@ -133,7 +133,7 @@ def filter_url(u: str) -> bool: responded_url=str(response.url), content=html, crawl_date=datetime.now(), - source=source, + source_info=source_info, ) if self.delay: @@ -213,7 +213,7 @@ def extract_content(record: WarcRecord) -> Optional[str]: 
responded_url=target_url, content=content, crawl_date=warc_record.record_date, - source=WarcSourceInfo( + source_info=WarcSourceInfo( publisher=publisher.publisher_name, warc_path=self.warc_path, warc_headers=dict(warc_record.headers), diff --git a/src/fundus/scraping/scraper.py b/src/fundus/scraping/scraper.py index f8753280d..b7f8f77ea 100644 --- a/src/fundus/scraping/scraper.py +++ b/src/fundus/scraping/scraper.py @@ -29,7 +29,7 @@ def scrape( ) -> Iterator[Article]: for source in self.sources: for html in source.fetch(url_filter=url_filter): - parser = self.parser_mapping[html.source.publisher] + parser = self.parser_mapping[html.source_info.publisher] try: extraction = parser(html.crawl_date).parse(html.content, error_handling) diff --git a/tests/utility.py b/tests/utility.py index 22378c4b2..42e9097b7 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -28,7 +28,7 @@ def get_test_articles(publisher: PublisherEnum) -> List[Article]: crawl_date=html_test_file.crawl_date, requested_url=html_test_file.url, responded_url=html_test_file.url, - source=SourceInfo(publisher.publisher_name), + source_info=SourceInfo(publisher.publisher_name), ) article = Article.from_extracted(extracted=extraction, html=html) articles.append(article) From aaf0f57288b6e18147560afcaf0e04930813d018 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 20:00:52 +0200 Subject: [PATCH 29/36] fix types + lint --- src/fundus/scraping/crawler.py | 3 +-- src/fundus/scraping/html.py | 3 ++- src/fundus/scraping/session.py | 2 +- tests/test_crawler.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/crawler.py b/src/fundus/scraping/crawler.py index 18e952710..890c0bfc5 100644 --- a/src/fundus/scraping/crawler.py +++ b/src/fundus/scraping/crawler.py @@ -117,7 +117,6 @@ def pool_queue_iter(handle: MapResult[Any], queue: Queue[_T]) -> Iterator[_T]: class CrawlerBase(ABC): def __init__(self, *publishers: Publisher): - if not publishers: raise ValueError("param of has to be non empty") @@ -209,7 +208,7 @@ def build_extraction_filter() -> Optional[ExtractionFilter]: article_count = 0 for article in self._build_article_iterator( - tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter + tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter ): if not only_unique or article.html.responded_url not in response_cache: response_cache.add(article.html.responded_url) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 40ec3510f..245135302 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -59,7 +59,8 @@ class WebSourceInfo(SourceInfo): class HTMLSource(Protocol): - def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ... + def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: + ... class WebSource: diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index d746cfe67..36630cf4f 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -93,7 +93,7 @@ def close_current_session(self) -> None: self.session = None @contextmanager - def context(self, pool_connections: int, pool_maxsize: int) -> Self: + def context(self, pool_connections: int, pool_maxsize: int) -> Iterator[Self]: """Context manager to temporarily overwrite parameter and build new session. 
Args: diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 53587a951..20e8e1fe3 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -6,7 +6,7 @@ class TestPipeline: def test_crawler_with_empty_collection(self, collection_with_empty_publisher_enum): crawler = Crawler(collection_with_empty_publisher_enum) - assert crawler.publishers == tuple() + assert crawler.publishers == list() assert next(crawler.crawl(), None) is None with pytest.raises(ValueError): From f1b9a082681e9b3cb9a7b151a8d2aa645ba5cdc4 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 15 Apr 2024 20:01:54 +0200 Subject: [PATCH 30/36] fix imports after merge --- src/fundus/publishers/lt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/publishers/lt/__init__.py b/src/fundus/publishers/lt/__init__.py index 3fc3c7a2a..3fe649e38 100644 --- a/src/fundus/publishers/lt/__init__.py +++ b/src/fundus/publishers/lt/__init__.py @@ -1,5 +1,5 @@ from fundus.publishers.base_objects import PublisherEnum, PublisherSpec -from fundus.scraping.html import RSSFeed, Sitemap +from fundus.scraping.url import RSSFeed, Sitemap from .lrt import LRTParser From 053c9fef1bbc05b6dc4f24895faf158dd28d45e5 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:37:15 +0200 Subject: [PATCH 31/36] fix bug leading to a potential `KeyError` in `Sitemap` --- src/fundus/scraping/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fundus/scraping/url.py b/src/fundus/scraping/url.py index 6b9231af2..3db8e750a 100644 --- a/src/fundus/scraping/url.py +++ b/src/fundus/scraping/url.py @@ -106,7 +106,7 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]: basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}") return content = response.content - if (content_type := response.headers["content-type"]) in self._decompressor.supported_file_formats: + if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats: content = self._decompressor.decompress(content, content_type) if not content: basic_logger.warning(f"Warning! 
Empty sitemap at '{sitemap_url}'") From 484d0c03fcb0fde4af130f8e99abc3cb7afc623f Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:37:41 +0200 Subject: [PATCH 32/36] fix crawler delay for `WebSource` --- src/fundus/scraping/html.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 245135302..98cdb5927 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -85,12 +85,16 @@ def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: [url_filter] if url_filter else [] ) - timestamp = time.time() + timestamp = time.time() + self.delay() if self.delay is not None else time.time() def filter_url(u: str) -> bool: return any(f(u) for f in combined_filters) for url in self.url_source: + if self.delay: + time.sleep(max(0.0, self.delay() - time.time() + timestamp)) + timestamp = time.time() + if not validators.url(url): basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed") continue @@ -137,10 +141,6 @@ def filter_url(u: str) -> bool: source_info=source_info, ) - if self.delay: - time.sleep(max(0.0, self.delay() - time.time() + timestamp)) - timestamp = time.time() - class CCNewsSource: def __init__(self, *publishers: PublisherEnum, warc_path: str, headers: Optional[Dict[str, str]] = None): From e5b865b8a7c7c6e8a51f3fcf5e96481359a11278 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 13:46:13 +0200 Subject: [PATCH 33/36] use a thread lock for `SessionHandler.get_session` --- src/fundus/scraping/session.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index 36630cf4f..abcc2057f 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -1,3 +1,4 @@ +import threading from contextlib import contextmanager from typing import Iterator, Optional @@ -25,6 +26,7 @@ def __init__(self, pool_connections: int = 50, pool_maxsize: int = 1): self.session: Optional[requests.Session] = None self.pool_connections = pool_connections self.pool_maxsize = pool_maxsize + self.lock = threading.Lock() def _session_factory(self) -> requests.Session: """Builds a new Session @@ -76,9 +78,11 @@ def get_session(self) -> requests.Session: Returns: requests.Session: The current build session """ - if not self.session: - self.session = self._session_factory() - return self.session + + with self.lock: + if not self.session: + self.session = self._session_factory() + return self.session def close_current_session(self) -> None: """Tears down the current build session From 8e9f3d7f348bd769fc80ed791117f5bbfcc14e03 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Tue, 16 Apr 2024 14:00:36 +0200 Subject: [PATCH 34/36] remove leftover imports --- src/fundus/parser/data.py | 2 -- src/fundus/publishers/fr/le_monde.py | 1 - src/fundus/publishers/na/the_namibian.py | 1 - src/fundus/scraping/delay.py | 1 - src/fundus/scraping/html.py | 1 - 5 files changed, 6 deletions(-) diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index 1168e7793..3a6013c25 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -16,8 +16,6 @@ from typing_extensions import TypeAlias -from fundus.logging import basic_logger - LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]] diff --git a/src/fundus/publishers/fr/le_monde.py b/src/fundus/publishers/fr/le_monde.py index dfef8b6b2..6d4757ae3 100644 --- 
a/src/fundus/publishers/fr/le_monde.py +++ b/src/fundus/publishers/fr/le_monde.py @@ -9,7 +9,6 @@ extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, - generic_topic_parsing, ) diff --git a/src/fundus/publishers/na/the_namibian.py b/src/fundus/publishers/na/the_namibian.py index d711620fd..c52b96183 100644 --- a/src/fundus/publishers/na/the_namibian.py +++ b/src/fundus/publishers/na/the_namibian.py @@ -2,7 +2,6 @@ from datetime import datetime from typing import List, Optional, Pattern -from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute diff --git a/src/fundus/scraping/delay.py b/src/fundus/scraping/delay.py index c83cd6918..ee2a70830 100644 --- a/src/fundus/scraping/delay.py +++ b/src/fundus/scraping/delay.py @@ -1,6 +1,5 @@ from __future__ import annotations -import random from typing import Protocol, runtime_checkable diff --git a/src/fundus/scraping/html.py b/src/fundus/scraping/html.py index 98cdb5927..d393d0aa3 100644 --- a/src/fundus/scraping/html.py +++ b/src/fundus/scraping/html.py @@ -1,5 +1,4 @@ import time -from abc import abstractmethod from dataclasses import dataclass from datetime import datetime from typing import Dict, Iterable, Iterator, List, Optional, Protocol From 4facafecac3c700a5236425be2ed68b2035f8ebb Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Wed, 17 Apr 2024 13:07:02 +0200 Subject: [PATCH 35/36] rearrange code --- scripts/generate_parser_test_files.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index b9da059c8..5a3afcf89 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -17,14 +17,14 @@ def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]: - if url is None: - crawler = Crawler(enum) - return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) - else: + if url is not None: source = WebSource([url], publisher=enum.publisher_name) scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser}) return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None) + crawler = Crawler(enum) + return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None) + def parse_arguments() -> Namespace: parser = ArgumentParser( From 6d12cad3f9b6d5911f38c215db44946607ec2d6b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Wed, 17 Apr 2024 13:07:23 +0200 Subject: [PATCH 36/36] adjust some docstrings --- src/fundus/scraping/session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fundus/scraping/session.py b/src/fundus/scraping/session.py index abcc2057f..4a804fbe0 100644 --- a/src/fundus/scraping/session.py +++ b/src/fundus/scraping/session.py @@ -32,9 +32,9 @@ def _session_factory(self) -> requests.Session: """Builds a new Session This returns a new client session build from pre-defined configurations: - - pool_connections: 50 - - pool_maxsize: 50 - - hooks = {'response': raise_for_status(), _response_log()} + - pool_connections: + - pool_maxsize: + - hooks: (1) Hook to raise an `HTTPError` if one occurred. (2) Hook to log the request responses. 
Returns: A new requests.Session @@ -98,7 +98,7 @@ def close_current_session(self) -> None: @contextmanager def context(self, pool_connections: int, pool_maxsize: int) -> Iterator[Self]: - """Context manager to temporarily overwrite parameter and build new session. + """Context manager to temporarily overwrite parameter and build a new session. Args: pool_connections: see requests.Session documentation.
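Taken together, the session-related commits in this series leave `SessionHandler` with a small life cycle API: `get_session()` lazily builds one shared `requests.Session` (now guarded by a thread lock and returned on every subsequent call), `close_current_session()` tears it down, and `context()` temporarily swaps the connection-pool parameters, as `Crawler` does with `session_handler.context(len(publishers), 1)`. A rough usage sketch based on the diffs above (the import path and exact behaviour are taken from this patch series and may differ in a released version):

````python
from fundus.scraping.session import session_handler

# One lazily built session is shared by all callers; repeated calls return the same object.
session = session_handler.get_session()
response = session.get("https://example.org")  # hooks raise an HTTPError on bad status codes and log the response

# Temporarily use a different pool configuration, e.g. one connection slot per publisher.
with session_handler.context(pool_connections=10, pool_maxsize=1):
    pooled_session = session_handler.get_session()  # a fresh session built with the temporary parameters
    pooled_session.get("https://example.org")

# Tear the session down when crawling is done; the crawler's crawl() now does this itself.
session_handler.close_current_session()
````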