Merge pull request #357 from flairNLP/unbatch-fundus
Unbatch Fundus
MaxDall authored Apr 18, 2024
2 parents efc57f9 + 6d12cad commit 2135e92
Showing 35 changed files with 895 additions and 1,047 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -99,8 +99,8 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article
```python
from fundus import PublisherCollection, Crawler

# initialize the crawler for Washington Times
crawler = Crawler(PublisherCollection.us.WashingtonTimes)
# initialize the crawler for The New Yorker
crawler = Crawler(PublisherCollection.us.TheNewYorker)

# crawl 2 articles and print
for article in crawler.crawl(max_articles=2):
2 changes: 0 additions & 2 deletions docs/1_getting_started.md
@@ -46,8 +46,6 @@ You can also initialize a crawler for the entire publisher collection
crawler = Crawler(PublisherCollection)
````

**_NOTE:_** To build a pipeline from low-level `Scraper` objects make use of the `BaseCrawler` class.

# How to crawl articles

Now to crawl articles make use of the `crawl()` method of the initialized crawler class.
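
As a rough sketch of the usage described above (the single-article limit is illustrative, not taken from the original docs):

````python
from fundus import Crawler, PublisherCollection

# initialize a crawler for the entire publisher collection
crawler = Crawler(PublisherCollection)

# crawl one article and print it; max_articles here is only an example value
for article in crawler.crawl(max_articles=1):
    print(article)
````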
10 changes: 5 additions & 5 deletions docs/2_crawl_from_cc_news.md
@@ -1,12 +1,12 @@
# Table of Contents

* [Crawl articles from CC-NEWS](#crawl-articles-from-cc-news)
* [How to crawl articles from CC-NEWS](#how-to-crawl-articles-from-cc-news)
* [The crawler](#the-crawler)
* [OS start method](#os-start-method)
* [Date range](#date-range)
* [Multiprocessing](#multiprocessing)

# Crawl articles from CC-NEWS
# How to crawl articles from CC-NEWS

This tutorial explains how to crawl articles from the [CC-NEWS](https://paperswithcode.com/dataset/cc-news) dataset using Fundus.

@@ -48,8 +48,8 @@ from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

crawler = CCNewsCrawler(*PublisherCollection)
for article in crawler.crawl(start=datetime(2020, 1, 1), end=datetime(2020, 3, 1), max_articles=100):
crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1))
for article in crawler.crawl(max_articles=100):
print(article)
````
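
Reading only the added lines of this hunk, the updated snippet comes out as the following sketch (the dates and article limit are the ones shown above):

````python
from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

# the date range now goes to the crawler constructor rather than to crawl()
crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1))

for article in crawler.crawl(max_articles=100):
    print(article)
````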

@@ -66,7 +66,7 @@ from fundus import CCNewsCrawler, PublisherCollection
crawler = CCNewsCrawler(*PublisherCollection, processes=4)
````

To omit multiprocessing, pass `0` to the `processes` parameter.
To omit multiprocessing, pass `-1` to the `processes` parameter.

In the [next section](3_the_article_class.md) we will introduce you to the `Article` class.
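
For reference, a combined sketch of the two process settings mentioned in this file (the values `4` and `-1` are the ones named in the docs; the variable names are illustrative):

````python
from fundus import CCNewsCrawler, PublisherCollection

# distribute the crawl over four worker processes
multi_process_crawler = CCNewsCrawler(*PublisherCollection, processes=4)

# per the updated wording above, -1 skips multiprocessing entirely
single_process_crawler = CCNewsCrawler(*PublisherCollection, processes=-1)
````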

10 changes: 8 additions & 2 deletions docs/how_to_add_a_publisher.md
@@ -128,8 +128,11 @@ To instantiate an object inheriting from URLSource like `RSSFeed` or `Sitemap`,
Getting links for RSS feeds can vary from publisher to publisher.
Most of the time, you can find them through a quick browser search.
Building an `RSSFeed` looks like this:

````python
from fundus.scraping.html import RSSFeed

from fundus import RSSFeed

RSSFeed("https://theintercept.com/feed/?rss")
````
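
Taking only the added lines, the updated example appears to read:

````python
from fundus import RSSFeed

RSSFeed("https://theintercept.com/feed/?rss")
````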

@@ -181,8 +184,11 @@ You can alter this behavior or reverse the order in which sitemaps are processed
**_NOTE:_** If you wonder why you should reverse your sources from time to time, `URLSource`'s should, if possible, yield URLs in descending order by publishing date.
Now building a new `URLSource` for a `NewsMap` covering the LA Times looks like this:
````python
from fundus.scraping.html import NewsMap
from fundus import NewsMap
NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True)
````
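
Again taking only the added lines, the updated example appears to read:

````python
from fundus import NewsMap

NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True)
````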
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -31,8 +31,6 @@ dependencies = [
"colorama>=0.4, <1",
"typing-extensions>=4.6, <5",
"langdetect>=1.0, <2",
"aiohttp>=3.8, <4",
"aioitertools>=0.11, <1",
"validators>=0.20, <1, !=0.23",
"requests>=2.28, <3",
"tqdm>=4.66, <5",
22 changes: 11 additions & 11 deletions scripts/generate_parser_test_files.py
@@ -5,24 +5,24 @@

from tqdm import tqdm

from fundus import BaseCrawler, Crawler, PublisherCollection
from fundus import Crawler, PublisherCollection
from fundus.logging import basic_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.html import FundusSource
from fundus.scraping.scraper import Scraper
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper, WebScraper
from tests.test_parser import attributes_required_to_cover
from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping


def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
crawler: BaseCrawler
if url is None:
crawler = Crawler(enum)
else:
source = FundusSource([url], publisher=enum.publisher_name)
scraper = Scraper(source, parser=enum.parser)
crawler = BaseCrawler(scraper)
if url is not None:
source = WebSource([url], publisher=enum.publisher_name)
scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser})
return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None)

crawler = Crawler(enum)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None)


@@ -107,7 +107,7 @@ def main() -> None:

if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version):
if not (article := get_test_article(publisher, url)):
basic_logger.warning(f"Couldn't get article for {publisher.name}. Skipping")
basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
url=article.html.responded_url,
27 changes: 3 additions & 24 deletions src/fundus/__init__.py
@@ -1,41 +1,20 @@
import pathlib
import sys

from fundus.publishers import PublisherCollection
from fundus.scraping.common_crawl import CCNewsCrawler
from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
from fundus.scraping.filter import Requires
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.pipeline import BaseCrawler, Crawler
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

__module_path__ = pathlib.Path(__file__).parent
__development_base_path__ = __module_path__.parents[1]

__all__ = [
"CrawlerBase",
"Crawler",
"BaseCrawler",
"CCNewsCrawler",
"PublisherCollection",
"Requires",
"RSSFeed",
"Sitemap",
"NewsMap",
]

# On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times,
# Python throws an `RuntimeError: Event loop is closed exception` during Python's clean-up phase.

# To reproduce the error run the following code:
# from fundus import Crawler, PublisherCollection
# crawler = Crawler(PublisherCollection.de.DieWelt)
# for article in crawler.crawl(max_articles=1):
# pass
# for article in crawler.crawl(max_articles=1):
# pass

# A workaround involves to modify the event loop policy of asyncio on Windows machines.
# Unfortunately, this is a global modification. For further information see:
# https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
if sys.platform == "win32":
import asyncio

asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
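
For orientation, the public names exported after this change (as listed in the updated `__all__`) can be imported as in this sketch:

```python
from fundus import (
    CCNewsCrawler,
    Crawler,
    CrawlerBase,
    NewsMap,
    PublisherCollection,
    Requires,
    RSSFeed,
    Sitemap,
)
```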
2 changes: 0 additions & 2 deletions src/fundus/parser/data.py
@@ -16,8 +16,6 @@

from typing_extensions import TypeAlias

from fundus.logging import basic_logger

LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]]


2 changes: 1 addition & 1 deletion src/fundus/publishers/at/__init__.py
@@ -1,5 +1,5 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.html import RSSFeed
from fundus.scraping.url import RSSFeed

from .orf import OrfParser

13 changes: 4 additions & 9 deletions src/fundus/publishers/base_objects.py
@@ -6,7 +6,7 @@

from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.html import FundusSource, NewsMap, RSSFeed, Sitemap, URLSource
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses


@@ -44,10 +44,11 @@ def __init__(self, spec: PublisherSpec):
self.parser = spec.parser()
self.publisher_name = spec.name
self.url_filter = spec.url_filter
self.request_header = spec.request_header

# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are proceeded.
source_mapping: Dict[Type[URLSource], List[FundusSource]] = {
source_mapping: Dict[Type[URLSource], List[URLSource]] = {
RSSFeed: [],
NewsMap: [],
Sitemap: [],
@@ -59,13 +60,7 @@ def __init__(self, spec: PublisherSpec):
f"Unexpected type '{type(url_source).__name__}' as source for {self.name}. "
f"Allowed are '{', '.join(cls.__name__ for cls in iterate_all_subclasses(URLSource))}'"
)
source: FundusSource = FundusSource(
url_source=url_source,
publisher=self.publisher_name,
url_filter=spec.url_filter,
request_header=spec.request_header,
)
source_mapping[type(url_source)].append(source)
source_mapping[type(url_source)].append(url_source)

self.source_mapping = source_mapping

2 changes: 1 addition & 1 deletion src/fundus/publishers/de/__init__.py
@@ -4,7 +4,7 @@

from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import regex_filter
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

from .berliner_zeitung import BerlinerZeitungParser
from .bild import BildParser
3 changes: 1 addition & 2 deletions src/fundus/publishers/fr/__init__.py
@@ -1,7 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.publishers.fr.le_monde import LeMondeParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, Sitemap
from fundus.scraping.url import NewsMap, Sitemap


class FR(PublisherEnum):
1 change: 0 additions & 1 deletion src/fundus/publishers/fr/le_monde.py
@@ -9,7 +9,6 @@
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


2 changes: 1 addition & 1 deletion src/fundus/publishers/lt/__init__.py
@@ -1,5 +1,5 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.html import RSSFeed, Sitemap
from fundus.scraping.url import RSSFeed, Sitemap

from .lrt import LRTParser

2 changes: 1 addition & 1 deletion src/fundus/publishers/na/__init__.py
@@ -1,6 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import RSSFeed, Sitemap
from fundus.scraping.url import RSSFeed, Sitemap

from .the_namibian import TheNamibianParser

1 change: 0 additions & 1 deletion src/fundus/publishers/na/the_namibian.py
@@ -2,7 +2,6 @@
from datetime import datetime
from typing import List, Optional, Pattern

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
2 changes: 1 addition & 1 deletion src/fundus/publishers/uk/__init__.py
@@ -2,7 +2,7 @@

from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, Sitemap
from fundus.scraping.url import NewsMap, Sitemap

from .i_news import INewsParser
from .the_guardian import TheGuardianParser
2 changes: 1 addition & 1 deletion src/fundus/publishers/us/__init__.py
@@ -1,6 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

from .ap_news import APNewsParser
from .business_insider import BusinessInsiderParser
2 changes: 1 addition & 1 deletion src/fundus/scraping/article.py
@@ -83,7 +83,7 @@ def __str__(self):
f'\n- Title: "{wrapped_title}"'
f'\n- Text: "{wrapped_plaintext}"'
f"\n- URL: {self.html.requested_url}"
f"\n- From: {self.html.source.publisher}"
f"\n- From: {self.html.source_info.publisher}"
f'{" (" + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}'
)

3 changes: 0 additions & 3 deletions src/fundus/scraping/common_crawl/__init__.py

This file was deleted.
