Merge pull request #357 from flairNLP/unbatch-fundus
Unbatch Fundus
MaxDall authored Apr 18, 2024
2 parents efc57f9 + 6d12cad commit 2135e92
Showing 35 changed files with 895 additions and 1,047 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -99,8 +99,8 @@ Maybe you want to crawl a specific news source instead. Let's crawl news article
```python
from fundus import PublisherCollection, Crawler

# initialize the crawler for Washington Times
crawler = Crawler(PublisherCollection.us.WashingtonTimes)
# initialize the crawler for The New Yorker
crawler = Crawler(PublisherCollection.us.TheNewYorker)

# crawl 2 articles and print
for article in crawler.crawl(max_articles=2):
2 changes: 0 additions & 2 deletions docs/1_getting_started.md
@@ -46,8 +46,6 @@ You can also initialize a crawler for the entire publisher collection
crawler = Crawler(PublisherCollection)
````

**_NOTE:_** To build a pipeline from low-level `Scraper` objects make use of the `BaseCrawler` class.

# How to crawl articles

Now to crawl articles make use of the `crawl()` method of the initialized crawler class.
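
As a rough sketch of the usage described above (the single-article limit is illustrative, not taken from the original docs):

````python
from fundus import Crawler, PublisherCollection

# initialize a crawler for the entire publisher collection
crawler = Crawler(PublisherCollection)

# crawl one article and print it; max_articles here is only an example value
for article in crawler.crawl(max_articles=1):
    print(article)
````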
10 changes: 5 additions & 5 deletions docs/2_crawl_from_cc_news.md
@@ -1,12 +1,12 @@
# Table of Contents

* [Crawl articles from CC-NEWS](#crawl-articles-from-cc-news)
* [How to crawl articles from CC-NEWS](#how-to-crawl-articles-from-cc-news)
* [The crawler](#the-crawler)
* [OS start method](#os-start-method)
* [Date range](#date-range)
* [Multiprocessing](#multiprocessing)

# Crawl articles from CC-NEWS
# How to crawl articles from CC-NEWS

This tutorial explains how to crawl articles from the [CC-NEWS](https://paperswithcode.com/dataset/cc-news) dataset using Fundus.

@@ -48,8 +48,8 @@ from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

crawler = CCNewsCrawler(*PublisherCollection)
for article in crawler.crawl(start=datetime(2020, 1, 1), end=datetime(2020, 3, 1), max_articles=100):
crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1))
for article in crawler.crawl(max_articles=100):
print(article)
````
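
Reading only the added lines of this hunk, the updated snippet comes out as the following sketch (the dates and article limit are the ones shown above):

````python
from datetime import datetime

from fundus import CCNewsCrawler, PublisherCollection

# the date range now goes to the crawler constructor rather than to crawl()
crawler = CCNewsCrawler(*PublisherCollection, start=datetime(2020, 1, 1), end=datetime(2020, 3, 1))

for article in crawler.crawl(max_articles=100):
    print(article)
````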

@@ -66,7 +66,7 @@ from fundus import CCNewsCrawler, PublisherCollection
crawler = CCNewsCrawler(*PublisherCollection, processes=4)
````

To omit multiprocessing, pass `0` to the `processes` parameter.
To omit multiprocessing, pass `-1` to the `processes` parameter.

In the [next section](3_the_article_class.md) we will introduce you to the `Article` class.
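
For reference, a combined sketch of the two process settings mentioned in this file (the values `4` and `-1` are the ones named in the docs; the variable names are illustrative):

````python
from fundus import CCNewsCrawler, PublisherCollection

# distribute the crawl over four worker processes
multi_process_crawler = CCNewsCrawler(*PublisherCollection, processes=4)

# per the updated wording above, -1 skips multiprocessing entirely
single_process_crawler = CCNewsCrawler(*PublisherCollection, processes=-1)
````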

10 changes: 8 additions & 2 deletions docs/how_to_add_a_publisher.md
@@ -128,8 +128,11 @@ To instantiate an object inheriting from URLSource like `RSSFeed` or `Sitemap`,
Getting links for RSS feeds can vary from publisher to publisher.
Most of the time, you can find them through a quick browser search.
Building an `RSSFeed` looks like this:

````python
from fundus.scraping.html import RSSFeed

from fundus import RSSFeed

RSSFeed("https://theintercept.com/feed/?rss")
````
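
Taking only the added lines, the updated example appears to read:

````python
from fundus import RSSFeed

RSSFeed("https://theintercept.com/feed/?rss")
````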

@@ -181,8 +184,11 @@ You can alter this behavior or reverse the order in which sitemaps are processed
**_NOTE:_** If you wonder why you should reverse your sources from time to time, `URLSource`'s should, if possible, yield URLs in descending order by publishing date.
Now building a new `URLSource` for a `NewsMap` covering the LA Times looks like this:
````python
from fundus.scraping.html import NewsMap
from fundus import NewsMap
NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True)
````
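
Again taking only the added lines, the updated example appears to read:

````python
from fundus import NewsMap

NewsMap("https://www.latimes.com/news-sitemap.xml", reverse=True)
````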
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -31,8 +31,6 @@ dependencies = [
"colorama>=0.4, <1",
"typing-extensions>=4.6, <5",
"langdetect>=1.0, <2",
"aiohttp>=3.8, <4",
"aioitertools>=0.11, <1",
"validators>=0.20, <1, !=0.23",
"requests>=2.28, <3",
"tqdm>=4.66, <5",
22 changes: 11 additions & 11 deletions scripts/generate_parser_test_files.py
@@ -5,24 +5,24 @@

from tqdm import tqdm

from fundus import BaseCrawler, Crawler, PublisherCollection
from fundus import Crawler, PublisherCollection
from fundus.logging import basic_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.html import FundusSource
from fundus.scraping.scraper import Scraper
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper, WebScraper
from tests.test_parser import attributes_required_to_cover
from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping


def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
crawler: BaseCrawler
if url is None:
crawler = Crawler(enum)
else:
source = FundusSource([url], publisher=enum.publisher_name)
scraper = Scraper(source, parser=enum.parser)
crawler = BaseCrawler(scraper)
if url is not None:
source = WebSource([url], publisher=enum.publisher_name)
scraper = BaseScraper(source, parser_mapping={enum.publisher_name: enum.parser})
return next(scraper.scrape(error_handling="suppress", extraction_filter=RequiresAll()), None)

crawler = Crawler(enum)
return next(crawler.crawl(max_articles=1, error_handling="suppress", only_complete=True), None)


@@ -107,7 +107,7 @@ def main() -> None:

if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version):
if not (article := get_test_article(publisher, url)):
basic_logger.warning(f"Couldn't get article for {publisher.name}. Skipping")
basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
url=article.html.responded_url,
27 changes: 3 additions & 24 deletions src/fundus/__init__.py
@@ -1,41 +1,20 @@
import pathlib
import sys

from fundus.publishers import PublisherCollection
from fundus.scraping.common_crawl import CCNewsCrawler
from fundus.scraping.crawler import CCNewsCrawler, Crawler, CrawlerBase
from fundus.scraping.filter import Requires
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.pipeline import BaseCrawler, Crawler
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

__module_path__ = pathlib.Path(__file__).parent
__development_base_path__ = __module_path__.parents[1]

__all__ = [
"CrawlerBase",
"Crawler",
"BaseCrawler",
"CCNewsCrawler",
"PublisherCollection",
"Requires",
"RSSFeed",
"Sitemap",
"NewsMap",
]

# On a Windows machines, when executing `BaseCrawler.crawl` from our sync API two times,
# Python throws an `RuntimeError: Event loop is closed exception` during Python's clean-up phase.

# To reproduce the error run the following code:
# from fundus import Crawler, PublisherCollection
# crawler = Crawler(PublisherCollection.de.DieWelt)
# for article in crawler.crawl(max_articles=1):
# pass
# for article in crawler.crawl(max_articles=1):
# pass

# A workaround involves to modify the event loop policy of asyncio on Windows machines.
# Unfortunately, this is a global modification. For further information see:
# https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
if sys.platform == "win32":
import asyncio

asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
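
For orientation, the public names exported after this change (as listed in the updated `__all__`) can be imported as in this sketch:

```python
from fundus import (
    CCNewsCrawler,
    Crawler,
    CrawlerBase,
    NewsMap,
    PublisherCollection,
    Requires,
    RSSFeed,
    Sitemap,
)
```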
2 changes: 0 additions & 2 deletions src/fundus/parser/data.py
@@ -16,8 +16,6 @@

from typing_extensions import TypeAlias

from fundus.logging import basic_logger

LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]]


2 changes: 1 addition & 1 deletion src/fundus/publishers/at/__init__.py
@@ -1,5 +1,5 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.html import RSSFeed
from fundus.scraping.url import RSSFeed

from .orf import OrfParser

13 changes: 4 additions & 9 deletions src/fundus/publishers/base_objects.py
@@ -6,7 +6,7 @@

from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.html import FundusSource, NewsMap, RSSFeed, Sitemap, URLSource
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses


@@ -44,10 +44,11 @@ def __init__(self, spec: PublisherSpec):
self.parser = spec.parser()
self.publisher_name = spec.name
self.url_filter = spec.url_filter
self.request_header = spec.request_header

# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are proceeded.
source_mapping: Dict[Type[URLSource], List[FundusSource]] = {
source_mapping: Dict[Type[URLSource], List[URLSource]] = {
RSSFeed: [],
NewsMap: [],
Sitemap: [],
@@ -59,13 +60,7 @@ def __init__(self, spec: PublisherSpec):
f"Unexpected type '{type(url_source).__name__}' as source for {self.name}. "
f"Allowed are '{', '.join(cls.__name__ for cls in iterate_all_subclasses(URLSource))}'"
)
source: FundusSource = FundusSource(
url_source=url_source,
publisher=self.publisher_name,
url_filter=spec.url_filter,
request_header=spec.request_header,
)
source_mapping[type(url_source)].append(source)
source_mapping[type(url_source)].append(url_source)

self.source_mapping = source_mapping

2 changes: 1 addition & 1 deletion src/fundus/publishers/de/__init__.py
@@ -4,7 +4,7 @@

from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import regex_filter
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

from .berliner_zeitung import BerlinerZeitungParser
from .bild import BildParser
3 changes: 1 addition & 2 deletions src/fundus/publishers/fr/__init__.py
@@ -1,7 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.publishers.fr.le_monde import LeMondeParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, Sitemap
from fundus.scraping.url import NewsMap, Sitemap


class FR(PublisherEnum):
1 change: 0 additions & 1 deletion src/fundus/publishers/fr/le_monde.py
@@ -9,7 +9,6 @@
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


2 changes: 1 addition & 1 deletion src/fundus/publishers/lt/__init__.py
@@ -1,5 +1,5 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.html import RSSFeed, Sitemap
from fundus.scraping.url import RSSFeed, Sitemap

from .lrt import LRTParser

2 changes: 1 addition & 1 deletion src/fundus/publishers/na/__init__.py
@@ -1,6 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import RSSFeed, Sitemap
from fundus.scraping.url import RSSFeed, Sitemap

from .the_namibian import TheNamibianParser

1 change: 0 additions & 1 deletion src/fundus/publishers/na/the_namibian.py
@@ -2,7 +2,6 @@
from datetime import datetime
from typing import List, Optional, Pattern

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
2 changes: 1 addition & 1 deletion src/fundus/publishers/uk/__init__.py
@@ -2,7 +2,7 @@

from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, Sitemap
from fundus.scraping.url import NewsMap, Sitemap

from .i_news import INewsParser
from .the_guardian import TheGuardianParser
2 changes: 1 addition & 1 deletion src/fundus/publishers/us/__init__.py
@@ -1,6 +1,6 @@
from fundus.publishers.base_objects import PublisherEnum, PublisherSpec
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.html import NewsMap, RSSFeed, Sitemap
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

from .ap_news import APNewsParser
from .business_insider import BusinessInsiderParser
2 changes: 1 addition & 1 deletion src/fundus/scraping/article.py
@@ -83,7 +83,7 @@ def __str__(self):
f'\n- Title: "{wrapped_title}"'
f'\n- Text: "{wrapped_plaintext}"'
f"\n- URL: {self.html.requested_url}"
f"\n- From: {self.html.source.publisher}"
f"\n- From: {self.html.source_info.publisher}"
f'{" (" + self.publishing_date.strftime("%Y-%m-%d %H:%M") + ")" if self.publishing_date else ""}'
)

3 changes: 0 additions & 3 deletions src/fundus/scraping/common_crawl/__init__.py

This file was deleted.
