diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 23073c830..fb19004d7 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -294,6 +294,21 @@     + + + RheinischePost + + +
Rheinische Post
+ + + + rp-online.de + + +   +   + SpiegelOnline diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 1cfc72f87..eba571fc5 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -19,6 +19,7 @@ from .merkur import MerkurParser from .ndr import NDRParser from .ntv import NTVParser +from .rheinische_post import RheinischePostParser from .spon import SPONParser from .stern import SternParser from .sz import SZParser @@ -227,3 +228,14 @@ class DE(PublisherEnum): ], parser=BusinessInsiderDEParser, ) + + RheinischePost = PublisherSpec( + name="Rheinische Post", + domain="https://rp-online.de/", + sources=[ + RSSFeed("https://rp-online.de/feed.rss"), + NewsMap("https://rp-online.de/sitemap-news.xml"), + Sitemap("https://rp-online.de/sitemap.xml"), + ], + parser=RheinischePostParser, + ) diff --git a/src/fundus/publishers/de/rheinische_post.py b/src/fundus/publishers/de/rheinische_post.py new file mode 100644 index 000000000..3e7b741d4 --- /dev/null +++ b/src/fundus/publishers/de/rheinische_post.py @@ -0,0 +1,44 @@ +import datetime +from typing import List, Optional + +from lxml.cssselect import CSSSelector + +from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.utility import ( + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, + generic_topic_parsing, +) + + +class RheinischePostParser(ParserProxy): + class V1(BaseParser): + _summary_selector = CSSSelector("strong[data-cy='intro']") + _paragraph_selector = CSSSelector("div[data-cy='article-content'] p") + _subheadline_selector = CSSSelector("div[data-cy='article-content'] h2") + + @attribute + def body(self) -> ArticleBody: + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + paragraph_selector=self._paragraph_selector, + subheadline_selector=self._subheadline_selector, + ) + + @attribute + def authors(self) -> List[str]: + return generic_author_parsing(self.precomputed.meta.get("author")) + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def title(self) -> Optional[str]: + return self.precomputed.meta.get("og:title") + + @attribute + def topics(self) -> List[str]: + return generic_topic_parsing(self.precomputed.meta.get("keywords")) diff --git a/tests/resources/parser/test_data/de/RheinischePost.json b/tests/resources/parser/test_data/de/RheinischePost.json new file mode 100644 index 000000000..8944c56dc --- /dev/null +++ b/tests/resources/parser/test_data/de/RheinischePost.json @@ -0,0 +1,22 @@ +{ + "V1": { + "authors": [ + "Simon Janßen" + ], + "publishing_date": "2024-04-15 16:37:00+02:00", + "title": "Schomaker an der Niederstraße: Bio-Bäckerei schließt Filiale in Neuss", + "topics": [ + "Schomaker", + "Neuss", + "Bio", + "Niederstraße", + "Schließung", + "Reißleine", + "Backwaren", + "Bäcker", + "begründet", + "Biobäckerei", + "Bäckereien" + ] + } +} diff --git a/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz b/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz new file mode 100644 index 000000000..ce1ed083a Binary files /dev/null and b/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz differ diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info index 7f1e92dee..4789c6726 100644 --- a/tests/resources/parser/test_data/de/meta.info +++ b/tests/resources/parser/test_data/de/meta.info @@ -63,6 +63,10 @@ "url": "https://www.n-tv.de/leben/Judy-Lybke-ganz-und-gar-nicht-eigenartig-article24075843.html", "crawl_date": "2023-04-28 20:32:13.689394" }, + "RheinischePost_2024_04_15.html.gz": { + "url": "https://rp-online.de/nrw/staedte/neuss/neuss-biobaeckerei-schomaker-an-der-niederstrasse-schliesst_aid-110715299", + "crawl_date": "2024-04-15 16:40:22.430078" + }, "SZ_2023_04_28.html.gz": { "url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206", "crawl_date": "2023-04-28 20:21:15.488026"