diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
index 23073c830..fb19004d7 100644
--- a/docs/supported_publishers.md
+++ b/docs/supported_publishers.md
@@ -294,6 +294,21 @@
RheinischePost
+ SpiegelOnline
diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py
index 1cfc72f87..eba571fc5 100644
--- a/src/fundus/publishers/de/__init__.py
+++ b/src/fundus/publishers/de/__init__.py
@@ -19,6 +19,7 @@
from .merkur import MerkurParser
from .ndr import NDRParser
from .ntv import NTVParser
+from .rheinische_post import RheinischePostParser
from .spon import SPONParser
from .stern import SternParser
from .sz import SZParser
@@ -227,3 +228,14 @@ class DE(PublisherEnum):
],
parser=BusinessInsiderDEParser,
)
+
+    RheinischePost = PublisherSpec(
+        name="Rheinische Post",
+        domain="https://rp-online.de/",
+        sources=[  # RSS feed plus the news and general sitemaps of rp-online.de
+            RSSFeed("https://rp-online.de/feed.rss"),
+            NewsMap("https://rp-online.de/sitemap-news.xml"),
+            Sitemap("https://rp-online.de/sitemap.xml"),
+        ],
+        parser=RheinischePostParser,
+    )
diff --git a/src/fundus/publishers/de/rheinische_post.py b/src/fundus/publishers/de/rheinische_post.py
new file mode 100644
index 000000000..3e7b741d4
--- /dev/null
+++ b/src/fundus/publishers/de/rheinische_post.py
@@ -0,0 +1,55 @@
+import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+)
+
+
+class RheinischePostParser(ParserProxy):
+    """Parser proxy for Rheinische Post articles."""
+
+    class V1(BaseParser):
+        """First parser version; every attribute is read from ``self.precomputed``."""
+
+        # CSS selectors for the intro, the paragraphs and the sub-headlines;
+        # all three are handed to the body extractor in ``body``.
+        _summary_selector = CSSSelector("strong[data-cy='intro']")
+        _paragraph_selector = CSSSelector("div[data-cy='article-content'] p")
+        _subheadline_selector = CSSSelector("div[data-cy='article-content'] h2")
+
+        @attribute
+        def body(self) -> ArticleBody:
+            """Extract the article body from the document using the class-level selectors."""
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                paragraph_selector=self._paragraph_selector,
+                subheadline_selector=self._subheadline_selector,
+            )
+
+        @attribute
+        def authors(self) -> List[str]:
+            """Parse the authors from the ``author`` meta entry."""
+            return generic_author_parsing(self.precomputed.meta.get("author"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            """Parse the publishing date from the ``datePublished`` value looked up in the linked data."""
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def title(self) -> Optional[str]:
+            """Return the ``og:title`` meta entry."""
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def topics(self) -> List[str]:
+            """Parse the topics from the ``keywords`` meta entry."""
+            return generic_topic_parsing(self.precomputed.meta.get("keywords"))
diff --git a/tests/resources/parser/test_data/de/RheinischePost.json b/tests/resources/parser/test_data/de/RheinischePost.json
new file mode 100644
index 000000000..8944c56dc
--- /dev/null
+++ b/tests/resources/parser/test_data/de/RheinischePost.json
@@ -0,0 +1,22 @@
+{
+ "V1": {
+ "authors": [
+ "Simon Janßen"
+ ],
+ "publishing_date": "2024-04-15 16:37:00+02:00",
+ "title": "Schomaker an der Niederstraße: Bio-Bäckerei schließt Filiale in Neuss",
+ "topics": [
+ "Schomaker",
+ "Neuss",
+ "Bio",
+ "Niederstraße",
+ "Schließung",
+ "Reißleine",
+ "Backwaren",
+ "Bäcker",
+ "begründet",
+ "Biobäckerei",
+ "Bäckereien"
+ ]
+ }
+}
diff --git a/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz b/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz
new file mode 100644
index 000000000..ce1ed083a
Binary files /dev/null and b/tests/resources/parser/test_data/de/RheinischePost_2024_04_15.html.gz differ
diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info
index 7f1e92dee..4789c6726 100644
--- a/tests/resources/parser/test_data/de/meta.info
+++ b/tests/resources/parser/test_data/de/meta.info
@@ -63,6 +63,10 @@
"url": "https://www.n-tv.de/leben/Judy-Lybke-ganz-und-gar-nicht-eigenartig-article24075843.html",
"crawl_date": "2023-04-28 20:32:13.689394"
},
+ "RheinischePost_2024_04_15.html.gz": {
+ "url": "https://rp-online.de/nrw/staedte/neuss/neuss-biobaeckerei-schomaker-an-der-niederstrasse-schliesst_aid-110715299",
+ "crawl_date": "2024-04-15 16:40:22.430078"
+ },
"SZ_2023_04_28.html.gz": {
"url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206",
"crawl_date": "2023-04-28 20:21:15.488026"