Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Rheinische Post as publisher #416

Merged
merged 13 commits into from
Apr 21, 2024
Merged
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>RheinischePost</code>
</td>
<td>
<div>Rheinische Post</div>
</td>
<td>
<a href="https://rp-online.de/">
<span>rp-online.de</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>SpiegelOnline</code>
Expand Down
8 changes: 8 additions & 0 deletions src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .merkur import MerkurParser
from .ndr import NDRParser
from .ntv import NTVParser
from .rheinische_post import RheinischePostParser
from .spon import SPONParser
from .stern import SternParser
from .sz import SZParser
Expand Down Expand Up @@ -227,3 +228,10 @@ class DE(PublisherEnum):
],
parser=BusinessInsiderDEParser,
)

RheinischePost = PublisherSpec(
name="Rheinische Post",
domain="https://rp-online.de/",
sources=[NewsMap("https://rp-online.de/sitemap-news.xml"), Sitemap("https://rp-online.de/sitemap.xml")],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add an RSSFeed from here as well.

parser=RheinischePostParser,
)
42 changes: 42 additions & 0 deletions src/fundus/publishers/de/rheinische_post.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class RheinischePostParser(ParserProxy):
    """Parser proxy for articles published by Rheinische Post (rp-online.de)."""

    class V1(BaseParser):
        """First parser version; every attribute is read from the precomputed page data."""

        # Matches the intro text, marked up as <strong data-cy="intro">.
        _summary_selector = CSSSelector("strong[data-cy='intro']")
        # Matches every <p> nested inside the <div data-cy="article-content"> container.
        _paragraph_selector = CSSSelector("div[data-cy='article-content'] p")

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the parsed document using the two selectors above."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` meta entry."""
            raw_authors = self.precomputed.meta.get("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the publishing date parsed from the ``datePublished`` value.

            The value is looked up with ``bf_search`` on ``self.precomputed.ld``
            (presumably the page's linked-data block -- confirm against BaseParser).
            """
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta entry, or ``None`` when it is absent."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the ``keywords`` meta entry."""
            raw_keywords = self.precomputed.meta.get("keywords")
            return generic_topic_parsing(raw_keywords)
22 changes: 22 additions & 0 deletions tests/resources/parser/test_data/de/RheinischePost.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"V1": {
"authors": [
"Simon Janßen"
],
"publishing_date": "2024-04-15 16:37:00+02:00",
"title": "Schomaker an der Niederstraße: Bio-Bäckerei schließt Filiale in Neuss",
"topics": [
"Schomaker",
"Neuss",
"Bio",
"Niederstraße",
"Schließung",
"Reißleine",
"Backwaren",
"Bäcker",
"begründet",
"Biobäckerei",
"Bäckereien"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/de/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
"url": "https://www.n-tv.de/leben/Judy-Lybke-ganz-und-gar-nicht-eigenartig-article24075843.html",
"crawl_date": "2023-04-28 20:32:13.689394"
},
"RheinischePost_2024_04_15.html.gz": {
"url": "https://rp-online.de/nrw/staedte/neuss/neuss-biobaeckerei-schomaker-an-der-niederstrasse-schliesst_aid-110715299",
"crawl_date": "2024-04-15 16:40:22.430078"
},
"SZ_2023_04_28.html.gz": {
"url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206",
"crawl_date": "2023-04-28 20:21:15.488026"
Expand Down
Loading