diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index 55afd402c..79a32f101 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -181,7 +181,11 @@ class DE(PublisherEnum): Bild = PublisherSpec( name="Bild", domain="https://www.bild.de/", - sources=[RSSFeed("https://www.bild.de/rssfeeds/vw-neu/vw-neu-32001674,view=rss2.bild.xml")], + sources=[ + RSSFeed("https://www.bild.de/rssfeeds/vw-neu/vw-neu-32001674,view=rss2.bild.xml"), + NewsMap("https://www.bild.de/sitemap-news.xml"), + Sitemap("https://www.bild.de/sitemap-index.xml"), + ], parser=BildParser, ) diff --git a/src/fundus/publishers/de/sz.py b/src/fundus/publishers/de/sz.py index 475c66fe0..3f6f281f7 100644 --- a/src/fundus/publishers/de/sz.py +++ b/src/fundus/publishers/de/sz.py @@ -1,7 +1,8 @@ import datetime -from typing import List, Optional +from typing import List, Optional, Union from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute from fundus.parser.utility import ( @@ -14,9 +15,12 @@ class SZParser(ParserProxy): class V1(BaseParser): - _paragraph_selector = CSSSelector('main [itemprop="articleBody"] > p, ' "main .css-korpch > div > ul > li") - _summary_selector = CSSSelector("main [data-manual='teaserText']") - _subheadline_selector = CSSSelector("main [itemprop='articleBody'] > h3") + VALID_UNTIL = datetime.datetime(2024, 2, 1).date() + _paragraph_selector: Union[CSSSelector, XPath] = CSSSelector( + 'main [itemprop="articleBody"] > p, ' "main .css-korpch > div > ul > li" + ) + _summary_selector: Union[CSSSelector, XPath] = CSSSelector("main [data-manual='teaserText']") + _subheadline_selector: Union[CSSSelector, XPath] = CSSSelector("main [itemprop='articleBody'] > h3") @attribute def body(self) -> ArticleBody: @@ -42,3 +46,14 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + class V1_1(V1): + VALID_UNTIL = datetime.date.today() + _paragraph_selector = XPath( + "//div[@itemprop='articleBody']//p[@data-manual='paragraph'" " and not(contains(text(), '© dpa-infocom'))]" + ) + _summary_selector = CSSSelector("main [data-manual='teaserText']") + _subheadline_selector = XPath( + "//div[@itemprop='articleBody']//h3[@data-manual='subheadline']|" + "//div[@itemprop='articleBody']//h2[@data-manual='subheadline']" + ) diff --git a/tests/resources/parser/test_data/de/SZ.json b/tests/resources/parser/test_data/de/SZ.json index a54341ee1..b5bc19d80 100644 --- a/tests/resources/parser/test_data/de/SZ.json +++ b/tests/resources/parser/test_data/de/SZ.json @@ -12,5 +12,18 @@ "Landkreis München", "Süddeutsche Zeitung" ] + }, + "V1_1": { + "authors": [ + "Süddeutsche Zeitung" + ], + "publishing_date": "2024-02-06 21:19:20+01:00", + "title": "Ex-Präsident stirbt bei Hubschrauberabsturz", + "topics": [ + "Kommentarfunktion", + "Politik", + "Politik", + "Süddeutsche Zeitung" + ] } } diff --git a/tests/resources/parser/test_data/de/SZ_2024_02_06.html.gz b/tests/resources/parser/test_data/de/SZ_2024_02_06.html.gz new file mode 100644 index 000000000..4b3ae3034 Binary files /dev/null and b/tests/resources/parser/test_data/de/SZ_2024_02_06.html.gz differ diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info index aa35f4c74..d6d649f8d 100644 --- a/tests/resources/parser/test_data/de/meta.info +++ b/tests/resources/parser/test_data/de/meta.info @@ -55,6 +55,10 @@ "url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206", "crawl_date": "2023-04-28 20:21:15.488026" }, + "SZ_2024_02_06.html.gz": { + "url": "https://www.sueddeutsche.de/politik/chile-ex-praesident-stirbt-bei-hubschrauberabsturz-1.6345022", + "crawl_date": "2024-02-06 21:46:40.133150" + }, "SpiegelOnline_2023_04_28.html.gz": { "url": "https://www.spiegel.de/wirtschaft/unternehmen/silicon-valley-bank-federal-reserve-raeumt-versaeumnisse-bei-aufsicht-ein-a-60a0111a-c2d2-46c8-9e93-b16a374e6ba8#ref=rss", "crawl_date": "2023-04-28 20:21:16.250027"