Skip to content

Commit

Permalink
Merge pull request #352 from flairNLP/update_SZ
Browse files: browse the repository at this point in the history
Update Bild Sources, Update SZ Parser
  • Loading branch information
addie9800 authored Feb 7, 2024
2 parents c18f234 + 119b5e2 commit ce6039d
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 5 deletions.
6 changes: 5 additions & 1 deletion src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@ class DE(PublisherEnum):
Bild = PublisherSpec(
name="Bild",
domain="https://www.bild.de/",
sources=[RSSFeed("https://www.bild.de/rssfeeds/vw-neu/vw-neu-32001674,view=rss2.bild.xml")],
sources=[
RSSFeed("https://www.bild.de/rssfeeds/vw-neu/vw-neu-32001674,view=rss2.bild.xml"),
NewsMap("https://www.bild.de/sitemap-news.xml"),
Sitemap("https://www.bild.de/sitemap-index.xml"),
],
parser=BildParser,
)

Expand Down
23 changes: 19 additions & 4 deletions src/fundus/publishers/de/sz.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import datetime
from typing import List, Optional
from typing import List, Optional, Union

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
Expand All @@ -14,9 +15,12 @@

class SZParser(ParserProxy):
class V1(BaseParser):
_paragraph_selector = CSSSelector('main [itemprop="articleBody"] > p, ' "main .css-korpch > div > ul > li")
_summary_selector = CSSSelector("main [data-manual='teaserText']")
_subheadline_selector = CSSSelector("main [itemprop='articleBody'] > h3")
VALID_UNTIL = datetime.datetime(2024, 2, 1).date()
_paragraph_selector: Union[CSSSelector, XPath] = CSSSelector(
'main [itemprop="articleBody"] > p, ' "main .css-korpch > div > ul > li"
)
_summary_selector: Union[CSSSelector, XPath] = CSSSelector("main [data-manual='teaserText']")
_subheadline_selector: Union[CSSSelector, XPath] = CSSSelector("main [itemprop='articleBody'] > h3")

@attribute
def body(self) -> ArticleBody:
Expand All @@ -42,3 +46,14 @@ def title(self) -> Optional[str]:
@attribute
def topics(self) -> List[str]:
return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))

class V1_1(V1):
    """Variant of ``V1`` that swaps in selectors keyed on ``data-manual`` attributes.

    Only the class-level selectors and ``VALID_UNTIL`` are overridden here;
    everything else is inherited from ``V1``.
    """

    # Evaluated once, when the class body is executed (i.e. at import time).
    # NOTE(review): presumably marks this as the currently valid parser
    # version -- confirm against how ParserProxy consumes VALID_UNTIL.
    VALID_UNTIL = datetime.date.today()

    # <p data-manual="paragraph"> anywhere below the articleBody <div>.
    # The not(contains(...)) predicate drops paragraphs carrying the
    # '© dpa-infocom' notice; in XPath 1.0, contains(text(), ...) inspects
    # the first text-node child of the <p> only.
    _paragraph_selector = XPath(
        "//div[@itemprop='articleBody']"
        "//p[@data-manual='paragraph' and not(contains(text(), '© dpa-infocom'))]"
    )

    # Teaser text element located below <main>.
    _summary_selector = CSSSelector("main [data-manual='teaserText']")

    # Union (|) of <h3> and <h2> subheadlines below the articleBody <div>.
    _subheadline_selector = XPath(
        "//div[@itemprop='articleBody']//h3[@data-manual='subheadline']"
        "|//div[@itemprop='articleBody']//h2[@data-manual='subheadline']"
    )
13 changes: 13 additions & 0 deletions tests/resources/parser/test_data/de/SZ.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,18 @@
"Landkreis München",
"Süddeutsche Zeitung"
]
},
"V1_1": {
"authors": [
"Süddeutsche Zeitung"
],
"publishing_date": "2024-02-06 21:19:20+01:00",
"title": "Ex-Präsident stirbt bei Hubschrauberabsturz",
"topics": [
"Kommentarfunktion",
"Politik",
"Politik",
"Süddeutsche Zeitung"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/de/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
"url": "https://www.sueddeutsche.de/muenchen/landkreismuenchen/ranking-fahrradfreundlichkeit-fleissige-bundestagsabgeordnete-kolumne-1.5827206",
"crawl_date": "2023-04-28 20:21:15.488026"
},
"SZ_2024_02_06.html.gz": {
"url": "https://www.sueddeutsche.de/politik/chile-ex-praesident-stirbt-bei-hubschrauberabsturz-1.6345022",
"crawl_date": "2024-02-06 21:46:40.133150"
},
"SpiegelOnline_2023_04_28.html.gz": {
"url": "https://www.spiegel.de/wirtschaft/unternehmen/silicon-valley-bank-federal-reserve-raeumt-versaeumnisse-bei-aufsicht-ein-a-60a0111a-c2d2-46c8-9e93-b16a374e6ba8#ref=rss",
"crawl_date": "2023-04-28 20:21:16.250027"
Expand Down

0 comments on commit ce6039d

Please sign in to comment.