diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py index 9a1e93bfd..3fe464dc2 100644 --- a/src/fundus/publishers/de/mdr.py +++ b/src/fundus/publishers/de/mdr.py @@ -3,6 +3,7 @@ from typing import List, Optional, Pattern from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute from fundus.parser.utility import ( @@ -17,7 +18,13 @@ class MDRParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von") - _paragraph_selector = CSSSelector("div.paragraph") + # regex examples: https://regex101.com/r/2DSjAz/1 + _source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-z]{2,3}(\/[A-z]{2,3})*|[A-z, ]{2,50}))\)|(Quell(e|en): (u.a. ){0,1}[A-z,]{3,4})|[A-z]{2,4}(, [A-z]{2,4}){0,3}( \([A-z]{2,4}\)){0,1}$|[A-z]{2,4}\/[A-z(), \/]{3,10}$)" + _paragraph_selector = XPath( + f"//div[@class='paragraph '] " + f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]", + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) _summary_selector = CSSSelector("p.einleitung") _subheadline_selector = CSSSelector("div > .subtitle") _author_selector = CSSSelector(".articleMeta > .author") @@ -33,7 +40,10 @@ def body(self) -> ArticleBody: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + if self.precomputed.meta.get("news_keywords") is not None: + return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + else: + return generic_topic_parsing(self.precomputed.meta.get("keywords")) @attribute def publishing_date(self) -> Optional[datetime.datetime]: