From fe4a66319251651a2767a6ea62041d3e225c577c Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Fri, 23 Feb 2024 17:52:22 +0100 Subject: [PATCH 1/3] update paragraph selector --- src/fundus/publishers/de/mdr.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py index 9a1e93bfd..c5a5ef192 100644 --- a/src/fundus/publishers/de/mdr.py +++ b/src/fundus/publishers/de/mdr.py @@ -3,6 +3,7 @@ from typing import List, Optional, Pattern from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute from fundus.parser.utility import ( @@ -17,7 +18,15 @@ class MDRParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von") - _paragraph_selector = CSSSelector("div.paragraph") + _source_detection: str = ( + r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))" + ) + _paragraph_selector = XPath( + f"//div[@class='paragraph '" + f" and not(div[@class='mediaCon mediaLeft mediaSizeA cssImage hasNoRessort item-delegated-lightbox'])" + f" ]//p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]", + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) _summary_selector = CSSSelector("p.einleitung") _subheadline_selector = CSSSelector("div > .subtitle") _author_selector = CSSSelector(".articleMeta > .author") @@ -33,7 +42,10 @@ def body(self) -> ArticleBody: @attribute def topics(self) -> List[str]: - return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + if self.precomputed.meta.get("news_keywords") is not None: + return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + else: + return generic_topic_parsing(self.precomputed.meta.get("keywords")) @attribute def publishing_date(self) -> 
Optional[datetime.datetime]: From ff8ef15bbb9c077373015f5c85093a5a7f06bf43 Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sun, 25 Feb 2024 14:36:26 +0100 Subject: [PATCH 2/3] Update src/fundus/publishers/de/mdr.py Co-authored-by: Max Dallabetta --- src/fundus/publishers/de/mdr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py index c5a5ef192..11144085a 100644 --- a/src/fundus/publishers/de/mdr.py +++ b/src/fundus/publishers/de/mdr.py @@ -22,9 +22,8 @@ class V1(BaseParser): r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))" ) _paragraph_selector = XPath( - f"//div[@class='paragraph '" - f" and not(div[@class='mediaCon mediaLeft mediaSizeA cssImage hasNoRessort item-delegated-lightbox'])" - f" ]//p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]", + f"//div[@class='paragraph '] " + f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]", namespaces={"re": "http://exslt.org/regular-expressions"}, ) _summary_selector = CSSSelector("p.einleitung") From 66538e67d49333586a1a8967600ad8bf1448b23e Mon Sep 17 00:00:00 2001 From: Adrian Breiding Date: Sat, 2 Mar 2024 16:23:19 +0100 Subject: [PATCH 3/3] update regex - add regex101 link --- src/fundus/publishers/de/mdr.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py index 11144085a..3fe464dc2 100644 --- a/src/fundus/publishers/de/mdr.py +++ b/src/fundus/publishers/de/mdr.py @@ -18,9 +18,8 @@ class MDRParser(ParserProxy): class V1(BaseParser): _author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von") - _source_detection: str = ( - r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))" - ) + # regex examples: https://regex101.com/r/2DSjAz/1 + 
_source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-Za-z]{2,3}(\/[A-Za-z]{2,3})*|[A-Za-z, ]{2,50}))\)|(Quell(e|en): (u\.a\. ){0,1}[A-Za-z,]{3,4})|[A-Za-z]{2,4}(, [A-Za-z]{2,4}){0,3}( \([A-Za-z]{2,4}\)){0,1}$|[A-Za-z]{2,4}\/[A-Za-z(), \/]{3,10}$)" _paragraph_selector = XPath( f"//div[@class='paragraph '] " f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",