From fe4a66319251651a2767a6ea62041d3e225c577c Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Fri, 23 Feb 2024 17:52:22 +0100
Subject: [PATCH 1/3] Update MDR paragraph selector and fall back to "keywords" meta for topics

---
 src/fundus/publishers/de/mdr.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py
index 9a1e93bfd..c5a5ef192 100644
--- a/src/fundus/publishers/de/mdr.py
+++ b/src/fundus/publishers/de/mdr.py
@@ -3,6 +3,7 @@
 from typing import List, Optional, Pattern
 
 from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
 
 from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
 from fundus.parser.utility import (
@@ -17,7 +18,15 @@
 class MDRParser(ParserProxy):
     class V1(BaseParser):
         _author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
-        _paragraph_selector = CSSSelector("div.paragraph")
+        _source_detection: str = (
+            r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))"
+        )
+        _paragraph_selector = XPath(
+            f"//div[@class='paragraph '"
+            f" and not(div[@class='mediaCon mediaLeft mediaSizeA cssImage hasNoRessort item-delegated-lightbox'])"
+            f" ]//p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
         _summary_selector = CSSSelector("p.einleitung")
         _subheadline_selector = CSSSelector("div > .subtitle")
         _author_selector = CSSSelector(".articleMeta > .author")
@@ -33,7 +42,10 @@ def body(self) -> ArticleBody:
 
         @attribute
         def topics(self) -> List[str]:
-            return generic_topic_parsing(self.precomputed.meta.get("news_keywords"))
+            # Prefer the "news_keywords" meta entry; use "keywords" only when it is missing.
+            meta = self.precomputed.meta
+            news_keywords = meta.get("news_keywords")
+            return generic_topic_parsing(meta.get("keywords") if news_keywords is None else news_keywords)
 
         @attribute
         def publishing_date(self) -> Optional[datetime.datetime]:

From ff8ef15bbb9c077373015f5c85093a5a7f06bf43 Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Sun, 25 Feb 2024 14:36:26 +0100
Subject: [PATCH 2/3] Simplify MDR paragraph selector XPath

Co-authored-by: Max Dallabetta <max.dallabetta@googlemail.com>
---
 src/fundus/publishers/de/mdr.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py
index c5a5ef192..11144085a 100644
--- a/src/fundus/publishers/de/mdr.py
+++ b/src/fundus/publishers/de/mdr.py
@@ -22,9 +22,8 @@ class V1(BaseParser):
             r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))"
         )
         _paragraph_selector = XPath(
-            f"//div[@class='paragraph '"
-            f" and not(div[@class='mediaCon mediaLeft mediaSizeA cssImage hasNoRessort item-delegated-lightbox'])"
-            f" ]//p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
+            f"//div[@class='paragraph '] "
+            f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
             namespaces={"re": "http://exslt.org/regular-expressions"},
         )
         _summary_selector = CSSSelector("p.einleitung")

From 66538e67d49333586a1a8967600ad8bf1448b23e Mon Sep 17 00:00:00 2001
From: Adrian Breiding <ad123br@gmail.com>
Date: Sat, 2 Mar 2024 16:23:19 +0100
Subject: [PATCH 3/3] update regex - add regex101 link

---
 src/fundus/publishers/de/mdr.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py
index 11144085a..3fe464dc2 100644
--- a/src/fundus/publishers/de/mdr.py
+++ b/src/fundus/publishers/de/mdr.py
@@ -18,9 +18,8 @@
 class MDRParser(ParserProxy):
     class V1(BaseParser):
         _author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
-        _source_detection: str = (
-            r"((MDR AKTUELL \(ans\))|(Quell(e|en): [A-z\.]{3,4})|(\([A-z]{1,4}\))|([A-z]{3}/[A-z]{3}))"
-        )
+        # Matches source/agency credit paragraphs (e.g. "Quelle: dpa") to drop; examples: https://regex101.com/r/2DSjAz/1
+        _source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-Za-z]{2,3}(\/[A-Za-z]{2,3})*|[A-Za-z, ]{2,50}))\)|(Quell(e|en): (u\.a\. ){0,1}[A-Za-z,]{3,4})|[A-Za-z]{2,4}(, [A-Za-z]{2,4}){0,3}( \([A-Za-z]{2,4}\)){0,1}$|[A-Za-z]{2,4}\/[A-Za-z(), \/]{3,10}$)"
         _paragraph_selector = XPath(
             f"//div[@class='paragraph '] "
             f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",