flairNLP · MaxDall · Feb 12, 2024 · Jan 30, 2024 · Feb 1, 2024 · Feb 6, 2024
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
@@ -132,10 +132,22 @@ def get_meta_content(tree: lxml.html.HtmlElement) -> Dict[str, str]:
     return meta
 
 
-def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement]) -> Optional[str]:
+def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str = "\n\n") -> Optional[str]:
+    """Join the text content of ``text_nodes`` into a single string.
+
+    Args:
+        text_nodes: Nodes whose ``text_content()`` is collected, in order.
+        join_on: Separator placed between the texts of consecutive nodes.
+
+    Returns:
+        The joined text with surrounding whitespace stripped, or ``None`` if
+        ``text_nodes`` is empty.
+    """
     if not text_nodes:
         return None
-    return "\n\n".join(([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes])).strip()
+    # Runs of newlines inside a node are collapsed to one space before joining.
+    flattened = (re.sub(r"\n+", " ", node.text_content()) for node in text_nodes)
+    return join_on.join(flattened).strip()
 
 
 def apply_substitution_pattern_over_list(

diff --git a/src/fundus/publishers/de/dw.py b/src/fundus/publishers/de/dw.py
@@ -13,12 +13,19 @@
     generic_date_parsing,
     generic_text_extraction_with_css,
     generic_topic_parsing,
+    strip_nodes_to_text,
 )
 
 
 class DWParser(ParserProxy):
     class V2(BaseParser):
-        _paragraph_selector = CSSSelector("div.rich-text > p")
+        VALID_UNTIL = datetime.date(2024, 1, 18)
+        # e.g. "ab/cd (dpa, afp)"; [A-Za-z] since [A-z] also covers [ \ ] ^ _ ` -- https://regex101.com/r/Xsadk5/1
+        _author_regex = r"^([A-Za-z]{2,3}\/)*([A-Za-z]{2,3})\s\([A-Za-z\s,\d]*\)$"
+        _paragraph_selector = XPath(
+            f"//div[contains(@class, 'rich-text')] /p[text() and not(re:test(text(), '{_author_regex}'))]",
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
         _summary_selector = CSSSelector("header > p")
         _subheadline_selector = CSSSelector("div.rich-text > h2")
 
@@ -53,6 +60,24 @@ def title(self) -> Optional[str]:
         def topics(self) -> List[str]:
             return [node.text_content().strip() for node in self._topic_selector(self.precomputed.doc)]
 
+    class V2_1(V2):
+        # Evaluated once, when the class body is executed.
+        VALID_UNTIL = datetime.date.today()
+
+        _topic_selector = CSSSelector("header > div.kicker > span")
+
+        @attribute
+        def topics(self) -> List[str]:
+            # Topics are read from the nodes matched by ``_topic_selector``.
+            topic_nodes = self._topic_selector(self.precomputed.doc)
+            if (topic_string := strip_nodes_to_text(topic_nodes, join_on=", ")) is None:
+                return []
+            # Joining on ", " and splitting on it again also separates topics that share
+            # one node. Each piece is stripped and empty pieces are dropped, so nodes
+            # without text do not yield "" entries.
+            stripped = (piece.strip() for piece in topic_string.split(", "))
+            return [topic for topic in stripped if topic]
+
     class V1(BaseParser):
         VALID_UNTIL = datetime.date(2023, 6, 12)
 

diff --git a/tests/resources/parser/test_data/de/DW.json b/tests/resources/parser/test_data/de/DW.json
@@ -25,5 +25,16 @@
       "Long COVID",
       "Coronavirus"
     ]
+  },
+  "V2_1": {
+    "authors": [
+      "Jennifer Pahlke"
+    ],
+    "publishing_date": "2024-01-30 14:13:12.269000+00:00",
+    "title": "Russland-Wahl: Nadeschdin setzt auf ein Ende der Putin-Ära",
+    "topics": [
+      "Politik",
+      "Russische Föderation"
+    ]
   }
 }
diff --git a/tests/resources/parser/test_data/de/DW_2024_01_30.html.gz b/tests/resources/parser/test_data/de/DW_2024_01_30.html.gz
diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info
@@ -15,6 +15,10 @@
     "url": "https://www.dw.com/de/post-vac-syndrom-nach-covid-19-impfung-was-wissen-wir/a-65897191?maca=de-rss-de-all-1119-xml-mrss",
     "crawl_date": "2023-06-13 16:57:08.558047"
   },
+  "DW_2024_01_30.html.gz": {
+    "url": "https://www.dw.com/de/russland-wahl-nadeschdin-setzt-auf-ein-ende-der-putin-%C3%A4ra/a-68117411?maca=de-rss-de-all-1119-xml-mrss",
+    "crawl_date": "2024-01-30 16:12:24.867249"
+  },
   "DieWelt_2023_04_28.html.gz": {
     "url": "https://www.welt.de/wirtschaft/article245055596/BIP-Diese-Grafiken-zeigen-wie-schlecht-es-um-Deutschlands-Wirtschaft-steht.html",
     "crawl_date": "2023-04-28 20:22:32.033988"