Skip to content

Commit

Permalink
Merge pull request #370 from flairNLP/update-mdr
Browse files Browse the repository at this point in the history
Updated MDR Parser
  • Loading branch information
addie9800 authored Mar 8, 2024
2 parents 995f278 + 66538e6 commit abcf816
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions src/fundus/publishers/de/mdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Optional, Pattern

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
Expand All @@ -17,7 +18,13 @@
class MDRParser(ParserProxy):
class V1(BaseParser):
# Pattern handed to the author-parsing helper to strip MDR editorial suffixes
# ("MDR <word>", "MDR <word>-<word>", "MDRfragt-Redaktionsteam") and a leading "von".
_author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
# regex examples: https://regex101.com/r/2DSjAz/1
# Detects trailing source/agency attribution lines such as "MDR (xx/yy)", "Quelle: ..."
# or short comma-/slash-separated abbreviations, so they can be excluded from the body.
# NOTE: letter classes are spelled [A-Za-z] on purpose. The range [A-z] would also
# match the ASCII characters between "Z" and "a" ("[", "\", "]", "^", "_", "`").
# The dots in "u.a." are escaped so that they only match literal dots.
_source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-Za-z]{2,3}(\/[A-Za-z]{2,3})*|[A-Za-z, ]{2,50}))\)|(Quell(e|en): (u\.a\. ){0,1}[A-Za-z,]{3,4})|[A-Za-z]{2,4}(, [A-Za-z]{2,4}){0,3}( \([A-Za-z]{2,4}\)){0,1}$|[A-Za-z]{2,4}\/[A-Za-z(), \/]{3,10}$)"
# Selects every <p> directly below a <div> whose class attribute is exactly
# "paragraph " (including the trailing space), skipping paragraphs whose <em> child
# or own text matches the source-attribution pattern above. The pattern is
# interpolated into the XPath expression and evaluated by the EXSLT regex extension.
_paragraph_selector = XPath(
    f"//div[@class='paragraph '] "
    f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
    namespaces={"re": "http://exslt.org/regular-expressions"},
)
# CSS selectors for the remaining structural parts of an article page.
_summary_selector = CSSSelector("p.einleitung")
_subheadline_selector = CSSSelector("div > .subtitle")
_author_selector = CSSSelector(".articleMeta > .author")
Expand All @@ -33,7 +40,10 @@ def body(self) -> ArticleBody:

@attribute
def topics(self) -> List[str]:
    """Return the article topics parsed from the page's keyword meta tags.

    The ``news_keywords`` meta entry is preferred; when it is absent, the
    ``keywords`` entry is used instead. The selected value (which may still
    be ``None`` if neither entry exists) is passed to
    ``generic_topic_parsing``.
    """
    meta = self.precomputed.meta
    keywords = meta.get("news_keywords")
    if keywords is None:
        # Fall back to the generic keywords tag when no news-specific one exists.
        keywords = meta.get("keywords")
    return generic_topic_parsing(keywords)

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
Expand Down

0 comments on commit abcf816

Please sign in to comment.