Merge branch 'master' into add-taipei-times
# Conflicts:
#	src/fundus/publishers/__init__.py
MaxDall committed Jan 3, 2025
2 parents a378007 + 9592e2b commit 2793b1b
Showing 23 changed files with 499 additions and 24 deletions.
51 changes: 51 additions & 0 deletions docs/supported_publishers.md
@@ -227,6 +227,23 @@
</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>ZwanzigMinuten</code>
</td>
<td>
<div>Zwanzig Minuten</div>
</td>
<td>
<a href="https://www.20min.ch/">
<span>www.20min.ch</span>
</a>
</td>
<td>
<code>topics</code>
</td>
<td>&#160;</td>
</tr>
</tbody>
</table>

@@ -1127,6 +1144,40 @@
</table>


## IT-Publishers

<table class="publishers it">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>LaRepubblica</code>
</td>
<td>
<div>La Repubblica</div>
</td>
<td>
<a href="https://www.repubblica.it">
<span>www.repubblica.it</span>
</a>
</td>
<td>
<code>images</code>
</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## JP-Publishers

<table class="publishers jp">
2 changes: 1 addition & 1 deletion src/fundus/parser/data.py
@@ -212,7 +212,7 @@ def to_original_characters(text: str) -> str:
            xml = f"<result{i}>" + node2string(node) + f"</result{i}>"
            results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters))

-        values = list(results.values())
+        values = list(filter(bool, results.values()))

        if scalar:
            if not values:
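The replacement line drops falsy entries (for example `None` or empty strings left over from `xmltodict` parsing) before the scalar/list handling below. A minimal, self-contained sketch of the difference, using a hypothetical `results` dict:

```python
# Hypothetical intermediate results as xmltodict might produce them; only the
# effect of filter(bool, ...) is illustrated here.
results = {"result0": {"author": "Jane Doe"}, "result1": None, "result2": ""}

values_before = list(results.values())               # [{'author': 'Jane Doe'}, None, '']
values_after = list(filter(bool, results.values()))  # [{'author': 'Jane Doe'}]
```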
13 changes: 6 additions & 7 deletions src/fundus/parser/utility.py
@@ -542,13 +542,13 @@ def get_versions_from_node(
            query_width = f"{param}:{value}"

        # get width, height and init calculator
-        try:
-            width = float(source.get("width") or 0) or None
-        except ValueError:
+        if (src_width := source.get("width")) and src_width.replace(".", "", 1).isdigit():
+            width = float(src_width or 0) or None
+        else:
            width = None
-        try:
-            height = float(source.get("height") or 0) or None
-        except ValueError:
+        if (src_height := source.get("height")) and src_height.replace(".", "", 1).isdigit():
+            height = float(src_height or 0) or None
+        else:
            height = None
        if width and height:
            ratio = width / height
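The rewritten blocks validate the raw attribute string before calling `float()` instead of catching `ValueError`. A standalone sketch of the same check, with a hypothetical helper name:

```python
def parse_dimension(raw):
    # Hypothetical helper mirroring the change above: accept plain integers or
    # decimals ("640", "640.5") and reject everything else without try/except.
    if raw and raw.replace(".", "", 1).isdigit():
        return float(raw) or None
    return None

print(parse_dimension("640"))    # 640.0
print(parse_dimension("640.5"))  # 640.5
print(parse_dimension("auto"))   # None
print(parse_dimension("100%"))   # None
print(parse_dimension(None))     # None
```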
@@ -654,7 +654,6 @@ def nodes_to_text(nodes: List[Union[lxml.html.HtmlElement, str]]) -> Optional[st
            caption = re.sub(author_selector, "", caption).strip() or None
        elif description and (match := re.search(author_selector, description)):
            authors = [match.group("credits")]
-            description = re.sub(author_selector, "", description).strip() or None
        else:
            # author is selectable as node
            if author_nodes := author_selector(node):
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
@@ -11,6 +11,7 @@
from fundus.publishers.es import ES
from fundus.publishers.fr import FR
from fundus.publishers.ind import IND
+from fundus.publishers.it import IT
from fundus.publishers.jp import JP
from fundus.publishers.lt import LT
from fundus.publishers.my import MY
@@ -70,4 +71,5 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
    ca = CA
    es = ES
    jp = JP
+    it = IT
    tw = TW
12 changes: 12 additions & 0 deletions src/fundus/publishers/ch/__init__.py
@@ -5,6 +5,7 @@
from .nzz import NZZParser
from .srf import SRFParser
from .ta import TAParser
+from .zwanzig_minuten import ZwanzigMinutenParser

# noinspection PyPep8Naming

@@ -44,3 +45,14 @@ class CH(metaclass=PublisherGroup):
            ),
        ],
    )
+    ZwanzigMinuten = Publisher(
+        name="Zwanzig Minuten",
+        domain="https://www.20min.ch/",
+        parser=ZwanzigMinutenParser,
+        sources=[
+            NewsMap("https://www.20min.ch/sitemaps/de/news.xml"),
+            Sitemap("https://www.20min.ch/sitemaps/de/articles.xml"),
+            NewsMap("https://www.20min.ch/sitemaps/fr/news.xml"),
+            Sitemap("https://www.20min.ch/sitemaps/fr/articles.xml"),
+        ],
+    )
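With the publisher registered, it should be reachable through the collection in the usual Fundus crawling workflow; a sketch, not part of this commit, assuming the standard `Crawler`/`PublisherCollection` API:

```python
from fundus import Crawler, PublisherCollection

# Crawl a couple of articles from the newly added Swiss publisher.
crawler = Crawler(PublisherCollection.ch.ZwanzigMinuten)
for article in crawler.crawl(max_articles=2):
    print(article)
```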
52 changes: 52 additions & 0 deletions src/fundus/publishers/ch/zwanzig_minuten.py
@@ -0,0 +1,52 @@
import datetime
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    image_extraction,
)


class ZwanzigMinutenParser(ParserProxy):
    class V1(BaseParser):
        _summary_selector = XPath(
            "//div[@class='Article_elementLead__N3pGr']/p | (//div[@type='typeInfoboxSummary'])[1]//li"
        )
        _subheadline_selector = XPath("//section[@class='Article_body__60Liu']//h2[contains(@class, 'crosshead')]")
        _paragraph_selector = XPath("//div[@class='Article_elementTextblockarray__WNyan']/p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            return self.precomputed.meta.get("og:title")

        @attribute
        def images(self) -> List[Image]:
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=XPath("//article"),
                caption_selector=XPath("./ancestor::figure//figcaption/span[@class='sc-d47814d6-2 bDLFoO']/p"),
                author_selector=XPath("./ancestor::figure//figcaption/span[@class='sc-d47814d6-3 bmEwwn']"),
            )
13 changes: 13 additions & 0 deletions src/fundus/publishers/de/boersenzeitung.py
@@ -16,6 +16,8 @@

class BoersenZeitungParser(ParserProxy):
    class V1(BaseParser):
+        VALID_UNTIL = datetime.date(2024, 12, 9)
+
        _paragraph_selector = CSSSelector("storefront-content-body .no-tts p")
        _subheadline_selector = XPath("//p[contains(@class, 'interline')]")
        _summary_selector = CSSSelector("storefront-html.excerpt > div")
@@ -66,3 +68,14 @@ def images(self) -> List[Image]:
                image_selector=XPath("//storefront-image|//figure//img"),
                author_selector=XPath("./ancestor::storefront-section//storefront-html[@class='image-copyright']"),
            )
+
+    class V1_1(V1):
+        VALID_UNTIL = datetime.date.today()
+
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.meta.get("twitter:data1"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.meta.get("article:published_time"))
24 changes: 24 additions & 0 deletions src/fundus/publishers/it/__init__.py
@@ -0,0 +1,24 @@
from datetime import datetime, timedelta

from dateutil.rrule import MONTHLY, rrule

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
from fundus.scraping.url import RSSFeed, Sitemap


class IT(metaclass=PublisherGroup):
    LaRepubblica = Publisher(
        name="La Repubblica",
        domain="https://www.repubblica.it",
        parser=LaRepubblicaParser,
        sources=[
            RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml"),
        ]
        + [
            Sitemap(f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml")
            for date in reversed(
                list(rrule(MONTHLY, dtstart=datetime(2020, 1, 1), until=datetime.now() + timedelta(days=30)))
            )
        ],
    )
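The `Sitemap` sources are built from a monthly `rrule`; run on its own, the same comprehension expands to one URL per month from January 2020 up to roughly one month past the current date, newest first:

```python
from datetime import datetime, timedelta

from dateutil.rrule import MONTHLY, rrule

dates = reversed(
    list(rrule(MONTHLY, dtstart=datetime(2020, 1, 1), until=datetime.now() + timedelta(days=30)))
)
urls = [f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml" for date in dates]
# Newest month first; the exact first entry depends on when this runs.
print(urls[:3])
```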
55 changes: 55 additions & 0 deletions src/fundus/publishers/it/la_repubblica.py
@@ -0,0 +1,55 @@
from datetime import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    generic_topic_parsing,
)


class LaRepubblicaParser(ParserProxy):
    class V1(BaseParser):
        # Selectors for article body parts
        _summary_selector = CSSSelector("div.story__summary p")
        _paragraph_selector = CSSSelector("div.story__text p")
        _subheadline_selector = CSSSelector("div.story__text h2")

        @attribute
        def title(self) -> Optional[str]:
            # Get the headline from og:title meta tag
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            # Extract article body using utility function
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            # Extract authors from schema.org NewsArticle data
            authors = self.precomputed.ld.xpath_search("//NewsArticle/author")
            if authors:
                return generic_author_parsing(authors)
            return []

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            # Use scalar parameter for direct value
            date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(date_str)

        @attribute
        def topics(self) -> List[str]:
            # Simplified topic extraction using name in xpath
            topics = self.precomputed.ld.xpath_search("//NewsArticle/about/name")
            return generic_topic_parsing(topics) if topics else []
14 changes: 9 additions & 5 deletions src/fundus/publishers/uk/the_bbc.py
@@ -16,12 +16,16 @@

class TheBBCParser(ParserProxy):
    class V1(BaseParser):
-        _subheadline_selector = CSSSelector("div[data-component='subheadline-block']")
-        _summary_selector = XPath("//div[@data-component='text-block'][1] //p[b]")
+        _subheadline_selector = XPath(
+            "//div[@data-component='subheadline-block' or @data-component='text-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]"
+        )
+        _summary_selector = XPath(
+            "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"
+        )
        _paragraph_selector = XPath(
-            "//div[@data-component='text-block'][1]//p[not(b) and text()] |"
-            "//div[@data-component='text-block'][position()>1] //p[text()] |"
-            "//div[@data-component='text-block'] //ul /li[text()]"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]"
        )

        _topic_selector = CSSSelector(
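The broadened selectors also match BBC layouts that mark blocks with the `ebmt73l0` class instead of `data-component` attributes. A quick sanity check against a minimal, made-up snippet (not a real BBC page):

```python
from lxml import html
from lxml.etree import XPath

snippet = html.fromstring(
    "<article>"
    "<div data-component='text-block'><p><b>Bold summary sentence.</b></p></div>"
    "<div class='ebmt73l0'><p>First body paragraph.</p></div>"
    "<div data-component='subheadline-block'><h2>A subheadline</h2></div>"
    "</article>"
)

summary_selector = XPath(
    "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"
)
paragraph_selector = XPath(
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |"
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |"
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]"
)

print([node.text_content() for node in summary_selector(snippet)])    # ['Bold summary sentence.']
print([node.text_content() for node in paragraph_selector(snippet)])  # ['First body paragraph.']
```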