Merge branch 'master' into add-taipei-times
# Conflicts:
#	src/fundus/publishers/__init__.py
MaxDall committed Jan 3, 2025
2 parents a378007 + 9592e2b commit 2793b1b
Showing 23 changed files with 499 additions and 24 deletions.
51 changes: 51 additions & 0 deletions docs/supported_publishers.md
@@ -227,6 +227,23 @@
</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>ZwanzigMinuten</code>
</td>
<td>
<div>Zwanzig Minuten</div>
</td>
<td>
<a href="https://www.20min.ch/">
<span>www.20min.ch</span>
</a>
</td>
<td>
<code>topics</code>
</td>
<td>&#160;</td>
</tr>
</tbody>
</table>

@@ -1127,6 +1144,40 @@
</table>


## IT-Publishers

<table class="publishers it">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Missing&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>LaRepubblica</code>
</td>
<td>
<div>La Repubblica</div>
</td>
<td>
<a href="https://www.repubblica.it">
<span>www.repubblica.it</span>
</a>
</td>
<td>
<code>images</code>
</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## JP-Publishers

<table class="publishers jp">
2 changes: 1 addition & 1 deletion src/fundus/parser/data.py
@@ -212,7 +212,7 @@ def to_original_characters(text: str) -> str:
            xml = f"<result{i}>" + node2string(node) + f"</result{i}>"
            results.update(replace_keys_in_nested_dict(xmltodict.parse(xml), to_original_characters))

-        values = list(results.values())
+        values = list(filter(bool, results.values()))

        if scalar:
            if not values:
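The replacement line drops falsy entries (for example `None` or empty strings left over from `xmltodict` parsing) before the scalar/list handling below. A minimal, self-contained sketch of the difference, using a hypothetical `results` dict:

```python
# Hypothetical intermediate results as xmltodict might produce them; only the
# effect of filter(bool, ...) is illustrated here.
results = {"result0": {"author": "Jane Doe"}, "result1": None, "result2": ""}

values_before = list(results.values())               # [{'author': 'Jane Doe'}, None, '']
values_after = list(filter(bool, results.values()))  # [{'author': 'Jane Doe'}]
```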
13 changes: 6 additions & 7 deletions src/fundus/parser/utility.py
@@ -542,13 +542,13 @@ def get_versions_from_node(
            query_width = f"{param}:{value}"

        # get width, height and init calculator
-        try:
-            width = float(source.get("width") or 0) or None
-        except ValueError:
+        if (src_width := source.get("width")) and src_width.replace(".", "", 1).isdigit():
+            width = float(src_width or 0) or None
+        else:
            width = None
-        try:
-            height = float(source.get("height") or 0) or None
-        except ValueError:
+        if (src_height := source.get("height")) and src_height.replace(".", "", 1).isdigit():
+            height = float(src_height or 0) or None
+        else:
            height = None
        if width and height:
            ratio = width / height
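The rewritten blocks validate the raw attribute string before calling `float()` instead of catching `ValueError`. A standalone sketch of the same check, with a hypothetical helper name:

```python
def parse_dimension(raw):
    # Hypothetical helper mirroring the change above: accept plain integers or
    # decimals ("640", "640.5") and reject everything else without try/except.
    if raw and raw.replace(".", "", 1).isdigit():
        return float(raw) or None
    return None

print(parse_dimension("640"))    # 640.0
print(parse_dimension("640.5"))  # 640.5
print(parse_dimension("auto"))   # None
print(parse_dimension("100%"))   # None
print(parse_dimension(None))     # None
```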
@@ -654,7 +654,6 @@ def nodes_to_text(nodes: List[Union[lxml.html.HtmlElement, str]]) -> Optional[st
            caption = re.sub(author_selector, "", caption).strip() or None
        elif description and (match := re.search(author_selector, description)):
            authors = [match.group("credits")]
-            description = re.sub(author_selector, "", description).strip() or None
        else:
            # author is selectable as node
            if author_nodes := author_selector(node):
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
@@ -11,6 +11,7 @@
from fundus.publishers.es import ES
from fundus.publishers.fr import FR
from fundus.publishers.ind import IND
+from fundus.publishers.it import IT
from fundus.publishers.jp import JP
from fundus.publishers.lt import LT
from fundus.publishers.my import MY
@@ -70,4 +71,5 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
    ca = CA
    es = ES
    jp = JP
+    it = IT
    tw = TW
12 changes: 12 additions & 0 deletions src/fundus/publishers/ch/__init__.py
@@ -5,6 +5,7 @@
from .nzz import NZZParser
from .srf import SRFParser
from .ta import TAParser
+from .zwanzig_minuten import ZwanzigMinutenParser

# noinspection PyPep8Naming

@@ -44,3 +45,14 @@ class CH(metaclass=PublisherGroup):
            ),
        ],
    )
+    ZwanzigMinuten = Publisher(
+        name="Zwanzig Minuten",
+        domain="https://www.20min.ch/",
+        parser=ZwanzigMinutenParser,
+        sources=[
+            NewsMap("https://www.20min.ch/sitemaps/de/news.xml"),
+            Sitemap("https://www.20min.ch/sitemaps/de/articles.xml"),
+            NewsMap("https://www.20min.ch/sitemaps/fr/news.xml"),
+            Sitemap("https://www.20min.ch/sitemaps/fr/articles.xml"),
+        ],
+    )
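With the publisher registered, it should be reachable through the collection in the usual Fundus crawling workflow; a sketch, not part of this commit, assuming the standard `Crawler`/`PublisherCollection` API:

```python
from fundus import Crawler, PublisherCollection

# Crawl a couple of articles from the newly added Swiss publisher.
crawler = Crawler(PublisherCollection.ch.ZwanzigMinuten)
for article in crawler.crawl(max_articles=2):
    print(article)
```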
52 changes: 52 additions & 0 deletions src/fundus/publishers/ch/zwanzig_minuten.py
@@ -0,0 +1,52 @@
import datetime
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    image_extraction,
)


class ZwanzigMinutenParser(ParserProxy):
    class V1(BaseParser):
        _summary_selector = XPath(
            "//div[@class='Article_elementLead__N3pGr']/p | (//div[@type='typeInfoboxSummary'])[1]//li"
        )
        _subheadline_selector = XPath("//section[@class='Article_body__60Liu']//h2[contains(@class, 'crosshead')]")
        _paragraph_selector = XPath("//div[@class='Article_elementTextblockarray__WNyan']/p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            return self.precomputed.meta.get("og:title")

        @attribute
        def images(self) -> List[Image]:
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=XPath("//article"),
                caption_selector=XPath("./ancestor::figure//figcaption/span[@class='sc-d47814d6-2 bDLFoO']/p"),
                author_selector=XPath("./ancestor::figure//figcaption/span[@class='sc-d47814d6-3 bmEwwn']"),
            )
13 changes: 13 additions & 0 deletions src/fundus/publishers/de/boersenzeitung.py
@@ -16,6 +16,8 @@

class BoersenZeitungParser(ParserProxy):
    class V1(BaseParser):
+        VALID_UNTIL = datetime.date(2024, 12, 9)
+
        _paragraph_selector = CSSSelector("storefront-content-body .no-tts p")
        _subheadline_selector = XPath("//p[contains(@class, 'interline')]")
        _summary_selector = CSSSelector("storefront-html.excerpt > div")
@@ -66,3 +68,14 @@ def images(self) -> List[Image]:
                image_selector=XPath("//storefront-image|//figure//img"),
                author_selector=XPath("./ancestor::storefront-section//storefront-html[@class='image-copyright']"),
            )
+
+    class V1_1(V1):
+        VALID_UNTIL = datetime.date.today()
+
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.meta.get("twitter:data1"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.meta.get("article:published_time"))
24 changes: 24 additions & 0 deletions src/fundus/publishers/it/__init__.py
@@ -0,0 +1,24 @@
from datetime import datetime, timedelta

from dateutil.rrule import MONTHLY, rrule

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
from fundus.scraping.url import RSSFeed, Sitemap


class IT(metaclass=PublisherGroup):
    LaRepubblica = Publisher(
        name="La Repubblica",
        domain="https://www.repubblica.it",
        parser=LaRepubblicaParser,
        sources=[
            RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml"),
        ]
        + [
            Sitemap(f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml")
            for date in reversed(
                list(rrule(MONTHLY, dtstart=datetime(2020, 1, 1), until=datetime.now() + timedelta(days=30)))
            )
        ],
    )
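The `Sitemap` sources are built from a monthly `rrule`; run on its own, the same comprehension expands to one URL per month from January 2020 up to roughly one month past the current date, newest first:

```python
from datetime import datetime, timedelta

from dateutil.rrule import MONTHLY, rrule

dates = reversed(
    list(rrule(MONTHLY, dtstart=datetime(2020, 1, 1), until=datetime.now() + timedelta(days=30)))
)
urls = [f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml" for date in dates]
# Newest month first; the exact first entry depends on when this runs.
print(urls[:3])
```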
55 changes: 55 additions & 0 deletions src/fundus/publishers/it/la_repubblica.py
@@ -0,0 +1,55 @@
from datetime import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    generic_topic_parsing,
)


class LaRepubblicaParser(ParserProxy):
    class V1(BaseParser):
        # Selectors for article body parts
        _summary_selector = CSSSelector("div.story__summary p")
        _paragraph_selector = CSSSelector("div.story__text p")
        _subheadline_selector = CSSSelector("div.story__text h2")

        @attribute
        def title(self) -> Optional[str]:
            # Get the headline from og:title meta tag
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            # Extract article body using utility function
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            # Extract authors from schema.org NewsArticle data
            authors = self.precomputed.ld.xpath_search("//NewsArticle/author")
            if authors:
                return generic_author_parsing(authors)
            return []

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            # Use scalar parameter for direct value
            date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(date_str)

        @attribute
        def topics(self) -> List[str]:
            # Simplified topic extraction using name in xpath
            topics = self.precomputed.ld.xpath_search("//NewsArticle/about/name")
            return generic_topic_parsing(topics) if topics else []
14 changes: 9 additions & 5 deletions src/fundus/publishers/uk/the_bbc.py
@@ -16,12 +16,16 @@

class TheBBCParser(ParserProxy):
    class V1(BaseParser):
-        _subheadline_selector = CSSSelector("div[data-component='subheadline-block']")
-        _summary_selector = XPath("//div[@data-component='text-block'][1] //p[b]")
+        _subheadline_selector = XPath(
+            "//div[@data-component='subheadline-block' or @data-component='text-block' or contains(@class, 'ebmt73l0')]//*[self::h2 or (self::p and b and position()>1)]"
+        )
+        _summary_selector = XPath(
+            "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"
+        )
        _paragraph_selector = XPath(
-            "//div[@data-component='text-block'][1]//p[not(b) and text()] |"
-            "//div[@data-component='text-block'][position()>1] //p[text()] |"
-            "//div[@data-component='text-block'] //ul /li[text()]"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |"
+            "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]"
        )

        _topic_selector = CSSSelector(
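The broadened selectors also match BBC layouts that mark blocks with the `ebmt73l0` class instead of `data-component` attributes. A quick sanity check against a minimal, made-up snippet (not a real BBC page):

```python
from lxml import html
from lxml.etree import XPath

snippet = html.fromstring(
    "<article>"
    "<div data-component='text-block'><p><b>Bold summary sentence.</b></p></div>"
    "<div class='ebmt73l0'><p>First body paragraph.</p></div>"
    "<div data-component='subheadline-block'><h2>A subheadline</h2></div>"
    "</article>"
)

summary_selector = XPath(
    "(//div[@data-component='text-block' or contains(@class, 'ebmt73l0')])[1] //p[b and position()=1]"
)
paragraph_selector = XPath(
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][1]//p[not(b) and text()] |"
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')][position()>1] //p[text()] |"
    "//div[@data-component='text-block' or contains(@class, 'ebmt73l0')] //ul /li[text()]"
)

print([node.text_content() for node in summary_selector(snippet)])    # ['Bold summary sentence.']
print([node.text_content() for node in paragraph_selector(snippet)])  # ['First body paragraph.']
```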