diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 20289581..14f7de01 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -1205,6 +1205,21 @@
+AsahiShimbun
+ TheJapanNews
diff --git a/src/fundus/publishers/jp/__init__.py b/src/fundus/publishers/jp/__init__.py
index 4b61586a..071d645d 100644
--- a/src/fundus/publishers/jp/__init__.py
+++ b/src/fundus/publishers/jp/__init__.py
@@ -1,4 +1,5 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
+from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
@@ -28,3 +29,10 @@ class JP(metaclass=PublisherGroup):
NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"),
],
)
+
+ AsahiShimbun = Publisher(  # new publisher entry for Asahi Shimbun, added after the existing entries of this group
+ name="Asahi Shimbun",
+ domain="https://www.asahi.com/",
+ parser=AsahiShimbunParser,  # parser class imported from fundus.publishers.jp.asahi_shimbun
+ sources=[NewsMap("https://www.asahi.com/sitemap.xml")],  # single source; NOTE(review): confirm this URL serves a news sitemap and not a plain/index sitemap
+ )
diff --git a/src/fundus/publishers/jp/asahi_shimbun.py b/src/fundus/publishers/jp/asahi_shimbun.py
new file mode 100644
index 00000000..96f33937
--- /dev/null
+++ b/src/fundus/publishers/jp/asahi_shimbun.py
@@ -0,0 +1,60 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+ apply_substitution_pattern_over_list,
+ extract_article_body_with_selector,
+ generic_author_parsing,
+ generic_date_parsing,
+ generic_topic_parsing,
+ image_extraction,
+)
+
+
+class AsahiShimbunParser(ParserProxy):
+ class V1(BaseParser):
+ _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p")  # passed to body() as summary_selector; NOTE(review): class names such as nfyQp look auto-generated and may change when the site is rebuilt - confirm
+ _paragraph_selector = CSSSelector("div.nfyQp > p")  # passed as paragraph_selector in both body() and images()
+ _subtitle_selector = CSSSelector("div.nfyQp > h2")  # passed to body() as subheadline_selector
+
+ topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")  # alternation of "Asahi Shimbun Digital", "Asahi Shimbun", "news", "newspaper", "others/topics"; used by topics() as the substitution pattern
+
+ @attribute
+ def body(self) -> Optional[ArticleBody]:  # article body built from the precomputed document
+ return extract_article_body_with_selector(  # all extraction is delegated to the shared fundus utility
+ self.precomputed.doc,
+ paragraph_selector=self._paragraph_selector,  # "div.nfyQp > p"
+ summary_selector=self._summary_selector,  # "div.nfyQp > div.bv2Sj > p"
+ subheadline_selector=self._subtitle_selector,  # "div.nfyQp > h2"
+ )
+
+ @attribute
+ def title(self) -> Optional[str]:  # headline of the article, taken from page metadata
+ return self.precomputed.meta.get("TITLE")  # looks up the upper-case "TITLE" key; presumably yields None when the key is missing (meta looks like a dict - confirm)
+
+ @attribute
+ def publishing_date(self) -> Optional[datetime.datetime]:  # publication timestamp, if one can be parsed
+ return generic_date_parsing(self.precomputed.meta.get("article:published_time"))  # feeds the "article:published_time" meta value to the shared date parser
+
+ @attribute
+ def authors(self) -> List[str]:  # author names taken from the precomputed `ld` data
+ return generic_author_parsing(self.precomputed.ld.bf_search("author"))  # searches `ld` for an "author" entry and normalises it via the shared helper; presumably `ld` is the page's JSON-LD - confirm
+
+ @attribute
+ def topics(self) -> List[str]:  # topic list derived from the "keywords" meta entry
+ return apply_substitution_pattern_over_list(  # applies topic_bloat_pattern across the parsed topics; presumably strips the matched publisher/boilerplate terms - confirm against the helper
+ generic_topic_parsing(self.precomputed.meta.get("keywords")), self.topic_bloat_pattern
+ )
+
+ @attribute
+ def images(self) -> List[Image]:
+ return image_extraction(
+ doc=self.precomputed.doc,
+ paragraph_selector=self._paragraph_selector,
+ author_selector=re.compile(r"、(?P