diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 20289581..14f7de01 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -1205,6 +1205,21 @@ + + + AsahiShimbun + + +
Asahi Shimbun
+ + + + www.asahi.com + + +   +   + TheJapanNews diff --git a/src/fundus/publishers/jp/__init__.py b/src/fundus/publishers/jp/__init__.py index 4b61586a..071d645d 100644 --- a/src/fundus/publishers/jp/__init__.py +++ b/src/fundus/publishers/jp/__init__.py @@ -1,4 +1,5 @@ from fundus.publishers.base_objects import Publisher, PublisherGroup +from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser from fundus.publishers.jp.the_japan_news import TheJapanNewsParser from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser from fundus.scraping.filter import regex_filter @@ -28,3 +29,10 @@ class JP(metaclass=PublisherGroup): NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"), ], ) + + AsahiShimbun = Publisher( + name="Asahi Shimbun", + domain="https://www.asahi.com/", + parser=AsahiShimbunParser, + sources=[NewsMap("https://www.asahi.com/sitemap.xml")], + ) diff --git a/src/fundus/publishers/jp/asahi_shimbun.py b/src/fundus/publishers/jp/asahi_shimbun.py new file mode 100644 index 00000000..96f33937 --- /dev/null +++ b/src/fundus/publishers/jp/asahi_shimbun.py @@ -0,0 +1,60 @@ +import datetime +import re +from typing import List, Optional + +from lxml.cssselect import CSSSelector + +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute +from fundus.parser.utility import ( + apply_substitution_pattern_over_list, + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, + generic_topic_parsing, + image_extraction, +) + + +class AsahiShimbunParser(ParserProxy): + class V1(BaseParser): + _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p") + _paragraph_selector = CSSSelector("div.nfyQp > p") + _subtitle_selector = CSSSelector("div.nfyQp > h2") + + topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題") + + @attribute + def body(self) -> Optional[ArticleBody]: + return extract_article_body_with_selector( + self.precomputed.doc, + 
paragraph_selector=self._paragraph_selector, + summary_selector=self._summary_selector, + subheadline_selector=self._subtitle_selector, + ) + + @attribute + def title(self) -> Optional[str]: + return self.precomputed.meta.get("TITLE") + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.meta.get("article:published_time")) + + @attribute + def authors(self) -> List[str]: + return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def topics(self) -> List[str]: + return apply_substitution_pattern_over_list( + generic_topic_parsing(self.precomputed.meta.get("keywords")), self.topic_bloat_pattern + ) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=re.compile(r"、(?P<credits>[^、]*?)撮影"), + relative_urls=True, + ) diff --git a/tests/resources/parser/test_data/jp/AsahiShimbun.json b/tests/resources/parser/test_data/jp/AsahiShimbun.json new file mode 100644 index 00000000..7e70a92e --- /dev/null +++ b/tests/resources/parser/test_data/jp/AsahiShimbun.json @@ -0,0 +1,52 @@ +{ + "V1": { + "authors": [ + "朝日新聞デジタル" + ], + "body": { + "summary": [], + "sections": [ + { + "headline": [], + "paragraphs": [ + "気象庁は13日、午後9時19分ごろ、日向灘(北緯31.8度、東経131.6度)で震度5弱の地震があったと発表した。震源の深さは約30キロ、地震の規模(マグニチュード)は6.9と推定される。この地震で、気象庁は高知県と宮崎県に1メートルの津波注意報を出した。", + "各地の震度は次のとおり。", + "<震度5弱>", + "宮崎県:高鍋町、新富町、宮崎市", + "<震度4>", + "宮崎県:延岡市、西都市、木城町、川南町、都農町、門川町、日南市*、串間市、国富町、綾町、美郷町、高千穂町、都城市、小林市、えびの市、三股町、高原町", + "福岡県:久留米市", + "佐賀県:神埼市、白石町", + "熊本県:阿蘇市、産山村、高森町、南阿蘇村、熊本市南区、熊本市北区、八代市、菊池市、宇土市、宇城市、合志市、美里町、西原村、氷川町、人吉市、多良木町、あさぎり町、芦北町", + "大分県:大分市、臼杵市、佐伯市、竹田市", + "鹿児島県:鹿児島市、霧島市、いちき串木野市、南さつま市、伊佐市、姶良市、鹿屋市、垂水市、曽於市、大崎町、東串良町、肝付町" + ] + } + ] + }, + "images": [ + { + "versions": [ + { + "url": "https://www.asahicom.jp/imgopt/img/4ff96428f2/comm_L/AS20250113003419.jpg", + "query_width": null, + "size": null, + "type": 
"image/jpeg" + } + ], + "is_cover": true, + "description": "写真・図版", + "caption": null, + "authors": [], + "position": 737 + } + ], + "publishing_date": "2025-01-13 21:37:00+09:00", + "title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁", + "topics": [ + "社会", + "災害・気象", + "宮崎県" + ] + } +} diff --git a/tests/resources/parser/test_data/jp/AsahiShimbun_2025_01_13.html.gz b/tests/resources/parser/test_data/jp/AsahiShimbun_2025_01_13.html.gz new file mode 100644 index 00000000..a9b1198e Binary files /dev/null and b/tests/resources/parser/test_data/jp/AsahiShimbun_2025_01_13.html.gz differ diff --git a/tests/resources/parser/test_data/jp/meta.info b/tests/resources/parser/test_data/jp/meta.info index 438a2da8..a62926b9 100644 --- a/tests/resources/parser/test_data/jp/meta.info +++ b/tests/resources/parser/test_data/jp/meta.info @@ -1,4 +1,8 @@ { + "AsahiShimbun_2025_01_13.html.gz": { + "url": "https://www.asahi.com/articles/AST1F4445T1FUTIL02SM.html", + "crawl_date": "2025-01-13 14:12:17.527262" + }, "TheJapanNews_2024_10_13.html.gz": { "url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/", "crawl_date": "2024-10-13 16:27:01.520980"