Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AsahiShimbun #682

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,21 @@
</tr>
</thead>
<tbody>
<tr>
<td>
<code>AsahiShimbun</code>
</td>
<td>
<div>Asahi Shimbun</div>
</td>
<td>
<a href="https://www.asahi.com/">
<span>www.asahi.com</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheJapanNews</code>
Expand Down
8 changes: 8 additions & 0 deletions src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
Expand Down Expand Up @@ -28,3 +29,10 @@ class JP(metaclass=PublisherGroup):
NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"),
],
)

# Publisher entry for Asahi Shimbun: pairs the www.asahi.com domain with
# AsahiShimbunParser and declares a single NewsMap source.
# NOTE(review): the source URL is the generic /sitemap.xml wrapped in NewsMap --
# confirm it really is a news sitemap and not a plain sitemap/index.
AsahiShimbun = Publisher(
name="Asahi Shimbun",
domain="https://www.asahi.com/",
parser=AsahiShimbunParser,
sources=[NewsMap("https://www.asahi.com/sitemap.xml")],
)
60 changes: 60 additions & 0 deletions src/fundus/publishers/jp/asahi_shimbun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
)


class AsahiShimbunParser(ParserProxy):
    """Parser proxy for Asahi Shimbun (www.asahi.com) articles."""

    class V1(BaseParser):
        """First parser version; all attributes are read from ``self.precomputed``."""

        # Article body layout: summary paragraphs, regular paragraphs and
        # sub-headlines are all selected relative to the ``div.nfyQp`` container.
        # NOTE(review): these class names look auto-generated -- verify they are
        # stable across site deployments.
        _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p")
        _paragraph_selector = CSSSelector("div.nfyQp > p")
        _subtitle_selector = CSSSelector("div.nfyQp > h2")

        # Pattern handed to ``apply_substitution_pattern_over_list`` in ``topics``;
        # presumably strips publisher boilerplate from the keyword list.
        topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")

        # Photo-credit pattern for ``images``: captures, as ``credits``, the text
        # between the last ``、`` and a following ``撮影``. Compiled once at class
        # level, like the selectors above, instead of on every call.
        _image_author_pattern = re.compile(r"、(?P<credits>[^、]*?)撮影")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the class-level CSS selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subtitle_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``TITLE`` entry of the page meta data, or ``None`` if absent."""
            return self.precomputed.meta.get("TITLE")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta entry into a datetime."""
            return generic_date_parsing(self.precomputed.meta.get("article:published_time"))

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the ``author`` entry found in the linked data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta entry and apply ``topic_bloat_pattern`` to it."""
            return apply_substitution_pattern_over_list(
                generic_topic_parsing(self.precomputed.meta.get("keywords")), self.topic_bloat_pattern
            )

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images, passing the photo-credit pattern as author selector."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                author_selector=self._image_author_pattern,
                relative_urls=True,
            )
52 changes: 52 additions & 0 deletions tests/resources/parser/test_data/jp/AsahiShimbun.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"V1": {
"authors": [
"朝日新聞デジタル"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"気象庁は13日、午後9時19分ごろ、日向灘(北緯31.8度、東経131.6度)で震度5弱の地震があったと発表した。震源の深さは約30キロ、地震の規模(マグニチュード)は6.9と推定される。この地震で、気象庁は高知県と宮崎県に1メートルの津波注意報を出した。",
"各地の震度は次のとおり。",
"<震度5弱>",
"宮崎県:高鍋町、新富町、宮崎市",
"<震度4>",
"宮崎県:延岡市、西都市、木城町、川南町、都農町、門川町、日南市*、串間市、国富町、綾町、美郷町、高千穂町、都城市、小林市、えびの市、三股町、高原町",
"福岡県:久留米市",
"佐賀県:神埼市、白石町",
"熊本県:阿蘇市、産山村、高森町、南阿蘇村、熊本市南区、熊本市北区、八代市、菊池市、宇土市、宇城市、合志市、美里町、西原村、氷川町、人吉市、多良木町、あさぎり町、芦北町",
"大分県:大分市、臼杵市、佐伯市、竹田市",
"鹿児島県:鹿児島市、霧島市、いちき串木野市、南さつま市、伊佐市、姶良市、鹿屋市、垂水市、曽於市、大崎町、東串良町、肝付町"
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://www.asahicom.jp/imgopt/img/4ff96428f2/comm_L/AS20250113003419.jpg",
"query_width": null,
"size": null,
"type": "image/jpeg"
}
],
"is_cover": true,
"description": "写真・図版",
"caption": null,
"authors": [],
"position": 737
}
],
"publishing_date": "2025-01-13 21:37:00+09:00",
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁",
"topics": [
"社会",
"災害・気象",
"宮崎県"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
{
"AsahiShimbun_2025_01_13.html.gz": {
"url": "https://www.asahi.com/articles/AST1F4445T1FUTIL02SM.html",
"crawl_date": "2025-01-13 14:12:17.527262"
},
"TheJapanNews_2024_10_13.html.gz": {
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
"crawl_date": "2024-10-13 16:27:01.520980"
Expand Down
Loading