Skip to content

Commit

Permalink
add ChunichiShimbun
Browse files (browse the repository at this point in the history)
  • Loading branch information
MaxDall committed Jan 13, 2025
1 parent 9c65127 commit ec0c759
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 5 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>ChunichiShimbun</code>
</td>
<td>
<div>Chunichi Shimbun</div>
</td>
<td>
<a href="https://www.chunichi.co.jp/">
<span>www.chunichi.co.jp</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TheJapanNews</code>
Expand Down
11 changes: 9 additions & 2 deletions src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.tokyo_shimbun import TokyoShimbunParser
from fundus.publishers.jp.tokyo_chunichi_shimbun import TokyoChunichiShimbunParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
from fundus.scraping.url import NewsMap, Sitemap
Expand Down Expand Up @@ -41,6 +41,13 @@ class JP(metaclass=PublisherGroup):
TokyoShimbun = Publisher(
name="Tokyo Shimbun",
domain="https://www.tokyo-np.co.jp/",
parser=TokyoShimbunParser,
parser=TokyoChunichiShimbunParser,
sources=[NewsMap("https://www.tokyo-np.co.jp/sitemap.xml")],
)

ChunichiShimbun = Publisher(
name="Chunichi Shimbun",
domain="https://www.chunichi.co.jp/",
parser=TokyoChunichiShimbunParser,
sources=[NewsMap("https://www.chunichi.co.jp/sitemap.xml")],
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
)


class TokyoShimbunParser(ParserProxy):
class TokyoChunichiShimbunParser(ParserProxy):
class V1(BaseParser):
_paragraph_selector = XPath("//div[@class='block' and not(descendant::div or descendant::h2)]")
_subheadline_selector = XPath("//div[@class='block']//h2")
Expand Down Expand Up @@ -48,8 +48,11 @@ def topics(self) -> List[str]:
def images(self) -> List[Image]:
return image_extraction(
doc=self.precomputed.doc,
image_selector=CSSSelector("div.image img"),
caption_selector=XPath("./ancestor::div[@class='wrap']//p[@class='caption']"),
image_selector=CSSSelector("main div.image img, main div.thumb img"),
caption_selector=XPath(
"./ancestor::div[@class='wrap']//p[@class='caption'] | "
"./ancestor::div[@class='thumb']//p[@class='thumb-caption']"
),
author_selector=re.compile(r"（(?P<credits>[^）]+)）\s*$"),
paragraph_selector=self._paragraph_selector,
relative_urls=True,
Expand Down
45 changes: 45 additions & 0 deletions tests/resources/parser/test_data/jp/ChunichiShimbun.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"V1": {
"authors": [
"中日新聞Web"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"13日午後9時19分ごろ、宮崎県で震度5弱の地震があった。気象庁によると、震源地は日向灘で、震源の深さは約36キロ。地震の規模はマグニチュード(M)6・6と推定される。高知県、宮崎県で津波を観測した。昨年8月に続いて2度目となる臨時情報を発表。評価検討会を開いて南海トラフ巨大地震との関連について調査し「発生可能性が平常時と比べて相対的に高まったと考えられる現象ではない」として終了すると明らかにした。",
"気象庁は調査終了を受けた記者会見で「地震はいつ起きてもおかしくない。日頃からの備えを確実に実施しておくようお願いしたい」と呼びかけた。気象庁によると、日向灘の地震は南海トラフ巨大地震の想定震源域...",
"中日新聞しずおか 北陸中日新聞 日刊県民福井"
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://static.chunichi.co.jp/image/article/size1/c/f/9/8/cf987ccfaa6a53cdafbf62304b029848_1.jpg",
"query_width": null,
"size": null,
"type": "image/jpeg"
}
],
"is_cover": true,
"description": "宮崎県で最大震度5弱を観測した地震と南海トラフ地震との関連を調べるため、気象庁で開かれた評価検討会=13日午後10時31分(代表撮影)",
"caption": "宮崎県で最大震度5弱を観測した地震と南海トラフ地震との関連を調べるため、気象庁で開かれた評価検討会=13日午後10時31分",
"authors": [
"代表撮影"
],
"position": 343
}
],
"publishing_date": "2025-01-14 01:28:54+09:00",
"title": "南海トラフ、一時調査 巨大地震、基準達せず:中日新聞Web",
"topics": [
"ニュース",
"社会"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
"url": "https://www.asahi.com/articles/AST1F4445T1FUTIL02SM.html",
"crawl_date": "2025-01-13 14:12:17.527262"
},
"ChunichiShimbun_2025_01_13.html.gz": {
"url": "https://www.chunichi.co.jp/article/1011185",
"crawl_date": "2025-01-13 18:10:25.145717"
},
"TheJapanNews_2024_10_13.html.gz": {
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
"crawl_date": "2024-10-13 16:27:01.520980"
Expand Down

0 comments on commit ec0c759

Please sign in to comment.