Skip to content

Commit

Permalink
adjust image extraction, author and topic parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxDall committed Jan 14, 2025
1 parent ec0c759 commit 0f488fa
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
20 changes: 15 additions & 5 deletions src/fundus/publishers/jp/tokyo_chunichi_shimbun.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
Expand All @@ -20,6 +21,9 @@ class V1(BaseParser):
_paragraph_selector = XPath("//div[@class='block' and not(descendant::div or descendant::h2)]")
_subheadline_selector = XPath("//div[@class='block']//h2")

_author_bloat_pattern = re.compile(r"記者")
_topic_bloat_pattern = re.compile(r"話題・|話題")

@attribute
def body(self) -> Optional[ArticleBody]:
return extract_article_body_with_selector(
Expand All @@ -30,30 +34,36 @@ def body(self) -> Optional[ArticleBody]:

@attribute
def title(self) -> Optional[str]:
return self.precomputed.meta.get("og:title")
return self.precomputed.ld.bf_search("headline")

@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

@attribute
def authors(self) -> List[str]:
return generic_author_parsing(self.precomputed.ld.bf_search("author"))
return apply_substitution_pattern_over_list(
generic_author_parsing(self.precomputed.ld.bf_search("author")), self._author_bloat_pattern
)

@attribute
def topics(self) -> List[str]:
return generic_topic_parsing(self.precomputed.ld.bf_search("articleSection"))
if topics := apply_substitution_pattern_over_list(
generic_topic_parsing(self.precomputed.ld.bf_search("articleSection")), self._topic_bloat_pattern
):
return [topic for topic in topics if "ニュース" not in topic]
return []

@attribute
def images(self) -> List[Image]:
return image_extraction(
doc=self.precomputed.doc,
paragraph_selector=self._paragraph_selector,
image_selector=CSSSelector("main div.image img, main div.thumb img"),
caption_selector=XPath(
"./ancestor::div[@class='wrap']//p[@class='caption'] | "
"./ancestor::div[@class='thumb']//p[@class='thumb-caption']"
),
author_selector=re.compile(r"((?P<credits>[^)]+))\s*$"),
paragraph_selector=self._paragraph_selector,
author_selector=re.compile(r".+((?P<credits>[^)]*?)(撮影)?)\s*$"),
relative_urls=True,
)
7 changes: 3 additions & 4 deletions tests/resources/parser/test_data/jp/ChunichiShimbun.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,16 @@
],
"is_cover": true,
"description": "宮崎県で最大震度5弱を観測した地震と南海トラフ地震との関連を調べるため、気象庁で開かれた評価検討会=13日午後10時31分(代表撮影)",
"caption": "宮崎県で最大震度5弱を観測した地震と南海トラフ地震との関連を調べるため、気象庁で開かれた評価検討会=13日午後10時31分",
"caption": null,
"authors": [
"代表撮影"
"代表"
],
"position": 343
}
],
"publishing_date": "2025-01-14 01:28:54+09:00",
"title": "南海トラフ、一時調査 巨大地震、基準達せず:中日新聞Web",
"title": "南海トラフ、一時調査 巨大地震、基準達せず",
"topics": [
"ニュース",
"社会"
]
}
Expand Down
8 changes: 3 additions & 5 deletions tests/resources/parser/test_data/jp/TokyoShimbun.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
],
"is_cover": false,
"description": "移送のため警視庁を出る岩本絹子容疑者=13日、東京・霞ケ関で(潟沼義樹撮影)",
"caption": "移送のため警視庁を出る岩本絹子容疑者=13日、東京・霞ケ関で",
"caption": null,
"authors": [
"潟沼義樹撮影"
"潟沼義樹"
],
"position": 314
},
Expand Down Expand Up @@ -82,12 +82,10 @@
}
],
"publishing_date": "2025-01-13 20:01:37+09:00",
"title": "「金に執着」と指弾された東京女子医大元理事長、大学に不正に1.2億円支払わせ私的流用か 警視庁逮捕:東京新聞デジタル",
"title": "「金に執着」と指弾された東京女子医大元理事長、大学に不正に1.2億円支払わせ私的流用か 警視庁逮捕",
"topics": [
"ニュース",
"社会",
"東京・首都圏",
"東京ニュース",
"新宿区"
]
}
Expand Down

0 comments on commit 0f488fa

Please sign in to comment.