Skip to content

Commit

Permalink
add script to get counts of lemmas by pos for gt dicts
Browse files Browse the repository at this point in the history
  • Loading branch information
Phaqui committed Jan 13, 2025
1 parent 402cc02 commit 331225a
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 0 deletions.
44 changes: 44 additions & 0 deletions dicts/scripts/gt_pos_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

# gt_pos_counts.py
# run in a giellalt/dict-xxx-yyy directory, and print how many lemmas
# are found of each pos in the src/ xml files. E.g. output for dict-nob-sme:
# {"Num": 86, "V": 2348, "Phrase": 260, "Det": 21, "Po": 2, "Adv": 414, "A": 1758, "CC": 8, "CS": 14, "Pron": 88, "N": 21593, "Interj": 13, "total": 26657, "Pr": 52}

import xml.etree.ElementTree as ET
from pathlib import Path
from collections import defaultdict
import json


def handle_file(file):
    """Tally the ``<l>`` elements of one XML file by their ``pos`` attribute.

    Returns a ``defaultdict(int)`` mapping each ``pos`` value to the number
    of ``<l>`` elements carrying it (elements without a ``pos`` attribute
    are tallied under the key ``None``).  Returns ``None`` when the file
    cannot be parsed or when its root element is not ``<r>``.
    """
    try:
        root = ET.parse(file).getroot()
    except Exception:
        # Best effort: a file that fails to parse is simply skipped.
        return None

    # Only documents rooted at <r> are counted; anything else is skipped.
    if root.tag != "r":
        return None

    counts = defaultdict(int)
    for lemma in root.iter("l"):
        counts[lemma.get("pos")] += 1
    return counts


def main():
    """Sum the per-file pos counts of every ``src/*.xml`` file and print them.

    The combined counts, plus a ``"total"`` entry holding the sum of all
    counts, are printed to stdout as a single JSON object.
    """
    nlemmas = defaultdict(int)
    per_file_counts = (handle_file(path) for path in Path("src/").glob("*.xml"))
    for counts in per_file_counts:
        # Skipped files yield None and files without lemmas an empty mapping;
        # neither contributes anything.
        if not counts:
            continue
        for pos, amount in counts.items():
            nlemmas[pos] += amount

    # Computed before the "total" key exists, so it covers the pos counts only.
    nlemmas["total"] = sum(nlemmas.values())
    print(json.dumps(nlemmas))


if __name__ == "__main__":
    # main() has no return statement, so this exits with status 0 on success.
    exit_status = main()
    raise SystemExit(exit_status)
67 changes: 67 additions & 0 deletions dicts/scripts/gt_pos_counts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env -S cargo +nightly --color=always -Z script
---cargo
[package]
edition = "2021"
[profile.dev]
opt-level = 3
[dependencies]
serde = { version = "1", features = ["derive"] }
quick-xml = { version = "0.37.2", features = ["serialize"] }
itertools = "0.14"
tap = "1"
---

/*
* gt_pos_counts.rs
* run in a giellalt/dict-xxx-yyy directory, and print how many lemmas
* are found of each pos in the src/ xml files. E.g. output for dict-nob-sme:
* {"Num": 86, "V": 2348, "Phrase": 260, "Det": 21, "Po": 2, "Adv": 414, "A": 1758, "CC": 8, "CS": 14, "Pron": 88, "N": 21593, "Interj": 13, "total": 26657, "Pr": 52}
*/

use itertools::Itertools;
use serde::Deserialize;
use tap::prelude::*;

/// Top-level document that `main` deserializes each XML file into.
#[derive(Deserialize)]
struct Root {
/// All `<e>` child elements, in document order.
#[serde(rename = "e")]
entries: Vec<Entry>,
}

/// One `<e>` element of the document.
#[derive(Deserialize)]
struct Entry {
/// The `<lg>` child elements of this entry.
#[serde(rename = "lg")]
lgs: Vec<LemmaGroup>,
}

/// One `<lg>` element, grouping the `<l>` elements of an entry.
#[derive(Deserialize)]
struct LemmaGroup {
/// The `<l>` child elements of this group.
#[serde(rename = "l")]
lemmas: Vec<Lemma>,
}

/// One `<l>` element: a lemma together with its part-of-speech tag.
#[derive(Deserialize)]
struct Lemma {
/// Value of the `pos` attribute (quick-xml maps `@name` to an XML attribute).
// NOTE(review): the field is not optional, so an `<l>` without `pos` presumably
// makes deserialization of the whole file fail -- confirm against quick-xml.
#[serde(rename = "@pos")]
pos: String,
/// Text content of the element (quick-xml maps `$text` to the element's text).
#[serde(rename = "$text")]
text: String,
}

/// Counts the lemmas found in the `src/` XML files by part of speech and prints the tally.
///
/// Only directory entries with the extension `xml` are read, matching the `*.xml` glob
/// used by `gt_pos_counts.py`. Files that cannot be read, or that do not deserialize into
/// `Root`, are skipped silently. Lemmas whose text is empty or contains an underscore are
/// not counted. A `"total"` key holding the sum of all counts is inserted before the map
/// is printed in its `Debug` representation.
///
/// # Panics
///
/// Panics if the `src/` directory cannot be opened for reading.
pub fn main() {
    std::path::Path::new("src/")
        .read_dir()
        .expect("can read src directory")
        // Directory entries that could not be read are dropped.
        .flatten()
        .map(|direntry| direntry.path())
        // Skip sub-directories and non-XML files up front instead of reading them
        // and relying on the XML parser to reject their contents.
        .filter(|path| path.extension().is_some_and(|ext| ext == "xml"))
        // Unreadable files and documents that do not match `Root` are ignored.
        .flat_map(std::fs::read_to_string)
        .flat_map(|s| quick_xml::de::from_str::<Root>(&s))
        .flat_map(|root| root.entries)
        .flat_map(|entry| entry.lgs)
        .flat_map(|lg| lg.lemmas)
        .filter(|lemma| !lemma.text.contains("_"))
        .filter(|lemma| !lemma.text.is_empty())
        .map(|lemma| lemma.pos)
        .counts()
        // The sum is evaluated before the insert, so "total" covers the pos counts only.
        .tap_mut(|d| {
            d.insert("total".to_string(), d.values().sum());
        })
        .tap(|counts| println!("{counts:?}"));
}

0 comments on commit 331225a

Please sign in to comment.