add script to get counts of lemmas by pos for gt dicts

giellalt · Jan 13, 2025 · 331225a · 331225a
1 parent 402cc02
commit 331225a
Show file tree

Hide file tree

Showing 2 changed files with 111 additions and 0 deletions.
diff --git a/dicts/scripts/gt_pos_counts.py b/dicts/scripts/gt_pos_counts.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+# gt_pos_counts.py
+# run in a giellalt/dict-xxx-yyy directory, and print how many lemmas
+# are found of each pos in the src/ xml files. E.g. output for dict-nob-sme:
+# {"Num": 86, "V": 2348, "Phrase": 260, "Det": 21, "Po": 2, "Adv": 414, "A": 1758, "CC": 8, "CS": 14, "Pron": 88, "N": 21593, "Interj": 13, "total": 26657, "Pr": 52}
+
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from collections import defaultdict
+import json
+
+
+def handle_file(file):
+    try:
+        tree = ET.parse(file)
+    except Exception:
+        # unparsable xml file, ignore
+        return
+
+    root = tree.getroot()
+    if root.tag != "r":
+        # xml file with root that is not <r>, ignore
+        return
+
+    poses = defaultdict(int)
+    for L in tree.iter("l"):
+        poses[L.attrib.get("pos")] += 1
+    return poses
+
+
+def main():
+    nlemmas = defaultdict(int)
+    for file in Path("src/").glob("*.xml"):
+        if result := handle_file(file):
+            for k, v in result.items():
+                nlemmas[k] += v
+
+    nlemmas["total"] = sum(nlemmas.values())
+    print(json.dumps(nlemmas))
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/dicts/scripts/gt_pos_counts.rs b/dicts/scripts/gt_pos_counts.rs
@@ -0,0 +1,67 @@
+#!/usr/bin/env -S cargo +nightly --color=always -Z script
+---cargo
+[package]
+edition = "2021"
+[profile.dev]
+opt-level = 3
+[dependencies]
+serde = { version = "1", features = ["derive"] }
+quick-xml = { version = "0.37.2", features = ["serialize"] }
+itertools = "0.14"
+tap = "1"
+---
+
+/*
+ * gt_pos_counts.rs
+ * run in a giellalt/dict-xxx-yyy directory, and print how many lemmas
+ * are found of each pos in the src/ xml files. E.g. output for dict-nob-sme:
+ * {"Num": 86, "V": 2348, "Phrase": 260, "Det": 21, "Po": 2, "Adv": 414, "A": 1758, "CC": 8, "CS": 14, "Pron": 88, "N": 21593, "Interj": 13, "total": 26657, "Pr": 52}
+ */
+
+use itertools::Itertools;
+use serde::Deserialize;
+use tap::prelude::*;
+
+/// Deserialization target for a whole dictionary XML document.
+///
+/// Only the `e` children are declared; nothing else of the document is
+/// captured by this type.
+// NOTE(review): the Python sibling script only accepts files whose root tag
+// is `r`; nothing in this struct appears to check the root tag name — confirm.
+#[derive(Deserialize)]
+struct Root {
+    /// The `<e>` child elements of the document root.
+    #[serde(rename = "e")]
+    entries: Vec<Entry>,
+}
+
+/// One `<e>` element of the dictionary.
+///
+/// Only its `lg` children are declared; nothing else of the element is
+/// captured by this type.
+#[derive(Deserialize)]
+struct Entry {
+    /// The `<lg>` child elements of this entry.
+    #[serde(rename = "lg")]
+    lgs: Vec<LemmaGroup>,
+}
+
+/// One `<lg>` element, holding the lemmas of an entry.
+///
+/// Only its `l` children are declared; nothing else of the element is
+/// captured by this type.
+#[derive(Deserialize)]
+struct LemmaGroup {
+    /// The `<l>` child elements of this group.
+    #[serde(rename = "l")]
+    lemmas: Vec<Lemma>,
+}
+
+/// One `<l>` element: a lemma together with its part of speech.
+///
+/// Both fields fall back to the empty string when absent. Without the
+/// fallback, one `<l>` lacking a `pos` attribute is a "missing field" error
+/// for the whole document, and `main` discards documents that fail to
+/// deserialize, so every lemma of that file would silently disappear from
+/// the counts.
+#[derive(Deserialize)]
+struct Lemma {
+    /// Value of the `pos` attribute; empty when the attribute is missing.
+    #[serde(rename = "@pos", default)]
+    pos: String,
+    /// Text content of the element; empty when there is none, in which case
+    /// `main` filters the lemma out.
+    #[serde(rename = "$text", default)]
+    text: String,
+}
+
+/// Prints how many lemmas of each `pos` occur in the `*.xml` files directly
+/// inside `src/` (relative to the current directory).
+///
+/// Directory entries that cannot be listed, files that cannot be read as
+/// text and files that do not deserialize into [`Root`] are skipped without
+/// a message. Lemmas whose text is empty or contains `_` are not counted.
+/// A `"total"` key holding the sum of all counts is added, and the map is
+/// printed with its `Debug` formatting; key order is whatever the map yields.
+///
+/// # Panics
+///
+/// Panics when the `src/` directory cannot be opened for listing.
+pub fn main() {
+    std::path::Path::new("src/")
+        .read_dir()
+        .expect("can read src directory")
+        // entries that fail to list are dropped
+        .flatten()
+        .map(|direntry| direntry.path())
+        // Same selection as the Python sibling's `glob("*.xml")`: without it
+        // a parseable stray file (e.g. an editor backup `foo.xml~`) would be
+        // counted in addition to the real one.
+        .filter(|path| path.extension().is_some_and(|ext| ext == "xml"))
+        // unreadable and undeserializable files are dropped
+        .filter_map(|path| std::fs::read_to_string(path).ok())
+        .filter_map(|xml| quick_xml::de::from_str::<Root>(&xml).ok())
+        .flat_map(|root| root.entries)
+        .flat_map(|entry| entry.lgs)
+        .flat_map(|lg| lg.lemmas)
+        .filter(|lemma| !lemma.text.contains('_'))
+        .filter(|lemma| !lemma.text.is_empty())
+        .map(|lemma| lemma.pos)
+        .counts()
+        // the sum is evaluated before the insert, so "total" excludes itself
+        .tap_mut(|d| { d.insert("total".to_string(), d.values().sum()); })
+        .tap(|counts| println!("{counts:?}"));
+}