You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
In our testing we found more missing words than we can realistically add to Wiktionary (see also #8), so we have now implemented the following strategy, which at least allows us to detect the genus. Would this be interesting to add to your package?
# Noun endings mapped to a grammatical gender code ("n" = neuter,
# "f" = feminine, "m" = masculine). This table is the first one consulted
# when a genus has to be guessed from a word's suffix; the secondary table is
# only tried when nothing here matches.
primary_german_genus_endings = {
    # neuter
    "n": [
        "chen", "ett", "eau", "lein", "icht", "il", "ium",
        "it", "ma", "ment", "tel", "tum", "um",
    ],
    # feminine
    "f": [
        "in", "a", "ade", "age", "anz", "elle", "ette", "ere", "enz",
        "ei", "ine", "isse", "itis", "ive", "ie", "heit", "keit", "ik",
        "sion", "se", "sis", "tät", "ung", "ur", "schaft",
    ],
    # masculine
    "m": [
        "ant", "ast", "ich", "ist", "ig", "ling", "or",
        "us", "ismus", "är", "eur", "iker", "ps",
    ],
}
# Weaker suffix rules, used only as a fallback after the primary endings.
# Keys are gender codes: "n" = neuter, "f" = feminine, "m" = masculine.
secondary_german_genus_endings = {
    # Roughly three out of four nouns ending in -nis or -sal are neuter.
    "n": ["nis", "sal"],
    # The overwhelming majority of German -ion nouns are feminine, but there
    # are exceptions such as "Postillion", which is masculine.
    "f": ["ion"],
    # More than half of the nouns ending in -er, -en or -el are masculine.
    "m": ["er", "en", "el"],
}
def determine_genus_from_ending(word, german_genus_endings):
    """Guess the grammatical gender of *word* from its ending.

    Every ending of every genus is checked and the longest matching ending
    wins. This matters when a short ending of one genus is itself the tail of
    a longer ending of another genus: "Freiheit" ends in both "it" and
    "heit", and "Friseur" in both "ur" and "eur"; the longer, more specific
    ending decides. If two genera list an equally long matching ending, the
    genus that comes first in *german_genus_endings* is kept.

    The comparison is case-sensitive, exactly like ``str.endswith``.

    Args:
        word: The noun to classify.
        german_genus_endings: Mapping of a genus code to an iterable of
            endings that indicate this genus.

    Returns:
        ``{"genus": <genus code>}`` for the best match, or ``None`` when no
        ending matches.
    """
    best_genus = None
    best_length = -1  # -1 means "nothing matched yet"
    for genus, endings in german_genus_endings.items():
        for ending in endings:
            # Strictly longer only, so ties keep the earlier genus.
            if len(ending) > best_length and word.endswith(ending):
                best_genus = genus
                best_length = len(ending)
    if best_length < 0:
        return None
    return {"genus": best_genus}
def german_noun_lookup(word):
    """Return the first ``german_nouns`` entry for *word*, with a genus set.

    The genus is taken from the first of these sources that provides one:

    1. the entry's own ``"genus"`` key,
    2. the entry's ``"genus 1"`` key,
    3. a special case for words ending in "leute", reported as ``"f"``,
    4. the primary ending table,
    5. the secondary ending table.

    The returned entry is a shallow copy, so filling in ``"genus"`` never
    modifies the object handed out by ``german_nouns``.

    Args:
        word: The noun to look up.

    Returns:
        A copy of the first entry with a ``"genus"`` key present, or ``None``
        when there is no entry for *word* or no genus can be determined.
    """
    import copy

    entries = german_nouns[word]
    if not entries:
        return None

    # Copy before writing: the entry presumably belongs to the data source
    # and would otherwise be changed for every later lookup.
    result = copy.copy(entries[0])

    if "genus" in result:
        return result

    if "genus 1" in result:
        result["genus"] = result["genus 1"]
        return result

    if word[-5:].lower() == "leute":
        result["genus"] = "f"
        return result

    # Fall back to suffix heuristics: primary table first, then secondary.
    guess = determine_genus_from_ending(word, primary_german_genus_endings)
    if guess is None:
        guess = determine_genus_from_ending(word, secondary_german_genus_endings)
    if guess is None:
        return None

    result["genus"] = guess["genus"]
    return result
def german_noun_analysis(word, genus_only=False):
    """Analyse *word*, falling back to compound splitting when it is unknown.

    Strategy, in order:

    1. Direct lookup via ``german_noun_lookup``.
    2. With ``genus_only=True``: guess the genus from the primary endings.
    3. Treat *word* as a compound: look up ever shorter tails of the word
       (skipping the first two letters, keeping at least three letters) and
       use the first tail that is a known noun. The search stops when the
       tail is exactly "ende", to avoid reading e.g. "Arbeitgebende" as a
       compound of "Ende".

    For a compound match the entry's ``"Lemma"`` is set to *word*. Unless
    ``genus_only`` is set, every form under ``"flexion"`` is also rewritten
    to the compound's prefix followed by the lower-cased form of the tail.
    These changes are made on a deep copy, so the entry of the base noun is
    left intact for later lookups.

    Args:
        word: The noun to analyse.
        genus_only: When true, only the genus matters: the ending heuristic
            is tried before compound splitting and flexion forms are not
            adapted.

    Returns:
        An entry mapping containing at least ``"genus"`` (for the ending
        heuristic, only ``{"genus": ...}``), or ``None`` when nothing
        matches.
    """
    import copy

    result = german_noun_lookup(word)
    if result is not None:
        return result

    if genus_only:
        result = determine_genus_from_ending(word, primary_german_genus_endings)
        if result is not None:
            return result

    # Skip the first 2 letters and never look at tails shorter than 3.
    for i in range(2, len(word) - 2):
        partial_word = word[i:]
        # Avoid cases like 'Ende' at the end of 'Arbeitgebende'.
        if partial_word == "ende":
            break

        found = german_noun_lookup(partial_word.capitalize())
        if found is None:
            continue

        # Deep copy: "Lemma" and the nested flexion mapping are rewritten
        # below and must not leak into the base noun's own entry.
        result = copy.deepcopy(found)
        result["Lemma"] = word
        if not genus_only:
            word_prefix = word[:i]
            flexion = result.get("flexion") or {}
            for case, form in flexion.items():
                flexion[case] = word_prefix + form.lower()
        return result

    # Nothing matched. The primary-ending heuristic is not retried here: it
    # is deterministic and was already tried above in genus_only mode.
    # NOTE(review): if a weaker guess is acceptable in genus_only mode,
    # secondary_german_genus_endings would be the natural fallback - confirm
    # the intended behaviour before adding it.
    return None
The text was updated successfully, but these errors were encountered:
In our testing we found more missing words than we can realistically add to Wiktionary (see also #8), so we have now implemented the following strategy, which at least allows us to detect the genus. Would this be interesting to add to your package?
The text was updated successfully, but these errors were encountered: