Skip to content

Commit

Permalink
fine-tuned verbal derivation for more precise compounding
Browse files Browse the repository at this point in the history
  • Loading branch information
merisiga committed May 14, 2024
1 parent 92f7b70 commit a41279f
Show file tree
Hide file tree
Showing 8 changed files with 349 additions and 48 deletions.
6 changes: 4 additions & 2 deletions src/fst/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ generator-raw-gt-desc.simple.hfst: generator-raw-gt-desc.simple.weightless.hfst
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mus' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ng' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/v' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/vus' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tav' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nud' -a 10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mata' -a 10 -A \
Expand Down Expand Up @@ -625,7 +626,7 @@ generator-raw-gt-desc.simple.hfst: generator-raw-gt-desc.simple.weightless.hfst
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Neg' -a 1 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sup' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Inf' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 5 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prc' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Foc/gi' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Emph' -a 0 -A \
Expand Down Expand Up @@ -748,6 +749,7 @@ guesser-raw.weighted.hfst: guesser-raw.simple.hfst
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mus' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ng' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/v' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/vus' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tav' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nud' -a -10 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mata' -a -10 -A \
Expand Down Expand Up @@ -821,7 +823,7 @@ guesser-raw.weighted.hfst: guesser-raw.simple.hfst
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Neg' -a 1 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sup' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Inf' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 5 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prc' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Foc/gi' -a 0 -A \
| $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Emph' -a 0 -A \
Expand Down
8 changes: 6 additions & 2 deletions src/fst/filters/block-derivations.est.xfscript
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ define BadDer1 [
[~[?* ("+Guess") [["+N"] | ["+Num" "+Card"]]] "+Der/kond"] |
[~[?* [[ ("+Guess") "+A"] | [("+Guess") "+A" "+Comp"] | [("+Guess") "+A" "+Superl"] |
[ ZZ {tama} ("+Guess") "+V"] | [ ZZ {dama} ("+Guess") "+V"] |
[ {hooldama} ("+Guess") "+V"] | [ {soendama} ("+Guess") "+V"] | [ {tsema} ("+Guess") "+V"] | [ ? ? ? {lema} ("+Guess") "+V"]]] "+Der/us"] |
[ {hooldama} ("+Guess") "+V"] | [ {soendama} ("+Guess") "+V"] |
[ {tsema} ("+Guess") "+V"] | [ ? ? ? {lema} ("+Guess") "+V"]]] "+Der/us"] |
[~[?* [[ ZZ {tama} ("+Guess") "+V"] | [ ZZ {dama} ("+Guess") "+V"]]] "+Der/is"] |
[~[?* [{uma} | {tellima}] ("+Guess") "+V"] "+Der/mus"] | # devalveeruma - devalveerumus
[~[?* {eerima} ("+Guess") "+V"] "+Der/ng"] | # devalveerima - devalveering
Expand Down Expand Up @@ -65,7 +66,10 @@ define BadDer2 [

# some suffix sequences are actually bad
define BadDer3 [
[[ "+Der/nu" | "+Der/us" | "+Der/lane" ] "+N" "+Der/ti"]
[[ "+Der/nu" | "+Der/us" | "+Der/lane" ] "+N" "+Der/ti"] |
[[ "+Der/v" | "+Der/tav" | "+Der/nud" | "+Der/matu" | "+Der/tamatu" | "+Der/tu" ] "+A" "+Der/sti"] |
[ "+Der/v" "+A" "+Der/us"] |
[[ "+Der/lik" | "+Der/line" ] "+A" "+Der/sti"]
] ;

# some words are not suitable for certain suffixes
Expand Down
38 changes: 33 additions & 5 deletions src/fst/morphology/affixes/verbs.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,8 @@ LEXICON SUPINE_V
@R.Part.One@ SUPINE_MA_FORMS ; ! These can be 1st part of a compound
: MINE_DERIVATION ; ! elamine
: JA_DERIVATION ; ! elaja, õpetaja, õpetajanna, õpetajake, õpetajalik, ...
: V_DERIVATION ; ! õpetav, õpetavalt, õpetavus, õpetavam, ...
: V_DERIVATION ; ! õpetav, õpetavalt, õpetavam, ...
: VUS_DERIVATION ; ! õpetavus
: MATU_DERIVATION ; ! segamatu, segamatult, segamatus, segamatum, ...
: MATA_DERIVATION ; ! segamata
: MUS_DERIVATION ; ! küllastumus, promoveerumus etc
Expand Down Expand Up @@ -461,11 +462,12 @@ LEXICON INFINITIVE ! infinitive, gerund (des-form)

:%>D7 A_INFINITIVE ;

LEXICON A_INFINITIVE ! common part of an infinitive, gerund (des-form);
! also alone in a subparadigm of a few old words
LEXICON A_INFINITIVE ! common part of an infinitive, gerund (des-form);
! also alone in a subparadigm of a few old words

@R.Part.One@ INF_FORMS ; ! This can be 1st part of a compound
@R.Part.One@ INF_FORMS ; ! This can be 1st part of a compound
+Ger:es NO_COMPOUND ;
GER_DERIVATION ; ! pool+magades, rõht+kirjutades;

LEXICON INF_FORMS
+Inf:a GI ;
Expand Down Expand Up @@ -508,6 +510,14 @@ LEXICON PL3
! Flag gate: requires the Part flag to be One, then sets it to Bad, and continues to GI.
LEXICON NO_COMPOUND ! this form cannot participate in a compound word
@R.Part.One@@P.Part.Bad@ GI ;

! gerund is an exception in compounding and derivation ...
! Flag gate for the gerund as the latter part of a compound:
!   @R.Part.Two@  requires the Part flag to be Two
!   @P.Part.Bad@  then sets Part to Bad (the same setting NO_COMPOUND uses for
!                 forms that cannot participate in a compound)
!   @R.POS.Pref@  requires the POS flag to be Pref, i.e. the first part was a prefix
LEXICON GER_DERIVATION ! 2nd part of a compound
@R.Part.Two@@P.Part.Bad@@R.POS.Pref@ GER_DERIVATION_SUF ; ! only 2 components: prefix + verb

! Emits the tag +Ger with the surface string "es" and continues to GI.
LEXICON GER_DERIVATION_SUF ! A derived adverb (but still tagged +Ger, i.e. as a verb form)
+Ger:es GI ;


! paradigmatic derivation, i.e. very regular
! @R.Part.One@ means that this is not a latter part of a compound
! @R.Case.Par@ and @R.Case.Gen@ mean that this is a latter part of a compound
Expand All @@ -516,8 +526,10 @@ LEXICON NO_COMPOUND ! this form cannot participate in a compound word
! Entry point for the -nu and -nud derivations.
! Every entry first requires a flag state and then sets the POS flag before
! continuing to the suffix lexicon:
!   @R.Part.One@  not a latter part of a compound
!   @R.Case.Par@  latter part of a compound
!   @R.Case.Sem@  Case flag is Sem -- NOTE(review): meaning of Sem is not defined in this excerpt
! Entries leading to NU_DERIVATION_SUF (+Der/nu+N) set POS to N;
! entries leading to NUD_DERIVATION_SUF set POS to A.
LEXICON NU_DERIVATION
@R.Part.One@@P.POS.N@ NU_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.N@ NU_DERIVATION_SUF ;
@R.Case.Sem@@P.POS.N@ NU_DERIVATION_SUF ;
@R.Part.One@@P.POS.A@ NUD_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.A@ NUD_DERIVATION_SUF ;
@R.Case.Sem@@P.POS.A@ NUD_DERIVATION_SUF ; ! POS flag A, consistent with the other -nud entries

LEXICON NU_DERIVATION_SUF
+Der/nu+N:»nu AASTA ; ! derivation: elanu etc
Expand All @@ -529,6 +541,7 @@ LEXICON NUD_DERIVATION_SUF
! Flag gate for the -tu derivation: each entry requires a flag state, sets the
! POS flag to A, and continues to TU_DERIVATION_SUF.
LEXICON TU_DERIVATION ! NB! the morpheme border + d/t is assigned before this lexicon
@R.Part.One@@P.POS.A@ TU_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Gen@@P.POS.A@ TU_DERIVATION_SUF ; ! latter part of a compound (Case flag = Gen)
@R.Case.Sem@@P.POS.A@ TU_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

LEXICON TU_DERIVATION_SUF ! NB! the morpheme border + d/t is assigned before this lexicon
+Der/tu+A:u AASTA ; ! derivation: elatu etc
Expand All @@ -537,48 +550,62 @@ LEXICON TU_DERIVATION_SUF ! NB! the morpheme border + d/t is assigned before
! Flag gate for the -mata derivation: each entry requires a flag state, sets the
! POS flag to A, and continues to MATA_DERIVATION_SUF.
LEXICON MATA_DERIVATION
@R.Part.One@@P.POS.A@ MATA_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.A@ MATA_DERIVATION_SUF ; ! latter part of a compound (Case flag = Par)
@R.Case.Sem@@P.POS.A@ MATA_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/mata+A with the surface string »mata and continues to GI.
LEXICON MATA_DERIVATION_SUF
+Der/mata+A:»mata GI ; ! elamata

! Flag gate for the -matu derivation: each entry requires a flag state, sets the
! POS flag to N, and continues to MATU_DERIVATION_SUF.
! NOTE(review): the POS flag is set to N although the suffix lexicon tags the
! result +A; the sibling adjective derivations (MATA, TAMATU, V, TAV) set
! @P.POS.A@ -- confirm this is intended.
LEXICON MATU_DERIVATION
@R.Part.One@@P.POS.N@ MATU_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Gen@@P.POS.N@ MATU_DERIVATION_SUF ; ! latter part of a compound (Case flag = Gen)
@R.Case.Sem@@P.POS.N@ MATU_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/matu+A with the surface string »matu and continues to AASTA.
LEXICON MATU_DERIVATION_SUF
+Der/matu+A:»matu AASTA ; ! segamatu etc

! Flag gate for the -tamatu derivation: each entry requires a flag state, sets
! the POS flag to A, and continues to TAMATU_DERIVATION_SUF.
LEXICON TAMATU_DERIVATION ! NB! the morpheme border + d/t is assigned before this lexicon
@R.Part.One@@P.POS.A@ TAMATU_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.A@ TAMATU_DERIVATION_SUF ; ! latter part of a compound (Case flag = Par)
@R.Case.Sem@@P.POS.A@ TAMATU_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/tamatu+A with the surface string "amatu" (the d/t comes
! from the preceding lexicon) and continues to AASTA.
LEXICON TAMATU_DERIVATION_SUF ! NB! the morpheme border + d/t is assigned before this lexicon
+Der/tamatu+A:amatu AASTA ; ! segatamatu etc

! Flag gate for the -v derivation: each entry requires a flag state, sets the
! POS flag to A, and continues to V_DERIVATION_SUF.
LEXICON V_DERIVATION
@R.Part.One@@P.POS.A@ V_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.A@ V_DERIVATION_SUF ; ! latter part of a compound (Case flag = Par)
@R.Case.Sem@@P.POS.A@ V_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/v+A with the surface string »v and continues to MAGUS.
LEXICON V_DERIVATION_SUF
+Der/v+A:»v MAGUS ; ! elav, elava etc

! Flag gate for the -vus derivation; both entries set the POS flag to N and
! continue to VUS_DERIVATION_SUF.
LEXICON VUS_DERIVATION
@R.Part.One@@P.POS.N@ VUS_DERIVATION_SUF ; ! no previous part; no restrictions
! Latter part of a compound (Case flag = Gen): additionally sets the Der flag
! to "us" and clears the NeedNoun flag.
@R.Case.Gen@@P.POS.N@@P.Der.us@@C.NeedNoun@ VUS_DERIVATION_SUF ;

! Adds the tags +Der/vus+N with the surface string »vus and continues to OLULINE.
LEXICON VUS_DERIVATION_SUF
+Der/vus+N:»vus OLULINE ; ! elavus, ...

! Flag gate for the -tav derivation: each entry requires a flag state, sets the
! POS flag to A, and continues to TAV_DERIVATION_SUF.
LEXICON TAV_DERIVATION ! NB! the morpheme border + d/t is assigned before this lexicon
@R.Part.One@@P.POS.A@ TAV_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Par@@P.POS.A@ TAV_DERIVATION_SUF ; ! latter part of a compound (Case flag = Par)
@R.Case.Sem@@P.POS.A@ TAV_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/tav+A with the surface string "av" (the d/t comes from the
! preceding lexicon) and continues to MAGUS.
LEXICON TAV_DERIVATION_SUF ! NB! the morpheme border + d/t is assigned before this lexicon
+Der/tav+A:av MAGUS ; ! elatav, elatava etc

! Flag gate for the -mine derivation: each entry requires a flag state, sets the
! POS flag to N, and continues to MINE_DERIVATION_SUF.
LEXICON MINE_DERIVATION
@R.Part.One@@P.POS.N@ MINE_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Gen@@P.POS.N@ MINE_DERIVATION_SUF ; ! latter part of a compound (Case flag = Gen)
@R.Case.Sem@@P.POS.N@ MINE_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

! Adds the tags +Der/mine+N with the surface string »mine and continues to OLULINE.
LEXICON MINE_DERIVATION_SUF
+Der/mine+N:»mine OLULINE ; ! elamine

! Flag gate for the -ja derivation: each entry requires a flag state, sets the
! POS flag to N, and continues to JA_DERIVATION_SUF.
LEXICON JA_DERIVATION
@R.Part.One@@P.POS.N@ JA_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Gen@@P.POS.N@ JA_DERIVATION_SUF ; ! latter part of a compound (Case flag = Gen)
@R.Case.Sem@@P.POS.N@ JA_DERIVATION_SUF ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

LEXICON JA_DERIVATION_SUF
+Der/ja+N:»ja AASTA ; ! elaja, elajat etc
Expand All @@ -593,14 +620,15 @@ LEXICON IS_DERIVATION_SUF

LEXICON MUS_DERIVATION
@R.Part.One@@P.POS.N@ MUS_DERIVATION_SUF ; ! no previous part; no restrictions
@R.Case.Gen@@P.POS.N@ MUS_DERIVATION_SUF ;
@R.Case.Gen@@P.POS.N@@P.Der.us@@C.NeedNoun@ MUS_DERIVATION_SUF ;

LEXICON MUS_DERIVATION_SUF
+Der/mus+N:»mus OLULINE ; ! promoveerumus, elamus, ...

! Flag gate for deverbal -us: each entry only requires a flag state (no POS
! flag is set here) and continues to the shared US_DERIVATION lexicon.
LEXICON US_DERIVATION_VERB ! us also changes A -> N; directing to that continuation class
@R.Part.One@ US_DERIVATION ; ! no previous part; no restrictions
@R.Case.Gen@ US_DERIVATION ; ! latter part of a compound (Case flag = Gen)
@R.Case.Sem@ US_DERIVATION ; ! Case flag = Sem -- NOTE(review): meaning of Sem not visible here; confirm

LEXICON NG_DERIVATION
@R.Part.One@@P.POS.N@ NG_DERIVATION_SUF ; ! no previous part; no restrictions
Expand Down
14 changes: 9 additions & 5 deletions src/fst/morphology/root.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,9 @@ Multichar_Symbols
+Der/mus !!= * `@CODE@`
!!€ küllastumus: küllastuma+V+Der/mus+N+Sg+Nom

+Der/vus !!= * `@CODE@`
!!€ elavus: elama+V+Der/vus+N+Sg+Nom

+Der/ng !!= * `@CODE@`
!!€ devalveering: devalveerima+V+Der/ng+N+Sg+Nom

Expand Down Expand Up @@ -713,9 +716,9 @@ Multichar_Symbols
@D.Stem.topelt@
@P.Stem.Guessed@ ! stem is guessed, it is not from the lexicon
@R.Stem.Guessed@
@D.Stem.Guessed@ ! if a lemma has it, then this lemma is restricted in its ability to be the last part of a compound
@D.Stem.Guessed@ ! if a word has it, then this word cannot follow a guessed stem
@C.Stem@

!! A special condition that is used for filtering derivations and compounds
@P.NeedAdj.On@ ! +A or +Der/A
@R.NeedAdj@
Expand Down Expand Up @@ -1224,16 +1227,17 @@ LEXICON Latter !!= * `@CODE@` the latter part of a compound

@R.POS.ACRMinus@ AnyLatterVerb ;
@R.POS.Pref@ AnyLatterVerb ; ! ebalugemine
@R.POS.Pref@ Verbs ; ! gerund: poolmagades

@R.POS.N@@R.Case.Gen@ LatterVerb ; ! lauajooksja
@R.POS.N@@R.Case.Par@ LatterVerb ; ! laudajooksnud

@R.POS.N@@R.Case.Nom@@R.Stem.Nom@ AnyLatterVerb ; ! map orig. case flag for derivation needs
@R.POS.N@@R.Case.Sem@ AnyLatterVerb ; ! map orig. case flag for derivation needs
@R.POS.N@@R.Case.Sem@ LatterVerb ; !
@R.POS.N@@R.Case.Short@ AnyLatterVerb ; ! investeerimisnõustamine

@R.POS.A@@R.Case.Short@ AnyLatterVerb ; ! map orig. case flag for derivation needs
@R.POS.A@@R.Case.Sem@ AnyLatterVerb ; ! map orig. case flag for derivation needs
@R.POS.A@@R.Case.Sem@ LatterVerb ; !
@R.POS.AComp@@R.Case.Sem@ AnyLatterVerb ; ! map orig. case flag for derivation needs

@R.POS.GA@@P.Case.Gen@ LatterVerb ; ! eestivihkaja
Expand All @@ -1253,7 +1257,7 @@ LEXICON Latter !!= * `@CODE@` the latter part of a compound
! NB only some pronouns !
@R.POS.Pron@@R.Case.Gen@ LatterVerb ;
@R.POS.Pron@@R.Case.Par@ LatterVerb ;
@R.POS.Pron@@R.Case.Sem@ AnyLatterVerb ;
@R.POS.Pron@@R.Case.Sem@ LatterVerb ;

@R.POS.V@@D.Case@ AnyLatterVerb ; ! infinitive
@R.POS.V@@R.Case.Sem@ AnyLatterVerb ; ! -ma, -mas etc
Expand Down
8 changes: 6 additions & 2 deletions src/fst/morphology/stems/adverbs.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
! CompoundingAdverbs and NonCompoundingAdverbs ;



LEXICON CompoundingAdverbs


@P.Stem.topelt@järel+Adv:@P.Stem.topelt@järel GI "weight: 5 " ;
@D.Stem.Guessed@@P.Stem.topelt@koos+Adv:@D.Stem.Guessed@@P.Stem.topelt@k˘oos GI "weight: 4 " ;
@P.Stem.topelt@kõrval+Adv:@P.Stem.topelt@kõrval GI "weight: 5 " ;
Expand Down Expand Up @@ -449,11 +451,10 @@ pooleli+Adv:pooleli GI "weight: 7 " ;
poolt+Adv:p˘oolt GI "weight: 4 " ;
praokil+Adv:pr˘aokil GI "weight: 11 " ;
praokile+Adv:pr˘aokile GI "weight: 11 " ;
@D.Stem.Guessed@@P.Stem.vähe@puht+Adv:@D.Stem.Guessed@@P.Stem.vähe@p˘uht GI "weight: 9 " ;
@D.Stem.Guessed@@P.Stem.vähe@vähe+Adv:@D.Stem.Guessed@@P.Stem.vähe@vähe GI "weight: 5 " ;
puhevil+Adv:puhevil GI "weight: 11 " ;
puhevile+Adv:puhevile GI "weight: 11 " ;
puhta+Adv:p˘uhta GI "weight: 11 " ;
@D.Stem.Guessed@@P.Stem.vähe@puht+Adv:@D.Stem.Guessed@@P.Stem.vähe@p˘uht GI "weight: 9 " ;
puhvi+Adv:p˘uhvi GI "weight: 11 " ;
pungi+Adv:p˘ungi GI "weight: 10 " ;
pungil+Adv:pungil GI "weight: 9 " ;
Expand Down Expand Up @@ -714,6 +715,7 @@ võlgu+Adv:v˘õlgu GI "weight: 8 " ;
võõriti+Adv:võõriti GI "weight: 11 " ;
võõrsil+Adv:v˘õõrsil GI "weight: 8 " ;
võõrsile+Adv:v˘õõrsile GI "weight: 11 " ;
@D.Stem.Guessed@@P.Stem.vähe@vähe+Adv:@D.Stem.Guessed@@P.Stem.vähe@vähe GI "weight: 5 " ;
vähem+Adv:vähem GI "weight: 5 " ;
välja+Adv:v˘älʲja GI "weight: 3 " ;
vääri+Adv:vääri GI "weight: 11 " ;
Expand Down Expand Up @@ -752,8 +754,10 @@ väärt+Adv:v˘äärt GI "weight: 7 " ;
@D.Stem.Guessed@ülle+Adv:@D.Stem.Guessed@˘ülle GI "weight: 10 " ;
ümber+Adv:˘ümber GI "weight: 5 " ;


LEXICON NonCompoundingAdverbs


aasta-aastalt+Adv:˘aasta-˘aastalt GI "weight: 9 " ;
aastaringselt+Adv:˘aasta#r˘ingselt GI "weight: 10 " ;
aastates+Adv:˘aastates GI "weight: 11 " ;
Expand Down
1 change: 1 addition & 0 deletions src/fst/morphology/stems/prefixes.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ piko+Pref:piko»- # ;
pisi+Pref:pisi»- # ;
polaar+Pref:pol˘aar»- # ;
poliit+Pref:pol˘iit»- # ;
pool+Pref:p˘ool»- # ;
pop+Pref:p˘op»- # ;
pseudo+Pref:pseudo»- # ;
psühho+Pref:psühho»- # ;
Expand Down
Loading

0 comments on commit a41279f

Please sign in to comment.