HFST: Lingala

We illustrate the use of the HFST command line tools with an example taken from Beesley & Karttunen that generates a lexical analyzer for Lingala, a Bantu language spoken along the Zaire river. $FORMAT is the implementation type of the transducer. The solution given on this page can also be executed with a single script.

# Every command on this page passes $FORMAT to "hfst-regexp2fst -f".
# Stop here with a clear message if it is unset or empty instead of
# letting each later command fail separately.
: "${FORMAT:?FORMAT must be set to the transducer implementation type}"

# Stems: the verb stems, written as a union of {...} strings.
# The quoted here-document delimiter keeps the shell from expanding
# anything inside the regular expression.
hfst-regexp2fst -f "$FORMAT" > Stems <<'EOF'
[ {bet} | {béb} | {bomb} | {bóndel} | {bóngol} | {bót} | {búk} |
 {fung} | {kabol} | {kang} | {kom} | {kund} | {kóm} | {lakis} |
 {lí}  | {lob} | {luk} | {ndim} | {palangan} | {pangwis} | {sál} |
 {sepel} | {sómb} | {tál} | {támbol} | {tambwis} | {tataban} |
 {tún} | {yébis}
]
EOF

# L: the letters a-z plus the acute-accented vowels.
hfst-regexp2fst -f "$FORMAT" > L <<'EOF'
[a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|
       á|é|í|ó|ú]
EOF

# Feature:value pairs for person and number.  Each expression is piped
# to hfst-regexp2fst and the result is written to the file named after
# the redirection.  "$FORMAT" is quoted so that it always reaches the
# tool as exactly one argument.
printf '%s\n' '[Per ":" 1]' | hfst-regexp2fst -f "$FORMAT" > Person1
printf '%s\n' '[Per ":" 2]' | hfst-regexp2fst -f "$FORMAT" > Person2
printf '%s\n' '[Per ":" 3]' | hfst-regexp2fst -f "$FORMAT" > Person3
printf '%s\n' '[Num ":" [Sg|Pl] ]' | hfst-regexp2fst -f "$FORMAT" > Number

# Gender3: the gender values are listed explicitly.  The commented-out
# xfst definition below is a more compact alternative kept for reference.
# define Gender [Gen ":" [[1["0"|1]|2|3|4|5|6|7|8|9] "." [1[4|5]|[1|9|10]a]]];
hfst-regexp2fst -f "$FORMAT" > Gender3 <<'EOF'
[Gen ":" [1 "." 2 | 1a "." 2 | 3 "." 4 | 5 "." 6 |
                7 "." 8 | 9a "." 10a | 10 | 11 "." 6 |
                14 "." 6 | 15]]
EOF
 
# define Reflexive [No | Yes];

# Passive: -am-
# Causative: -is-
# Reciprocal: -an-
# Applicative: -el-
# Reversive: -ol-
# Iterative: reduplication of the root or root plus other morpheme;
#  the two copies are separated by -a-; the second copy of the root
#  has low tone on the first syllable
# That is, the order of the morphemes is: REV, CAUS, PASS, APPL, RECIP

# alobaloba 'he/she speak SP ITER'
# alíalia   'he eat SP ITER'

# Tense features: one file per tense family, then Tense as their union
# under the "Tns" label.  "$FORMAT" is quoted so that it is passed to
# hfst-regexp2fst as a single argument.
# NOTE(review): PastTense, PresTense and FutTense appear inside the Tense
# expression as bare names; confirm that hfst-regexp2fst resolves them to
# the files written just above rather than reading them as plain symbols.
printf '%s\n' '[Past ":" [Rec|Hist|MoreRem|MostRem]]' | hfst-regexp2fst -f "$FORMAT" > PastTense
printf '%s\n' '[Pres ":" [Cont|Hab1|Hab2]]' | hfst-regexp2fst -f "$FORMAT" > PresTense
printf '%s\n' '[Fut ":" [Immed|MostRem]]' | hfst-regexp2fst -f "$FORMAT" > FutTense

# define Reflexive [Refl ":" [No|Yes]];
printf '%s\n' '[Tns ":" [PastTense|PresTense|FutTense]]' | hfst-regexp2fst -f "$FORMAT" > Tense
# define Polarity [Pol ":" [Pos|Neg]];

# Agreement: person and number for first and second person; person,
# number and gender for third person.  It is compiled with
# hfst-regexp2fst like the other definitions, because an xfst-style
# "define" statement is not a shell command and would leave no
# Agreement file for the expressions that refer to it.
# NOTE(review): Person1, Person2, Person3, Number and Gender3 are written
# as bare names, following the other commands on this page; confirm that
# hfst-regexp2fst resolves them to the previously written files.
hfst-regexp2fst -f "$FORMAT" > Agreement <<'EOF'
[[[Person1 | Person2] " " Number] |
 [Person3 " " Number " " Gender3]]
EOF

# For Gender 15 we have only singular subject marker, no plural
# and no object markers. Missing info?

# SubjAgr removes the Agreement strings that contain both Pl and 15;
# ObjAgr removes every string that contains 15.  Agr uses the neutral
# label "Func".  "$FORMAT" is quoted so that it is passed as one argument.
printf '%s\n' '[Sub ":" Agreement]  - [$Pl & $15]' | hfst-regexp2fst -f "$FORMAT" > SubjAgr
printf '%s\n' ' [Obj ":" Agreement] - $15 ' | hfst-regexp2fst -f "$FORMAT" > ObjAgr
printf '%s\n' '  [Func ":" Agreement]' | hfst-regexp2fst -f "$FORMAT" > Agr

#define Features [SubjAgr " " ObjAgr " "  Tense " " Polarity];

# Features: subject agreement, object agreement and tense, separated by
# spaces (Polarity from the commented definition above is not included).
printf '%s\n' ' [SubjAgr " " ObjAgr " " Tense]' | hfst-regexp2fst -f "$FORMAT" > Features

# VerbLex: a stem and its features wrapped as "<" stem "," features ">".
printf '%s\n' '"<" Stems "," Features ">" ' | hfst-regexp2fst -f "$FORMAT" > VerbLex

# Common singular agreement markers.
#
# Each rule inserts a marker immediately after "<" when the rest of the
# string contains the listed agreement features.  The rules are written
# against the neutral Agr ("Func") label.  "$FORMAT" is quoted so that
# it reaches hfst-regexp2fst as one argument.

printf '%s\n' '[[. .] -> {mo} || "<" _ [$[Agr & $Person3 & $Sg & $4]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr1
printf '%s\n' '[[. .] -> {li} || "<" _ [$[Agr & $Person3 & $Sg & $5]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr2
printf '%s\n' '[[. .] -> e  || "<" _ [$[Agr & $Person3 & $Sg & $[9a"."10a]]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr3
printf '%s\n' '[[. .] -> {lo} || "<" _ [$[Agr & $Person3 & $Sg & $[10|11]]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr4
printf '%s\n' '[[. .] -> {bo} || "<" _ [$[Agr & $Person3 & $Sg & $14]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr5

# Common plural agreement markers

printf '%s\n' '[[. .] -> {bo} || "<" _ [$[Agr & $Person2 & $Pl]]] ' | hfst-regexp2fst -f "$FORMAT" > RAgr6
printf '%s\n' '[[. .] -> {ba} || "<" _ [$[Agr & $Person3 & $Pl & $2]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr7
printf '%s\n' '[[. .] -> {mi} || "<" _ [$[Agr & $Person3 & $Pl & $4]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr8
printf '%s\n' '[[. .] -> {ma} || "<" _ [$[Agr & $Person3 & $Pl & $[5|6]]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr9
printf '%s\n' '[[. .] -> {bi}  || "<" _ [$[Agr & $Person3 & $Pl & $7]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr10
printf '%s\n' '[[. .] -> i  || "<" _ [$[Agr & $Person3 & $Pl & $[9a|10]]]]' | hfst-regexp2fst -f "$FORMAT" > RAgr11

# Rule Block 1
#
# Subject markers.  R101-R105 and R111 are specific to subject
# agreement; R106-R110 and R112-R117 reuse the common RAgr rules.
# "$FORMAT" is quoted so that it reaches hfst-regexp2fst as one argument.

# Singular specific subject markers

printf '%s\n' '[[. .] -> {na} || "<" _ [$[SubjAgr & $Person1 & $Sg]]] ' | hfst-regexp2fst -f "$FORMAT" > R101
printf '%s\n' '[[. .] -> o || "<" _ [$[SubjAgr & $Person2 & $Sg]]] ' | hfst-regexp2fst -f "$FORMAT" > R102
printf '%s\n' '[[. .] -> a || "<" _ [$[SubjAgr & $Person3 & $Sg & $2]]]' | hfst-regexp2fst -f "$FORMAT" > R103
printf '%s\n' '[[. .] -> e  || "<" _ [$[SubjAgr & $Person3 & $Sg & $7]]]' | hfst-regexp2fst -f "$FORMAT" > R104
printf '%s\n' '[[. .] -> {ei} || "<" _[$[SubjAgr & $Person3 & $Sg & $15]]]' | hfst-regexp2fst -f "$FORMAT" > R105

# Rules of referral for singular subject markers
# (xfst symbol-substitution notation; presumably "Func" in the common
# rule is replaced by "Sub" -- confirm against the HFST documentation.
# The backquote is literal here because the text is single-quoted.)

printf '%s\n' '`[RAgr1, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R106
printf '%s\n' '`[RAgr2, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R107
printf '%s\n' '`[RAgr3, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R108
printf '%s\n' '`[RAgr4, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R109
printf '%s\n' '`[RAgr5, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R110

# Plural specific subject markers

printf '%s\n' '[[. .] -> {to} || "<" _ [$[SubjAgr & $Person1 & $Pl]]] ' | hfst-regexp2fst -f "$FORMAT" > R111

# Rules of referral for plural subject markers.

printf '%s\n' '`[RAgr6, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R112
printf '%s\n' '`[RAgr7, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R113
printf '%s\n' '`[RAgr8, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R114
printf '%s\n' '`[RAgr9, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R115
printf '%s\n' '`[RAgr10, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R116
printf '%s\n' '`[RAgr11, Func, Sub]' | hfst-regexp2fst -f "$FORMAT" > R117

# R201: insert "ko" immediately after "<" when the features contain
# Fut:Immed.  "$FORMAT" is quoted so that it is passed as one argument.
printf '%s\n' '[[. .] -> {ko} || "<" _ [$[Fut":"Immed]]] ' | hfst-regexp2fst -f "$FORMAT" > R201

# Object markers.  R301-R304 and R310 are specific to object agreement;
# R305-R309 and R311-R316 reuse the common RAgr rules.  "$FORMAT" is
# quoted so that it reaches hfst-regexp2fst as one argument.

# Singular specific object markers

printf '%s\n' '[[. .] -> n || "<" _ [$[ObjAgr & $Person1 & $Sg]]] ' | hfst-regexp2fst -f "$FORMAT" > R301
printf '%s\n' '[[. .] -> {ko} || "<" _ [$[ObjAgr & $Person2 & $Sg]]] ' | hfst-regexp2fst -f "$FORMAT" > R302
printf '%s\n' '[[. .] -> {mo} || "<" _ [$[ObjAgr & $Person3 & $Sg & $2]]]' | hfst-regexp2fst -f "$FORMAT" > R303
printf '%s\n' '[[. .] -> {ei}  || "<" _ [$[ObjAgr & $Person3 & $Sg & $7]]]' | hfst-regexp2fst -f "$FORMAT" > R304

# Rules of referral for singular object markers
# (xfst symbol-substitution notation; presumably "Func" in the common
# rule is replaced by "Obj" -- confirm against the HFST documentation.
# The backquote is literal here because the text is single-quoted.)

printf '%s\n' '`[RAgr1, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R305
printf '%s\n' '`[RAgr2, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R306
printf '%s\n' '`[RAgr3, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R307
printf '%s\n' '`[RAgr4, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R308
printf '%s\n' '`[RAgr5, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R309

# Plural specific object markers

printf '%s\n' '[[. .] -> {lo} || "<" _ [$[ObjAgr & $Person1 & $Pl]]] ' | hfst-regexp2fst -f "$FORMAT" > R310

# Rules of referral for plural object markers

printf '%s\n' '`[RAgr6, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R311
printf '%s\n' '`[RAgr7, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R312
printf '%s\n' '`[RAgr8, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R313
printf '%s\n' '`[RAgr9, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R314
printf '%s\n' '`[RAgr10, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R315
printf '%s\n' '`[RAgr11, Func, Obj]' | hfst-regexp2fst -f "$FORMAT" > R316

# Tense rules
#
# Unlike the prefix rules, these insert material immediately before the
# "," that separates the stem from its features.  "$FORMAT" is quoted so
# that it reaches hfst-regexp2fst as one argument.

# R401: "ak" for Pres:Hab1, Pres:Hab2, Past:Hist and Past:MostRem.
hfst-regexp2fst -f "$FORMAT" > R401 <<'EOF'
[[. .] -> {ak} || _ "," [$[Pres":"[Hab1|Hab2]|
                                 Past":"[Hist|MostRem]]]]
EOF

# R402: "a" for Pres:Cont and Fut:Immed.
printf '%s\n' '[[. .] -> a    || _ "," [$[Pres":"Cont|Fut":"Immed]]]' | hfst-regexp2fst -f "$FORMAT" > R402

# R501: "i" for Fut:MostRem, Past:Rec and Past:Hist.
printf '%s\n' '[[. .] -> i || _ "," [$[Fut":"MostRem|Past":"[Rec|Hist]]]]' | hfst-regexp2fst -f "$FORMAT" > R501

# Eliminate Features and auxiliary symbols from the lower side.
#
# printf '%s\n' is used instead of echo because the expression starts
# with a backslash, and some echo implementations interpret backslash
# sequences.  "$FORMAT" is quoted so that it is passed as one argument.
# NOTE(review): "L" is meant to be the letter set compiled earlier on
# this page; confirm that hfst-regexp2fst resolves the bare name to that
# file rather than reading it as the single symbol "L".

printf '%s\n' '\L -> 0' | hfst-regexp2fst -f "$FORMAT" > Cleanup

# Build the Lingala Verb Transducer
#
# The lexicon is composed with the rule files in a fixed order: object
# markers (R3xx), the R201 prefix, subject markers (R1xx), the tense
# suffix rules, and finally Cleanup.  Every prefix rule inserts directly
# after "<", so material from rules composed later ends up to the left
# of material inserted earlier.
#
# The expression is compiled with hfst-regexp2fst, the same tool used
# for every other definition on this page; hfst-fst2regexp goes in the
# opposite direction (transducer to regular expression).
# NOTE(review): VerbLex, the R* names and Cleanup are bare names here;
# confirm that hfst-regexp2fst resolves them to the files written above.

hfst-regexp2fst -f "$FORMAT" > Lingala <<'EOF'
VerbLex
        .o.
   R301 .o. R302 .o. R303 .o. R304 .o. R305 .o.
   R306 .o. R307 .o. R308 .o. R309
        .o.
   R310 .o. R311 .o. R312 .o. R313 .o. R314 .o.
   R315 .o. R316
        .o.
        R201
        .o.
   R101 .o. R102 .o. R103 .o. R104 .o. R105 .o.
   R106 .o. R107 .o. R108 .o. R109 .o. R110
        .o.
   R111 .o. R112 .o. R113 .o. R114 .o. R115 .o.
   R116 .o. R117
        .o.
   R401 .o. R402 .o. R501
        .o.
       Cleanup
EOF


-- ErikAxelson - 2011-09-26