HFST: Finnish Prosody

NOTE: The acute accent character ´ may be displayed as the HTML entity &acute; inside the verbatim sections, probably due to a bug in the KitWiki formalism.

We exemplify the use of the HFST command line tools with an example taken from Beesley & Karttunen that maps Finnish words into a prosodic representation. For more information on the representation, see the original solution. $FORMAT is the implementation type of the transducer. The solution given on this page can also be executed with a single script.

The data:

# Build the lexicon: the example words, one per line, are read from standard
# input and disjuncted (-j) into a single transducer of type $FORMAT.
# The words must keep their Finnish letters (ä, ö); the vowel classes and
# stress rules refer to them explicitly.
hfst-strings2fst -f "$FORMAT" -j > FinnWords.hfst <<'EOF'
kalastelet
kalasteleminen
ilmoittautuminen
järjestelemättömyydestänsä
kalastelemme
ilmoittautumisesta
järjestelmällisyydelläni
järjestelmällistämätöntä
voimisteluttelemasta
opiskelija
opettamassa
kalastelet
strukturalismi
onnittelemanikin
mäki
perijä
repeämä
ergonomia
puhelimellani
matematiikka
puhelimistani
rakastajattariansa
kuningas
kainostelijat
ravintolat
merkonomin
EOF

Some definitions:

# Each definition is compiled into its own transducer file; later expressions
# refer to an earlier one as @"NAME".

# Unstressed vowels, grouped by height.
echo "[u | y | i]" | hfst-regexp2fst -f "$FORMAT" > HighV                     # High vowel
echo "[e | o | ö]" | hfst-regexp2fst -f "$FORMAT" > MidV                      # Mid vowel
echo "[a | ä]" | hfst-regexp2fst -f "$FORMAT" > LowV                          # Low vowel
echo '[ @"HighV" | @"MidV" | @"LowV" ]' | hfst-regexp2fst -f "$FORMAT" > USV  # Unstressed Vowel

echo "[ b | c | d | f | g | h | j | k | l | m | n | p "\
     "| q | r | s | t | v | w | x | z ]" | hfst-regexp2fst -f "$FORMAT" > C   # Consonant

# Stressed vowels. Main stress is written with an acute accent, secondary
# stress with a grave accent. Where no precomposed letter is used (ä, ö, and
# y with grave) the accent follows the vowel as part of the same symbol.
echo '[á | é | í | ó | ú | ý | ä´ | ö´]' \
| hfst-regexp2fst -f "$FORMAT" > MSV                                          # Vowel with main stress
echo '[à | è | ì | ò | ù | y` | ä` | ö`]' \
| hfst-regexp2fst -f "$FORMAT" > SSV                                          # Vowel with secondary stress
echo '[ @"MSV" | @"SSV" ]' | hfst-regexp2fst -f "$FORMAT" > SV                # Stressed vowel
echo '[ @"USV" | @"SV" ]' | hfst-regexp2fst -f "$FORMAT" > V                  # Vowel

echo '[ @"V" | @"C" ]' | hfst-regexp2fst -f "$FORMAT" > P                     # Phone

echo '[[\@"P"+] | .#.]' | hfst-regexp2fst -f "$FORMAT" > B                    # Boundary
echo '[.#. | "."]' | hfst-regexp2fst -f "$FORMAT" > E                         # Edge
echo '[~$"." "." ~$"."]' | hfst-regexp2fst -f "$FORMAT" > SB                  # At most one syllable boundary

echo '[ @"C"* @"V" ]'| hfst-regexp2fst -f "$FORMAT" > Light                   # Light syllable
echo '[ @"Light" @"P"+ ]'| hfst-regexp2fst -f "$FORMAT" > Heavy               # Heavy syllable

echo '[ @"Heavy" | @"Light" ]' | hfst-regexp2fst -f "$FORMAT" > S             # Syllable
echo '[ @"S" & $@"SV" ]' | hfst-regexp2fst -f "$FORMAT" > SS                  # Stressed syllable
echo '[ @"S" & ~@"SV" ]' | hfst-regexp2fst -f "$FORMAT" > US                  # Unstressed syllable
echo '[ @"S" & $@"MSV" ]' | hfst-regexp2fst -f "$FORMAT" > MSS                # Syllable with main stress

echo '[ @"S" "." @"S" ]' | hfst-regexp2fst -f "$FORMAT" > BF                  # Binary foot

Rules for prosody:

# Each rule is passed to echo as several quoted pieces; echo joins them with
# single spaces, so hfst-regexp2fst receives the whole rule on one line.

# Insert a syllable boundary "." between two adjacent vowels that do not form
# a diphthong: a high or mid vowel followed by a low vowel (i.a, e.a), i
# followed by a mid vowel other than e, u followed by a mid vowel other than
# o, and y followed by a mid vowel other than ö (y.e).
echo '[ [. .] -> "." || [ @"HighV" | @"MidV" ]'\
  '_ @"LowV",'\
  'i _ [@"MidV" - e],'\
  'u _ [@"MidV" - o],'\
  'y _ [@"MidV" - ö] ]'\
| hfst-regexp2fst -f "$FORMAT" > MarkNonDiphtongs

# The general syllabification rule has exceptions. In particular, loan
# words such as ate.isti 'atheist' must be partially syllabified in the
# lexicon.

echo ' @"C"*  @"V"+ @"C"* @-> ... "." || _ @"C" @"V" ' \
| hfst-regexp2fst -f "$FORMAT" > Syllabify

# Wrap a binary foot plus a following light syllable in parentheses (a
# ternary foot) in the left and right contexts given after "//".
echo ' @"BF" "." @"Light" @-> "(" ... ")" '\
     '// [{).} | .#.] [@"BF" "."]*  _'\
     '["." @"Heavy" "." @"S" ] | .#. ' \
| hfst-regexp2fst -f "$FORMAT" > TernaryFeet

# Scan all the unfooted material into binary feet.

echo ' @"BF" @-> "(" ... ")" || .#.|"." _ .#.|"." ' | hfst-regexp2fst -f "$FORMAT" > BinaryFeet

# Assign the primary stress to the first vowel of the first syllable.

echo ' a -> á, e -> é, i -> í, o -> ó,'\
     'u -> ú, y -> ý, ä -> ä´, ö -> ö´ || .#. "(" @"C"* _' \
| hfst-regexp2fst -f "$FORMAT" > MainStress

# Assign secondary stress to all initial vowels of non-initial syllables.

echo ' a -> à, e -> è, i -> ì, o -> ò,'\
     'u -> ù, y -> y`, ä -> ä`, ö -> ö`'\
     '|| "(" @"C"* _ ' | hfst-regexp2fst -f "$FORMAT" > SecondaryStress

# Assign an optional secondary stress to an unfooted final syllable
# if it is heavy.

echo 'a (->) à, e (->) è, i (->) ì,'\
     'o (->) ò, u (->) ù, y (->) y`,'\
     'ä (->) ä`, ö (->) ö` || "." @"C"* _ @"P" .#. ' \
| hfst-regexp2fst -f "$FORMAT" > OptFinalStress

Calculate the composition of rules from MarkNonDiphtongs to OptFinalStress and compose the lexicon with the composition of rules.

# Compose the rule transducers in order of application, accumulating the
# result in the file Rules. MarkNonDiphtongs is the first rule, so it seeds
# the accumulator.
cp MarkNonDiphtongs Rules
# The backslash after "in" is required: without it the word list ends at the
# newline and the shell reports a syntax error at the first rule name.
for i in \
  Syllabify \
  TernaryFeet \
  BinaryFeet \
  MainStress \
  SecondaryStress \
  OptFinalStress
do
  # Rules so far (standard input) composed with the next rule; write to a
  # temporary file because Rules is still being read.
  hfst-compose "$i" < Rules > TMP
  mv TMP Rules
done

# Compose the lexicon with the combined rules. The lexicon was written to
# FinnWords.hfst when it was built.
hfst-compose Rules < FinnWords.hfst > FinnProsody

Print the lexicon with prosody indicated.

# Keep only the output side of FinnProsody and list its strings.
hfst-project -p output < FinnProsody | hfst-fst2strings

Here is the output:

(ón.nit).(tè.le).(mà.ni).kìn
(ón.nit).(tè.le).(mà.ni).kin
(ó.pet.ta).(màs.sa)
(ó.pis).(kè.li.ja)
(ér.go).(nò.mi.a)
(íl.moit).(tàu.tu).(mì.nen)
(íl.moit).(tàu.tu.mi).(sès.ta)
(vói.mis.te).(lùt.te.le).(màs.ta)
(strúk.tu.ra).(lìs.mi)
(rá.kas.ta).(jàt.ta.ri).(àn.sa)
(rá.vin).(tò.lat)
(ré.pe).(ä`.mä)
(pé.ri.jä)
(pú.he.li).(mèl.la.ni)
(pú.he.li).(mìs.ta.ni)
(mä´.ki)
(má.te.ma).(tìik.ka)
(mér.ko).(nò.min)
(kái.nos).(tè.li).jàt
(kái.nos).(tè.li).jat
(ká.las).(tè.let)
(ká.las).(tè.le).(mì.nen)
(ká.las.te).(lèm.me)
(kú.nin).gàs
(kú.nin).gas
(jä´r.jes).(tèl.mäl).(lìs.tä.mä).(tö`n.tä)
(jä´r.jes).(tèl.mäl.li).(sy`y.del).(lä`.ni)
(jä´r.jes).(tè.le).(mä`t.tö).(my`y.des).(tä`n.sä)


-- ErikAxelson - 2011-09-19