Files
b2txt25/language_model/examples/speech/s0/local/build_lm.sh
2025-07-02 12:18:09 -07:00

47 lines
1.3 KiB
Bash
Executable File

#!/bin/bash
# Build an n-gram language model with the SRILM command-line tools.
#
# Usage: build_lm.sh <lm_src> <lm_tgt_dir> <dict_type> <lm_order> \
#                    <prune_threshold> <dict>
#   lm_src           text corpus handed to `ngram-count -text`
#   lm_tgt_dir       output directory (created if missing)
#   dict_type        'phn' (dict is run through local/remove_stress_marker.py)
#                    or 'char' (dict is copied unchanged)
#   lm_order         n-gram order
#   prune_threshold  threshold for `ngram -prune`; the literal "0" skips pruning
#   dict             dictionary file; its first column becomes the vocabulary

# Refuse to run with missing positional arguments: an empty $dict would
# otherwise slip past the file check and fail much later in a confusing way.
if [ "$#" -lt 6 ]; then
  echo "Usage: $0 <lm_src> <lm_tgt_dir> <dict_type> <lm_order> <prune_threshold> <dict>" >&2
  exit 1
fi

lm_src=$1
lm_tgt_dir=$2
dict_type=$3
lm_order=$4
prune_threshold=$5
dict=$6

# Quoted so that an empty value or a path containing spaces is still tested
# as a single file name.
if [ ! -f "$dict" ]; then
  echo "No such file $dict" >&2
  exit 1
fi
# Check SRILM tools.
# `command -v` is a shell builtin, so the probe does not depend on the
# external `which` program being installed (a missing `which` would make the
# old check report SRILM as absent even when it is on PATH).
if ! command -v ngram-count > /dev/null 2>&1; then
  {
    echo "srilm tools are not found, please download it and install it from: "
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "Then add the tools to your PATH"
  } >&2
  exit 1
fi
# Create the output directory and stage the dictionary inside it.
mkdir -p "$lm_tgt_dir" || exit 1
echo "$lm_tgt_dir"

case "$dict_type" in
  phn)
    # Remove stress markers. The helper path is relative, so the script has
    # to be launched from the directory that contains local/.
    python local/remove_stress_marker.py \
      "$dict" "$lm_tgt_dir/dict" || exit 1
    # From here on use the processed copy as the dictionary.
    dict=$lm_tgt_dir/dict
    ;;
  char)
    # Keep a copy next to the LM; $dict itself keeps pointing at the source.
    cp -- "$dict" "$lm_tgt_dir/dict" || exit 1
    ;;
  # Any other dict_type: nothing is staged and $dict is used as given.
esac
# Unique words: take the first whitespace-separated field of every dictionary
# line and keep only its first occurrence (input order is preserved). Doing
# the de-duplication in awk also removes repeats that are not adjacent, which
# a bare `uniq` on unsorted input would miss.
awk '!seen[$1]++ { print $1 }' "$dict" > "$lm_tgt_dir/lexicons.txt" || exit 1
# n-gram LM
# Train on $lm_src with the vocabulary restricted to lexicons.txt
# (-limit-vocab -vocab); -unk / -map-unk name "<unk>" as the unknown-word
# token. The -gtNmin options set per-order count cutoffs (see the SRILM
# ngram-count manual). Abort on failure so that no symlink to a missing or
# partial ARPA file is created afterwards.
ngram-count -order "$lm_order" \
  -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -gt5min 1 -gt6min 1 \
  -unk -map-unk "<unk>" -limit-vocab -vocab "$lm_tgt_dir/lexicons.txt" \
  -text "$lm_src" -lm "$lm_tgt_dir/lm_orig.arpa" || exit 1
# Prune LM
# lm.arpa always ends up as a symlink to the model to use. The link targets
# are bare file names on purpose: they resolve relative to $lm_tgt_dir, so the
# directory can be moved without breaking the link.
echo "Prune LM with threshold $prune_threshold"
if [ "$prune_threshold" == "0" ]; then
  # Threshold is the literal string "0": use the unpruned model directly.
  ln -sf lm_orig.arpa "$lm_tgt_dir/lm.arpa" || exit 1
else
  # Stop if pruning fails instead of linking to a file that was never written.
  ngram -prune "$prune_threshold" -order "$lm_order" \
    -lm "$lm_tgt_dir/lm_orig.arpa" \
    -write-lm "$lm_tgt_dir/lm_pruned.arpa" || exit 1
  ln -sf lm_pruned.arpa "$lm_tgt_dir/lm.arpa" || exit 1
fi