#!/bin/bash

# Positional arguments
lm_src=$1            # training text for ngram-count
lm_tgt_dir=$2        # output directory for the LM
dict_type=$3         # dictionary type: 'phn' or 'char'
lm_order=$4          # n-gram order of the LM
prune_threshold=$5   # pruning threshold; 0 keeps the full LM
dict=$6              # dictionary/lexicon file (words in the first column)

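# Basic argument check (added guard; not in the original script)
if [ $# -ne 6 ]; then
  echo "Usage: $0 <lm_src> <lm_tgt_dir> <dict_type: phn|char> <lm_order> <prune_threshold> <dict>"
  exit 1
fi
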
[ ! -f "$dict" ] && echo "No such file $dict" && exit 1

# Check that the SRILM tools are available
if ! which ngram-count > /dev/null; then
  echo "SRILM tools were not found; please download and install them from:"
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH."
  exit 1
fi

mkdir -p "$lm_tgt_dir" || exit 1
echo "$lm_tgt_dir"

if [ "$dict_type" = "phn" ]; then
  # Phone dictionary: remove stress markers before building the vocabulary
  python local/remove_stress_marker.py \
    "$dict" "$lm_tgt_dir/dict"
  dict=$lm_tgt_dir/dict
elif [ "$dict_type" = "char" ]; then
  cp "$dict" "$lm_tgt_dir/dict"
fi

# Unique words: sort -u (rather than bare uniq) so that non-adjacent
# duplicates are also removed from the vocabulary
awk '{print $1}' "$dict" | sort -u > "$lm_tgt_dir/lexicons.txt"

# Train an LM of order $lm_order (not necessarily 3-gram) with Witten-Bell
# discounting and interpolation, limiting the vocabulary to the lexicon and
# mapping out-of-vocabulary words to <unk>
ngram-count -debug 1 -order "$lm_order" -wbdiscount -interpolate \
  -unk -map-unk "<unk>" -limit-vocab -vocab "$lm_tgt_dir/lexicons.txt" \
  -text "$lm_src" -lm "$lm_tgt_dir/lm_orig.arpa"

# Prune the LM; a threshold of 0 means no pruning
if [ "$prune_threshold" = "0" ]; then
  ln -sf lm_orig.arpa "$lm_tgt_dir/lm.arpa"
else
  ngram -prune "$prune_threshold" -order "$lm_order" \
    -lm "$lm_tgt_dir/lm_orig.arpa" -write-lm "$lm_tgt_dir/lm_pruned.arpa"
  ln -sf lm_pruned.arpa "$lm_tgt_dir/lm.arpa"
fi

#rm $lm_tgt_dir/webTextSentences_uppercase.txt
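
# Example invocation (illustrative paths only, not taken from this repo):
#   bash path/to/this_script.sh data/train/text exp/lm phn 3 1e-7 data/local/lexicon.txt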