167 lines
5.2 KiB
Bash
Executable File
167 lines
5.2 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
|
# Mobvoi Corporation (Author: Di Wu)
|
|
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
|
|
|
echo "$0 $*" >&2 # Print the command line for logging
|
|
. ./path.sh
|
|
|
|
nj=1
|
|
cmd=run.pl
|
|
nlsyms=""
|
|
lang=""
|
|
feat=""
|
|
feat_type="kaldi"
|
|
oov="<unk>"
|
|
bpecode=""
|
|
allow_one_column=false
|
|
raw=""
|
|
verbose=0
|
|
trans_type=char
|
|
filetype=""
|
|
preprocess_conf=""
|
|
category=""
|
|
out="" # If omitted, write in stdout
|
|
help_message=$(cat << EOF
|
|
Usage: $0 <data-dir> <dict>
|
|
e.g. $0 data/train data/lang_1char/train_units.txt
|
|
Options:
|
|
--nj <nj> # number of parallel jobs
|
|
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
|
|
--feat <feat-scp> # feat.scp or feat1.scp,feat2.scp,...
|
|
--feat-type <feat-type> # kaldi or wav
|
|
--oov <oov-word> # Default: <unk>
|
|
--out <outputfile> # If omitted, write in stdout
|
|
--filetype <mat|hdf5|sound.hdf5> # Specify the format of feats file
|
|
--preprocess-conf <json> # Apply preprocess to feats when creating shape.scp
|
|
--verbose <num> # Default: 0
|
|
EOF
|
|
)
|
|
. tools/parse_options.sh
|
|
|
|
if [ $# != 2 ]; then
|
|
echo "${help_message}" 1>&2
|
|
exit 1;
|
|
fi
|
|
|
|
set -euo pipefail
|
|
|
|
dir=$1
|
|
dic=$2
|
|
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
|
|
#trap 'rm -rf ${tmpdir}' EXIT
|
|
|
|
# 1. Create scp files for inputs
|
|
# These are not necessary for decoding mode, and make it as an option
|
|
input=
|
|
if [ -n "${feat}" ]; then
|
|
_feat_scps=$(echo "${feat}" | tr ',' ' ' )
|
|
read -r -a feat_scps <<< $_feat_scps
|
|
num_feats=${#feat_scps[@]}
|
|
|
|
for (( i=1; i<=num_feats; i++ )); do
|
|
feat=${feat_scps[$((i-1))]}
|
|
mkdir -p ${tmpdir}/input_${i}
|
|
input+="input_${i} "
|
|
cat ${feat} > ${tmpdir}/input_${i}/feat.scp
|
|
|
|
# Dump in the "legacy" style JSON format
|
|
if [ -n "${filetype}" ]; then
|
|
awk -v filetype=${filetype} '{print $1 " " filetype}' ${feat} \
|
|
> ${tmpdir}/input_${i}/filetype.scp
|
|
fi
|
|
|
|
if [ ${feat_type} == "kaldi" ]; then
|
|
tools/feat_to_shape.sh --cmd "${cmd}" --nj ${nj} \
|
|
--filetype "${filetype}" \
|
|
--preprocess-conf "${preprocess_conf}" \
|
|
--verbose ${verbose} ${feat} ${tmpdir}/input_${i}/shape.scp
|
|
elif [ ${feat_type} == "wav" ] || [ ${feat_type} == "flac" ] || [ ${feat_type} == "opus" ]; then
|
|
if [ -f $dir/segments ]; then
|
|
# used for segmented wav.scp
|
|
awk '{print $1" "$4-$3}' $dir/segments > $dir/utt2dur
|
|
fi
|
|
if [ ! -f $dir/utt2dur ]; then
|
|
tools/wav_to_duration.sh --nj ${nj} \
|
|
${feat} ${tmpdir}/input_${i}/shape.scp
|
|
# use the existed utt2dur as shape.scp directly
|
|
else
|
|
cp $dir/utt2dur ${tmpdir}/input_${i}/shape.scp
|
|
fi
|
|
fi
|
|
done
|
|
fi
|
|
|
|
# 2. Create scp files for outputs
|
|
mkdir -p ${tmpdir}/output
|
|
if [ -n "${bpecode}" ]; then
|
|
if [ "${trans_type}" == "cn_char_en_bpe" ]; then
|
|
tools/text2token.py -s 1 -n 1 -m ${bpecode} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
|
|
else
|
|
paste -d " " <(awk '{print $1}' ${dir}/text) <(cut -f 2- -d" " ${dir}/text \
|
|
| tools/spm_encode --model=${bpecode} --output_format=piece) \
|
|
> ${tmpdir}/output/token.scp
|
|
fi
|
|
elif [ -n "${nlsyms}" ]; then
|
|
tools/text2token.py -s 1 -n 1 -l ${nlsyms} ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
|
|
elif [ -n "${raw}" ]; then
|
|
cat $dir/text > ${tmpdir}/output/token.scp
|
|
else
|
|
tools/text2token.py -s 1 -n 1 ${dir}/text --trans_type ${trans_type} > ${tmpdir}/output/token.scp
|
|
fi
|
|
< ${tmpdir}/output/token.scp tools/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
|
|
odim=$(cat ${dic} | wc -l)
|
|
< ${tmpdir}/output/tokenid.scp awk -v odim=${odim} '{print $1 " " NF-1 "," odim}' > ${tmpdir}/output/shape.scp
|
|
|
|
cat ${dir}/text > ${tmpdir}/output/text.scp
|
|
|
|
# 3. Create scp files for the others
|
|
mkdir -p ${tmpdir}/other
|
|
if [ -n "${lang}" ]; then
|
|
awk -v lang=${lang} '{print $1 " " lang}' ${dir}/text > ${tmpdir}/other/lang.scp
|
|
fi
|
|
|
|
if [ -n "${category}" ]; then
|
|
awk -v category=${category} '{print $1 " " category}' ${dir}/text \
|
|
> ${tmpdir}/other/category.scp
|
|
fi
|
|
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp
|
|
|
|
# 4. Merge scp files into a one file
|
|
opts=""
|
|
for intype in ${input} output other; do
|
|
if [ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]; then
|
|
continue
|
|
fi
|
|
|
|
if [ ${intype} != other ]; then
|
|
opts+="--${intype%_*}-scps "
|
|
else
|
|
opts+="--scps "
|
|
fi
|
|
|
|
for x in "${tmpdir}/${intype}"/*.scp; do
|
|
k=$(basename ${x} .scp)
|
|
if [ ${k} = shape ]; then
|
|
opts+="shape:${x}:shape "
|
|
else
|
|
opts+="${k}:${x} "
|
|
fi
|
|
done
|
|
done
|
|
|
|
if ${allow_one_column}; then
|
|
opts+="--allow-one-column true "
|
|
else
|
|
opts+="--allow-one-column false "
|
|
fi
|
|
|
|
if [ -n "${out}" ]; then
|
|
opts+="-O ${out}"
|
|
fi
|
|
|
|
tools/merge_scp2txt.py --verbose ${verbose} ${opts}
|
|
|
|
#rm -fr ${tmpdir}
|