167 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			167 lines
		
	
	
		
			5.2 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/bin/bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#                Mobvoi Corporation (Author: Di Wu)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Gather the per-utterance scp files of a data directory (features, shapes,
# tokens, token ids, optional language/category labels) in a scratch directory
# and pass them to tools/merge_scp2txt.py, which writes the merged result to
# stdout or to the file given with --out.
#
# Expects to be run from a directory that contains ./path.sh and tools/.

echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh

# Option defaults; tools/parse_options.sh (sourced below) is expected to
# override them from "--name value" command-line arguments.
nj=1
cmd=run.pl
nlsyms=""
lang=""
feat=""
feat_type="kaldi"
oov="<unk>"
bpecode=""
allow_one_column=false
raw=""
verbose=0
trans_type=char
filetype=""
preprocess_conf=""
category=""
out="" # If omitted, write in stdout
help_message=$(cat << EOF
Usage: $0 <data-dir> <dict>
e.g. $0 data/train data/lang_1char/train_units.txt
Options:
  --nj <nj>                                        # number of parallel jobs
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
  --feat <feat-scp>                                # feat.scp or feat1.scp,feat2.scp,...
  --feat-type <feat-type>                          # kaldi, wav, flac or opus
  --oov <oov-word>                                 # Default: <unk>
  --out <outputfile>                               # If omitted, write in stdout
  --filetype <mat|hdf5|sound.hdf5>                 # Specify the format of feats file
  --preprocess-conf <json>                         # Apply preprocess to feats when creating shape.scp
  --verbose <num>                                  # Default: 0
  --bpecode <model>                                # BPE model used to tokenize the text
  --nlsyms <file>                                  # Passed to text2token.py as -l (used when --bpecode is unset)
  --raw <non-empty>                                # Use the text as tokens as-is (used when --bpecode/--nlsyms are unset)
  --trans-type <type>                              # Default: char; forwarded to text2token.py
  --lang <lang>                                    # Label written for every utterance into lang.scp
  --category <category>                            # Label written for every utterance into category.scp
  --allow-one-column <true|false>                  # Default: false; forwarded to merge_scp2txt.py
EOF
)
. tools/parse_options.sh

if [[ $# -ne 2 ]]; then
    echo "${help_message}" 1>&2
    exit 1
fi

# Strict mode is enabled only after the helper scripts above have been sourced,
# matching the original ordering.
set -euo pipefail

dir=$1
dic=$2

# Scratch directory inside the data dir; removed on every exit path so that
# repeated runs do not accumulate tmp-XXXXX directories next to the data.
tmpdir=$(mktemp -d "${dir}/tmp-XXXXX")
trap 'rm -rf -- "${tmpdir:?}"' EXIT

# 1. Create scp files for inputs
#   These are not necessary for decoding mode, and make it as an option
input_dirs=()
if [[ -n "${feat}" ]]; then
    # --feat may list several scp files separated by commas.
    read -r -a feat_scps <<< "${feat//,/ }"
    num_feats=${#feat_scps[@]}

    for (( i = 1; i <= num_feats; i++ )); do
        feat_scp=${feat_scps[i-1]}
        in_dir="${tmpdir}/input_${i}"
        mkdir -p "${in_dir}"
        input_dirs+=("input_${i}")
        cat "${feat_scp}" > "${in_dir}/feat.scp"

        # Dump in the "legacy" style JSON format
        if [[ -n "${filetype}" ]]; then
            awk -v filetype="${filetype}" '{print $1 " " filetype}' "${feat_scp}" \
                > "${in_dir}/filetype.scp"
        fi

        if [[ "${feat_type}" == "kaldi" ]]; then
            tools/feat_to_shape.sh --cmd "${cmd}" --nj "${nj}" \
                --filetype "${filetype}" \
                --preprocess-conf "${preprocess_conf}" \
                --verbose "${verbose}" "${feat_scp}" "${in_dir}/shape.scp"
        elif [[ "${feat_type}" == "wav" || "${feat_type}" == "flac" \
                || "${feat_type}" == "opus" ]]; then
            if [[ -f "${dir}/segments" ]]; then
                # used for segmented wav.scp: duration = end - start.
                # Note that this (re)writes utt2dur in the data directory.
                awk '{print $1" "$4-$3}' "${dir}/segments" > "${dir}/utt2dur"
            fi
            if [[ -f "${dir}/utt2dur" ]]; then
                # use the existing utt2dur as shape.scp directly
                cp "${dir}/utt2dur" "${in_dir}/shape.scp"
            else
                tools/wav_to_duration.sh --nj "${nj}" \
                    "${feat_scp}" "${in_dir}/shape.scp"
            fi
        fi
    done
fi

# 2. Create scp files for outputs
out_dir="${tmpdir}/output"
mkdir -p "${out_dir}"
if [[ -n "${bpecode}" ]]; then
    if [[ "${trans_type}" == "cn_char_en_bpe" ]]; then
        tools/text2token.py -s 1 -n 1 -m "${bpecode}" "${dir}/text" \
            --trans_type "${trans_type}" > "${out_dir}/token.scp"
    else
        # Column 1 (utterance id) is kept; the rest of each line is encoded
        # by spm_encode and pasted back next to its id.
        paste -d " " <(awk '{print $1}' "${dir}/text") \
            <(cut -f 2- -d" " "${dir}/text" \
                | tools/spm_encode --model="${bpecode}" --output_format=piece) \
            > "${out_dir}/token.scp"
    fi
elif [[ -n "${nlsyms}" ]]; then
    tools/text2token.py -s 1 -n 1 -l "${nlsyms}" "${dir}/text" \
        --trans_type "${trans_type}" > "${out_dir}/token.scp"
elif [[ -n "${raw}" ]]; then
    cat "${dir}/text" > "${out_dir}/token.scp"
else
    tools/text2token.py -s 1 -n 1 "${dir}/text" \
        --trans_type "${trans_type}" > "${out_dir}/token.scp"
fi
tools/sym2int.pl --map-oov "${oov}" -f 2- "${dic}" \
    < "${out_dir}/token.scp" > "${out_dir}/tokenid.scp"

# Output dimension = number of lines in the dictionary. The arithmetic
# expansion strips the padding some wc implementations put around the count.
odim=$(wc -l < "${dic}")
odim=$((odim))
awk -v odim="${odim}" '{print $1 " " NF-1 "," odim}' \
    < "${out_dir}/tokenid.scp" > "${out_dir}/shape.scp"

cat "${dir}/text" > "${out_dir}/text.scp"

# 3. Create scp files for the others
other_dir="${tmpdir}/other"
mkdir -p "${other_dir}"
if [[ -n "${lang}" ]]; then
    awk -v lang="${lang}" '{print $1 " " lang}' "${dir}/text" \
        > "${other_dir}/lang.scp"
fi

if [[ -n "${category}" ]]; then
    awk -v category="${category}" '{print $1 " " category}' "${dir}/text" \
        > "${other_dir}/category.scp"
fi
#cat ${dir}/utt2spk > ${tmpdir}/other/utt2spk.scp

# 4. Merge scp files into a one file
# The argument list is built as an array so that paths containing whitespace
# reach merge_scp2txt.py as single arguments.
opts=()
# The ${arr[@]+...} form keeps `set -u` happy on older bash when no --feat was
# given and the array is empty.
for intype in ${input_dirs[@]+"${input_dirs[@]}"} output other; do
    # Skip groups for which no scp file was produced.
    if [[ -z "$(find "${tmpdir}/${intype}" -name "*.scp")" ]]; then
        continue
    fi

    if [[ "${intype}" != other ]]; then
        # input_<n> -> --input-scps, output -> --output-scps
        opts+=("--${intype%_*}-scps")
    else
        opts+=("--scps")
    fi

    for x in "${tmpdir}/${intype}"/*.scp; do
        k=${x##*/}
        k=${k%.scp}
        if [[ "${k}" == shape ]]; then
            opts+=("shape:${x}:shape")
        else
            opts+=("${k}:${x}")
        fi
    done
done

if [[ "${allow_one_column}" == true ]]; then
    opts+=(--allow-one-column true)
else
    opts+=(--allow-one-column false)
fi

if [[ -n "${out}" ]]; then
    opts+=(-O "${out}")
fi

tools/merge_scp2txt.py --verbose "${verbose}" "${opts[@]}"
