#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
# NOTE(review): these defaults are presumably overridden by parse_options.sh
# (e.g. --extra-files, as shown in the usage message) -- confirm against
# that script.
extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.

echo "$0 $*"  # Print the command line for logging

# Both helper scripts are optional: each is sourced only when a file of that
# name exists in the current working directory.
if [ -f path.sh ]; then . ./path.sh; fi
if [ -f parse_options.sh ]; then . parse_options.sh || exit 1; fi

# At least two positional arguments are required: the destination directory
# followed by one or more source directories.
if [ "$#" -lt 2 ]; then
  echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
  echo "Note, files that don't appear in all source dirs will not be combined,"
  echo "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
fi

# First positional argument is the destination; after the shift, "$@" holds
# only the source directories.
dest=$1
shift

# The destination is wiped below. Refuse to continue if it is the same
# directory as one of the sources, otherwise that input would be destroyed.
for dir in "$@"; do
  if [ "$dir" -ef "$dest" ]; then
    echo "$0: destination $dest is also a source directory; refusing to delete it"
    exit 1
  fi
done

# Start from an empty destination. rm's error output is discarded because the
# directory not existing yet is an expected case.
rm -r -- "$dest" 2>/dev/null
mkdir -p -- "$dest" || exit 1

# Byte-wise collation, so every 'sort' below orders entries the same way
# regardless of the caller's locale.
export LC_ALL=C

# Every source directory must provide utt2spk.
for dir in "$@"; do
  if [ ! -f "$dir/utt2spk" ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1
  fi
done

# Check that frame_shift are compatible, where present together with features.
# Only directories holding both feats.scp and frame_shift take part; each one
# is compared byte-for-byte with the previous such directory. The last
# participating directory is remembered in dir_with_frame_shift (left empty
# when there is none).
dir_with_frame_shift=
for dir in "$@"; do
  if [[ -f $dir/feats.scp && -f $dir/frame_shift ]]; then
    if [[ $dir_with_frame_shift ]] &&
       ! cmp -s -- "$dir_with_frame_shift/frame_shift" "$dir/frame_shift"; then
      echo "$0:error: different frame_shift in directories $dir and " \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1
    fi
    dir_with_frame_shift=$dir
  fi
done

# W.r.t. utt2uniq file the script has different behavior compared to other files
# it is not compulsory for it to exist in src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
has_utt2uniq=false
for in_dir in "$@"; do
  if [ -f "$in_dir/utt2uniq" ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  # pipefail plus the '|| exit 1' guards make a read failure in any source
  # directory abort the script instead of leaving a truncated file behind.
  set -o pipefail
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/utt2uniq" ]; then
      # we assume that utt2uniq is a one to one mapping
      # (input is redirected rather than passed as an operand so that a path
      # containing '=' is not taken by awk as a variable assignment)
      awk '{printf("%s %s\n", $1, $1);}' < "$in_dir/utt2spk" || exit 1
    else
      cat "$in_dir/utt2uniq" || exit 1
    fi
  done | sort -k1 > "$dest/utt2uniq" || exit 1
  set +o pipefail
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it
# Only the exact name 'utt2uniq' is dropped; other names that merely contain
# that substring are kept intact. $extra_files is intentionally unquoted: it
# is a whitespace-separated list of file names.
kept_extra_files=
for extra in $extra_files; do
  if [ "$extra" != utt2uniq ]; then
    kept_extra_files="$kept_extra_files $extra"
  fi
done
extra_files=$kept_extra_files

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in "$@"; do
  if [ -f "$in_dir/segments" ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  # pipefail plus the '|| exit 1' guards make a failure for any source
  # directory abort the script instead of leaving a truncated segments file.
  set -o pipefail
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/segments" ]; then
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      # The helper's stdout is taken as the segments lines for this directory.
      # NOTE(review): this helper lives under utils/ while the other helpers
      # this script calls live under tools/ -- confirm the path is intended.
      utils/data/get_segments_for_data.sh "$in_dir" || exit 1
    else
      cat "$in_dir/segments" || exit 1
    fi
  done | sort -k1 > "$dest/segments" || exit 1
  set +o pipefail
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

# Merge each standard file (plus any requested extra files) by concatenating
# it from all source directories and sorting the result. A file is combined
# only when every source directory has it. $extra_files is intentionally
# unquoted: it is a whitespace-separated list of file names.
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in "$@"; do
    if [ -f "$d/$file" ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
    fi
  done

  if ! $absent_somewhere; then
    # pipefail makes a failing 'cat' fail the whole pipeline; the '|| exit 1'
    # inside the subshell ensures a failure on ANY directory is reported, not
    # only a failure on the last one.
    set -o pipefail
    ( for f in "$@"; do cat "$f/$file" || exit 1; done ) | sort -k1 > "$dest/$file" || exit 1
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

# Derive spk2utt from the combined utt2spk; abort if the helper fails so a
# broken spk2utt is never left behind with a zero exit status.
tools/utt2spk_to_spk2utt.pl <"$dest/utt2spk" >"$dest/spk2utt" || exit 1

# Carry frame_shift over from the last source directory that had both
# feats.scp and frame_shift (all such directories were checked to agree).
if [[ $dir_with_frame_shift ]]; then
  cp -- "$dir_with_frame_shift/frame_shift" "$dest" || exit 1
fi

# Unless disabled via skip_fix, run fix_data_dir.sh on the destination and
# propagate its failure.
if ! $skip_fix ; then
  tools/fix_data_dir.sh "$dest" || exit 1
fi

exit 0