60 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
		
		
			
		
	
	
			60 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
#!/bin/bash

# koried, 10/29/2012

# Reduce a data set based on a list of turn-ids.
#
# Positional arguments:
#   srcdir    source data directory; must contain utt2spk
#   turnlist  list of ids to keep (handed to utils/filter_scp.pl further down)
#   destdir   directory that receives the reduced data set

help_message="usage: $0 srcdir turnlist destdir"

# "${1:-}" keeps the test well-formed when the script is run without arguments.
if [ "${1:-}" == "--help" ]; then
    echo "${help_message}"
    exit 0
fi

if [ $# -ne 3 ]; then
    echo "${help_message}" >&2
    exit 1
fi

srcdir=$1
reclist=$2
destdir=$3

if [ ! -f "${srcdir}/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk" >&2
    exit 1
fi

#######################################
# Apply the selection already written to ${destdir}/utt2spk and
# ${destdir}/spk2utt to every other data file present in ${srcdir}.
# Globals:   srcdir (read), destdir (read)
# Requires:  ${destdir}/utt2spk and ${destdir}/spk2utt already exist, and
#            utils/filter_scp.pl is reachable relative to the current directory.
# Outputs:   filtered files in ${destdir}; a one-line summary on stdout.
#######################################
do_filtering() {
    local name reco_list src_count dest_count

    # Files filtered against the ids in utt2spk.
    for name in feats.scp text utt2num_frames; do
        if [[ -f "${srcdir}/${name}" ]]; then
            utils/filter_scp.pl "${destdir}/utt2spk" \
                <"${srcdir}/${name}" >"${destdir}/${name}"
        fi
    done

    # Files filtered against the ids in spk2utt.
    for name in spk2gender cmvn.scp; do
        if [[ -f "${srcdir}/${name}" ]]; then
            utils/filter_scp.pl "${destdir}/spk2utt" \
                <"${srcdir}/${name}" >"${destdir}/${name}"
        fi
    done

    if [[ -f "${srcdir}/segments" ]]; then
        utils/filter_scp.pl "${destdir}/utt2spk" \
            <"${srcdir}/segments" >"${destdir}/segments"

        # Second column of the filtered segments file = the recordings in use.
        reco_list="${destdir}/reco"
        awk '{print $2;}' "${destdir}/segments" | sort -u >"${reco_list}"

        # When a segments file exists, wav.scp is filtered by the recording
        # list rather than by utt2spk; the same list is applied to
        # reco2file_and_channel and to the STM file used for sclite scoring
        # (original note: filtering also drops the STM comment lines).
        for name in wav.scp reco2file_and_channel stm; do
            if [[ -f "${srcdir}/${name}" ]]; then
                utils/filter_scp.pl "${reco_list}" \
                    <"${srcdir}/${name}" >"${destdir}/${name}"
            fi
        done
        rm -- "${reco_list}"
    elif [[ -f "${srcdir}/wav.scp" ]]; then
        # No segments file: wav.scp is filtered against utt2spk directly.
        utils/filter_scp.pl "${destdir}/utt2spk" \
            <"${srcdir}/wav.scp" >"${destdir}/wav.scp"
    fi

    src_count=$(wc -l <"${srcdir}/utt2spk")
    dest_count=$(wc -l <"${destdir}/utt2spk")
    echo "Reduced #utt from ${src_count} to ${dest_count}"
}

# Create the destination directory; nothing below can succeed without it.
mkdir -p "${destdir}" || exit 1

# Build the reduced utt2spk by filtering the source utt2spk with the id list
# given as the second argument, then derive spk2utt from the result.
# Each step aborts the script on failure so later steps never run on
# missing or partial files.
utils/filter_scp.pl "${reclist}" <"${srcdir}/utt2spk" >"${destdir}/utt2spk" || exit 1

utils/utt2spk_to_spk2utt.pl <"${destdir}/utt2spk" >"${destdir}/spk2utt" || exit 1

# Propagate the selection to the remaining data files.
do_filtering