62 lines
2.8 KiB
Python
Executable File
62 lines
2.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# encoding: utf-8
|
|
|
|
import argparse
|
|
|
|
def build_parser():
    """Build the command-line parser for the length/ratio filter.

    Returns:
        An ``argparse.ArgumentParser`` exposing the input/output paths and the
        six exclusive bounds (input length, output length, output/input ratio).
    """
    parser = argparse.ArgumentParser(
        description='remove too long or too short data in format.data')
    # Both paths are mandatory: without them open() would fail with an
    # opaque TypeError instead of a usage message.
    parser.add_argument('--data_file',
                        type=str,
                        required=True,
                        help='input format data')
    parser.add_argument('--output_data_file',
                        type=str,
                        required=True,
                        help='output format data')
    parser.add_argument('--min_input_len', type=float,
                        default=0,
                        help='minimum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--max_input_len', type=float,
                        default=20,
                        help='maximum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--min_output_len', type=float,
                        default=0,
                        help='minimum output seq length, in modeling units')
    parser.add_argument('--max_output_len', type=float,
                        default=500,
                        help='maximum output seq length, in modeling units')
    parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
                        help='minimum output seq length/input seq length '
                             'ratio')
    parser.add_argument('--max_output_input_ratio', type=float, default=10,
                        help='maximum output seq length/input seq length '
                             'ratio')
    return parser


def shape_length(shape_field):
    """Return the leading number of a shape field as a float.

    The value is the text between the first ':' and the first following ','
    (e.g. ``'feature_shape:123,40'`` -> ``123.0``).

    Raises:
        IndexError: if the field contains no ':'.
        ValueError: if the extracted text is not a number.
    """
    return float(shape_field.split(':')[1].split(',')[0])


def is_within_limits(feat_len, token_len, limits):
    """Tell whether one record satisfies every configured bound.

    All bounds are exclusive (strict ``<`` / ``>``).

    Args:
        feat_len: input sequence length of the record.
        token_len: output sequence length of the record.
        limits: object carrying ``min_input_len``, ``max_input_len``,
            ``min_output_len``, ``max_output_len``,
            ``min_output_input_ratio`` and ``max_output_input_ratio``
            (the namespace returned by ``build_parser().parse_args()``).

    Returns:
        True if the record should be kept, False otherwise.
    """
    if not limits.min_input_len < feat_len < limits.max_input_len:
        return False
    if not limits.min_output_len < token_len < limits.max_output_len:
        return False
    # A zero-length input has no defined output/input ratio; reject it rather
    # than letting the division below raise ZeroDivisionError.
    if feat_len == 0:
        return False
    ratio = token_len / feat_len
    return (limits.min_output_input_ratio < ratio
            < limits.max_output_input_ratio)


def filter_lines(lines, limits):
    """Yield the stripped, non-empty records that pass ``is_within_limits``.

    Each record is tab-separated; column 2 holds the feature shape and
    column 6 the token shape (0-based), both parsed with ``shape_length``.

    Args:
        lines: iterable of raw text lines.
        limits: bounds object as accepted by ``is_within_limits``.

    Raises:
        IndexError: if a non-empty line has fewer than seven columns.
    """
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        items = line.split('\t')
        feat_len = shape_length(items[2])
        token_len = shape_length(items[6])
        if is_within_limits(feat_len, token_len, limits):
            yield line


def main(argv=None):
    """Filter ``--data_file`` into ``--output_data_file``.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    # The file header declares UTF-8, so do not depend on the locale default.
    with open(args.data_file, 'r', encoding='utf-8') as fin, \
            open(args.output_data_file, 'w', encoding='utf-8') as fout:
        for line in filter_lines(fin, args):
            fout.write('{}\n'.format(line))


if __name__ == '__main__':
    main()