competition update
This commit is contained in:
61
language_model/tools/remove_longshortdata.py
Executable file
61
language_model/tools/remove_longshortdata.py
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python3
|
||||
# encoding: utf-8
|
||||
|
||||
import argparse
|
||||
|
||||
def parse_leading_dim(shape_field: str) -> float:
    """Return the first dimension stored in a shape field as a float.

    The value is taken from between the first ``:`` and the following ``,``
    (for example ``'shape:123,80'`` gives ``123.0``).

    Raises:
        IndexError: if the field contains no ``:``.
        ValueError: if the extracted text is not a number.
    """
    return float(shape_field.split(':')[1].split(',')[0])


def should_keep(feat_len: float, token_len: float,
                min_input_len: float, max_input_len: float,
                min_output_len: float, max_output_len: float,
                min_output_input_ratio: float,
                max_output_input_ratio: float) -> bool:
    """Tell whether a sample lies strictly inside every configured bound.

    Args:
        feat_len: input sequence length.
        token_len: output sequence length.
        min_input_len / max_input_len: exclusive bounds on ``feat_len``.
        min_output_len / max_output_len: exclusive bounds on ``token_len``.
        min_output_input_ratio / max_output_input_ratio: exclusive bounds on
            ``token_len / feat_len``.

    Returns:
        True when all six conditions hold, False otherwise.
    """
    if not (feat_len > min_input_len and feat_len < max_input_len):
        return False
    if not (token_len > min_output_len and token_len < max_output_len):
        return False
    # The output/input ratio is undefined for an empty input; drop the sample
    # instead of crashing with ZeroDivisionError.
    if feat_len == 0:
        return False
    ratio = token_len / feat_len
    return ratio > min_output_input_ratio and ratio < max_output_input_ratio


def iter_kept_lines(lines, min_input_len: float, max_input_len: float,
                    min_output_len: float, max_output_len: float,
                    min_output_input_ratio: float,
                    max_output_input_ratio: float):
    """Yield the stripped lines of ``lines`` whose lengths pass every bound.

    Each non-blank line is split on tabs; column 2 is read as the feature
    shape and column 6 as the token shape (presumably ``key:len,dim`` --
    confirm against the producer of the data file).  Blank lines are skipped.

    Raises:
        IndexError: if a non-blank line has fewer than seven tab-separated
            columns or a shape column has no ``:``.
        ValueError: if a shape column's leading dimension is not numeric.
    """
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        items = line.split('\t')
        feat_len = parse_leading_dim(items[2])
        token_len = parse_leading_dim(items[6])
        if should_keep(feat_len, token_len,
                       min_input_len, max_input_len,
                       min_output_len, max_output_len,
                       min_output_input_ratio, max_output_input_ratio):
            yield line


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the length filter."""
    parser = argparse.ArgumentParser(
        description='remove too long or too short data in format.data')
    # Both paths are mandatory: without them open() would fail on None.
    parser.add_argument('--data_file',
                        type=str,
                        required=True,
                        help='input format data')
    parser.add_argument('--output_data_file',
                        type=str,
                        required=True,
                        help='output format data')
    parser.add_argument('--min_input_len', type=float,
                        default=0,
                        help='minimum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--max_input_len', type=float,
                        default=20,
                        help='maximum input seq length, in seconds for raw '
                             'wav, in frame numbers for feature data')
    parser.add_argument('--min_output_len', type=float,
                        default=0,
                        help='minimum output seq length, in modeling units')
    parser.add_argument('--max_output_len', type=float,
                        default=500,
                        help='maximum output seq length, in modeling units')
    parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
                        help='minimum output seq length/input seq length '
                             'ratio')
    parser.add_argument('--max_output_input_ratio', type=float, default=10,
                        help='maximum output seq length/input seq length '
                             'ratio')
    return parser


def main() -> None:
    """Copy the lines of ``--data_file`` that pass every bound to
    ``--output_data_file``, one stripped line per output line."""
    args = build_parser().parse_args()

    with open(args.data_file, 'r') as fin, \
            open(args.output_data_file, 'w') as fout:
        kept = iter_kept_lines(fin,
                               args.min_input_len,
                               args.max_input_len,
                               args.min_output_len,
                               args.max_output_len,
                               args.min_output_input_ratio,
                               args.max_output_input_ratio)
        for line in kept:
            fout.write('{}\n'.format(line))


if __name__ == '__main__':
    main()
|
Reference in New Issue
Block a user