172 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			172 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 | |
| # Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
 | |
| # Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
 | |
| #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 | |
| 
 | |
| from __future__ import print_function
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import argparse
 | |
| import codecs
 | |
| import re
 | |
| import sys
 | |
| 
 | |
# True when running under Python 2; main() uses it to decide whether the
# standard streams must be accessed through their ``.buffer`` attribute.
is_python2 = sys.version_info[0] == 2
 | |
| 
 | |
| 
 | |
def exist_or_not(i, match_pos):
    """Find the first span in ``match_pos`` that covers index ``i``.

    Args:
        i: Index to look up.
        match_pos: Iterable of ``[start, end)`` pairs (end is exclusive).

    Returns:
        ``(start, end)`` of the first span with ``start <= i < end``, or
        ``(None, None)`` when no span covers ``i``.
    """
    for span in match_pos:
        lower, upper = span[0], span[1]
        if lower <= i < upper:
            return lower, upper
    return None, None
 | |
| 
 | |
def seg_char(sent):
    """Split ``sent`` so that each character in U+4E00..U+9FA5 stands alone.

    Text between those characters is kept as a single piece (not stripped);
    pieces that are empty or whitespace-only are dropped.

    Args:
        sent: Input string.

    Returns:
        List of string pieces in their original order.
    """
    # The capturing group makes re.split keep the matched characters.
    pieces = re.split(r'([\u4e00-\u9fa5])', sent)
    return [piece for piece in pieces if piece.strip()]
 | |
| 
 | |
def get_parser():
    """Build the command-line parser for the raw-text-to-token converter.

    Returns:
        argparse.ArgumentParser: parser defining ``--nchar``,
        ``--skip-ncols``, ``--space``, ``--bpe-model``, ``--non-lang-syms``,
        the optional positional ``text`` and ``--trans_type``.
    """
    parser = argparse.ArgumentParser(
        description='convert raw text to tokenized text',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--nchar',
                        '-n',
                        default=1,
                        type=int,
                        help='number of characters to split, i.e., '
                        'aabb -> a a b b with -n 1 and aa bb with -n 2')
    parser.add_argument('--skip-ncols',
                        '-s',
                        default=0,
                        type=int,
                        help='skip first n columns')
    parser.add_argument('--space',
                        default='<space>',
                        type=str,
                        help='space symbol')
    parser.add_argument('--bpe-model',
                        '-m',
                        default=None,
                        type=str,
                        help='bpe model for english part')
    parser.add_argument('--non-lang-syms',
                        '-l',
                        default=None,
                        type=str,
                        help='list of non-linguistic symbols,'
                        ' e.g., <NOISE> etc.')
    # The falsy default is kept as-is: callers test ``args.text`` for
    # truthiness to decide between reading a file and reading stdin.
    parser.add_argument('text',
                        type=str,
                        default=False,
                        nargs='?',
                        help='input text')
    parser.add_argument('--trans_type',
                        '-t',
                        type=str,
                        default="char",
                        choices=["char", "phn", "cn_char_en_bpe"],
                        help="""Transcript type. char/phn/cn_char_en_bpe.
                             e.g., for TIMIT FADG0_SI1279 -
                             If trans_type is char, read from
                             SI1279.WRD file -> "bricks are an alternative"
                             Else if trans_type is phn,
                             read from SI1279.PHN file ->
                             "sil b r ih sil k s aa r er n aa l
                             sil t er n ih sil t ih v sil"
                             Else if trans_type is cn_char_en_bpe,
                             Chinese characters are split one by one and
                             English words are encoded with the bpe model""")
    return parser
 | |
| 
 | |
| 
 | |
def _find_symbol_spans(text, patterns):
    """Return the [start, end) span of every pattern occurrence in text."""
    spans = []
    for pattern in patterns:
        pos = 0
        while True:
            m = pattern.search(text, pos)
            if m is None:
                break
            spans.append([m.start(), m.end()])
            # Patterns are never empty (see main), so this always advances.
            pos = m.end()
    return spans


def _split_keeping_symbols(text, spans):
    """Split text into (piece, is_symbol) pairs: whole symbols or one char."""
    pieces = []
    i = 0
    while i < len(text):
        start_pos, end_pos = exist_or_not(i, spans)
        if start_pos is not None:
            pieces.append((text[start_pos:end_pos], True))
            i = end_pos
        else:
            pieces.append((text[i], False))
            i += 1
    return pieces


def _cn_char_en_bpe_tokens(text, encode_english):
    """Tokenize text: CJK characters singly, ASCII-alphabetic words via BPE."""
    tokens = []
    for segment in seg_char(text):
        # we use "▁" to instead of blanks among english words
        # warning: here is "▁", not "_"
        for word in segment.strip().split("▁"):
            # bytes.isalpha() is true only for non-empty pure ASCII letters.
            if word.encode('UTF-8').isalpha():
                tokens.extend(encode_english(word))
            else:
                tokens.append(word)
    return tokens


def main():
    """Tokenize each input line and print it to stdout as UTF-8.

    Input is read from the ``text`` file argument, or from stdin when it is
    not given.  For every line the first ``--skip-ncols`` columns are echoed
    unchanged, followed by a single space (emitted even when no column is
    skipped), followed by the space-joined tokens of the remaining text:

    * ``char``: non-linguistic symbols are kept whole, every other character
      is one unit, and units are grouped ``--nchar`` at a time.
    * ``phn``: the text is split on single spaces and ``sil`` is replaced by
      the ``--space`` symbol.  Symbols need no special handling here because
      whitespace-delimited tokens are never broken up.
    * ``cn_char_en_bpe``: non-linguistic symbols are kept whole; the text
      between them is split per Chinese character and English words are
      encoded with the sentencepiece model.

    Spaces inside a token are replaced by the ``--space`` symbol.

    Side effects:
        Replaces ``sys.stdout`` with a UTF-8 writer.  Exits through
        ``parser.error`` if an English word must be BPE-encoded but no
        ``--bpe-model`` was supplied.
    """
    parser = get_parser()
    args = parser.parse_args()

    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as sym_f:
            nls = [x.rstrip() for x in sym_f.readlines()]
        # Blank lines are skipped: an empty pattern matches zero-width at
        # every position, so the span search would never advance.
        rs = [re.compile(re.escape(x)) for x in nls if x]

    sp = None
    if args.bpe_model is not None:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.load(args.bpe_model)

    def encode_english(word):
        # Checked lazily so pure-Chinese input keeps working without a model.
        if sp is None:
            parser.error('--bpe-model is required to encode English words '
                         'when --trans_type is cn_char_en_bpe')
        return sp.encode_as_pieces(word)

    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer)

    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer)
    n = args.nchar
    try:
        for line in f:
            x = line.split()
            print(' '.join(x[:args.skip_ncols]), end=" ")
            a = ' '.join(x[args.skip_ncols:])

            # Locate non-linguistic symbols; ``pieces`` stays None when the
            # line contains none of them.
            pieces = None
            match_pos = _find_symbol_spans(a, rs)
            if match_pos:
                pieces = _split_keeping_symbols(a, match_pos)

            if args.trans_type == "phn":
                tokens = a.split(" ")
            elif args.trans_type == "cn_char_en_bpe":
                if pieces is None:
                    tokens = _cn_char_en_bpe_tokens(a, encode_english)
                else:
                    tokens = []
                    run = []  # characters seen since the last symbol
                    for piece, is_symbol in pieces:
                        if not is_symbol:
                            run.append(piece)
                            continue
                        if run:
                            tokens.extend(_cn_char_en_bpe_tokens(
                                "".join(run), encode_english))
                            run = []
                        tokens.append(piece)
                    if run:
                        tokens.extend(_cn_char_en_bpe_tokens(
                            "".join(run), encode_english))
            else:
                # A unit is a whole symbol or a single character; without
                # symbols the string itself is sliced character-wise.
                if pieces is None:
                    units = a
                else:
                    units = [piece for piece, _ in pieces]
                tokens = ["".join(units[j:j + n])
                          for j in range(0, len(units), n)]

            a_chars = [z.replace(' ', args.space) for z in tokens]
            if args.trans_type == "phn":
                a_chars = [z.replace("sil", args.space) for z in a_chars]
            print(' '.join(a_chars))
    finally:
        # Only close what this function opened; stdin is left alone.
        if args.text:
            f.close()
 | |
| 
 | |
| 
 | |
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
 | 
