competition update
This commit is contained in:
2914
language_model/utils/build_ngram_lm.ipynb
Normal file
2914
language_model/utils/build_ngram_lm.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
114
language_model/utils/convert_number_to_words.py
Normal file
114
language_model/utils/convert_number_to_words.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import re
|
||||
import time
|
||||
|
||||
from tqdm import tqdm
|
||||
from num2words import num2words
|
||||
|
||||
from multiprocessing import Pool, Queue
|
||||
|
||||
def buf_count_newlines_gen(fname):
    """Count the newline bytes in a file without loading it into memory.

    The file is opened in binary mode and consumed through its raw
    (unbuffered) reader in 64 KiB chunks.

    Args:
        fname: Path of the file to scan.

    Returns:
        The number of ``\\n`` bytes found in the file.
    """
    chunk_size = 2 ** 16
    total = 0
    with open(fname, "rb") as handle:
        read_chunk = handle.raw.read
        while True:
            chunk = read_chunk(chunk_size)
            if not chunk:
                # An empty read marks the end of the file.
                break
            total += chunk.count(b"\n")
    return total
|
||||
|
||||
def number_to_words(line:str):
    """Replace every numeric token in *line* with its spelled-out form.

    A token is an optional ``$``, one or more digits, any further digits or
    dots, and an optional trailing ``%``. Each token is rewritten by
    ``num2words`` according to the first matching rule:

    * contains ``%``  -> cardinal followed by ``" percent"``
    * four leading digits whose digits-only value lies in 1800..2100
      -> converted with ``to='year'``
    * contains ``.``  -> each dot-separated part converted on its own and
      joined with ``" point "``
    * otherwise       -> plain cardinal

    Args:
        line: Text that may contain numbers.

    Returns:
        The rewritten text with whitespace runs collapsed to single spaces
        and leading/trailing whitespace removed.

    Raises:
        Whatever ``num2words`` raises for a token it cannot convert; nothing
        is caught here.
    """
    def transform(match):
        number_str = match.group()

        # Collapse runs of dots and drop a trailing dot so that only dots
        # sitting between digits remain.
        number_str = re.sub(r'\.+', '.', number_str)
        if number_str.endswith('.'):
            number_str = number_str[:-1]

        # The token always contains at least one digit, so this is non-empty.
        digits_only = re.sub(r'[^\d]+', '', number_str)

        # check if number is a percentage
        if '%' in number_str:
            # Remove the currency/percent signs (and a dot directly before them).
            new_number_str = num2words(re.sub(r'\.?[$%\b]+', '', number_str)) + ' percent'
        # check if number is a valid year
        elif re.match(r'\b^\d{4}\b', number_str) and 1800 <= int(digits_only) <= 2100:
            new_number_str = num2words(digits_only, to='year')
        # any token that still contains a '.' (e.g. 3.5 or 3.1.1)
        elif number_str.count('.') > 0:
            # NOTE(review): every part is converted independently, so leading
            # zeros of a fractional part (e.g. the "05" in 1.05) are presumably
            # dropped by num2words - confirm this is the intended reading.
            new_number_str = ' point '.join([num2words(re.sub(r'[^\d]+', '', num)) for num in number_str.split('.')])
        else:
            new_number_str = num2words(re.sub(r'[^\d.]+', ' ', number_str))

        # Pad with spaces so the words never fuse with neighbouring text.
        return ' ' + new_number_str + ' '

    new_line = re.sub(r'\$?[\d]+[\d\.]*%?', transform, line)
    new_line = re.sub(r'\s+', ' ', new_line)
    return new_line.strip()
|
||||
|
||||
def process_line(input_queue:Queue, output_queue:Queue, error_queue:Queue):
    """Consume raw lines forever, emitting a cleaned version of each one.

    For every line taken from *input_queue* exactly one item is produced:
    the cleaned text on *output_queue*, or the stripped original line on
    *error_queue* when the line contains ``...`` or its conversion fails.
    The function never returns.

    Args:
        input_queue: Source of raw text lines.
        output_queue: Receives successfully cleaned lines.
        error_queue: Receives lines that were skipped or failed.
    """
    while True:
        line = input_queue.get(True)

        # Strip surrounding whitespace, then drop the last character.
        # NOTE(review): presumably this removes the sentence-final period;
        # it removes the last character whatever it is - confirm input format.
        line = line.strip()[:-1]
        if '...' in line:
            error_queue.put(line)
            continue

        try:
            # Replace numbers with words
            new_line = number_to_words(line)
            # Keep only ASCII letters, digits, apostrophes and spaces.
            # (The range must be A-Z: "A-z" would also keep [ \ ] ^ _ `.)
            new_line = re.sub(r"[^a-zA-Z0-9' ]", '', new_line)
            new_line = re.sub(r'\s+', ' ', new_line)
        except Exception:
            # Conversion failures are recorded rather than killing the worker;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            error_queue.put(line)
        else:
            output_queue.put(new_line)
|
||||
|
||||
def write_queue_to_file(queue:Queue, file:str, wait_for_queue:Queue, drain_timeout:float=1.0):
    """Append every item of *queue* to *file*, one item per line.

    The function first polls once per second until *wait_for_queue* reports
    empty, then drains *queue*. Draining uses ``get`` with a timeout instead
    of ``empty()``, because ``empty()`` is not reliable for multiprocessing
    queues: items that were already put may not have reached the reader yet.
    The queue is considered exhausted once no item arrives within
    *drain_timeout* seconds.

    Args:
        queue: Queue of strings to write.
        file: Path of the file to append to.
        wait_for_queue: Queue that must be empty before writing starts.
        drain_timeout: Seconds to wait for a further item before stopping.
    """
    # Local import: the ``queue`` parameter shadows the module name here, and
    # only the exception class is needed.
    from queue import Empty

    while not wait_for_queue.empty():
        time.sleep(1)

    with open(file, 'a') as f:
        while True:
            try:
                item = queue.get(timeout=drain_timeout)
            except Empty:
                break
            f.write(item + '\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: stream SOURCE_FILE through a pool of workers that
    # spell out numbers and strip punctuation, then write the cleaned lines to
    # OUTPUT_FILE and the skipped/failed lines to ERROR_FILE.
    SOURCE_FILE = 'financial-reports-sec.txt'
    OUTPUT_FILE = 'financial-reports-sec_processed.txt'
    ERROR_FILE = 'financial-reports-sec_error.txt'
    # Stop waiting for results after this many consecutive seconds without any
    # new result, so a lost line cannot hang the script forever.
    MAX_IDLE_SECONDS = 60

    # Used only as the progress-bar total while reading.
    total_line_count = buf_count_newlines_gen(SOURCE_FILE)

    input_queue = Queue()
    output_queue = Queue()
    error_queue = Queue()
    # process_line is passed as the pool *initializer*: each of the 20 workers
    # enters its endless consume loop as soon as it starts.
    pool = Pool(20, process_line, (input_queue, output_queue, error_queue,))

    # Clear files
    open(OUTPUT_FILE, 'w').close()
    open(ERROR_FILE, 'w').close()

    # Count the lines actually submitted; this can differ from the newline
    # count when the last line has no trailing newline.
    submitted = 0
    with open(SOURCE_FILE, 'r') as fp:
        for line in tqdm(fp, total=total_line_count):
            input_queue.put(line)
            submitted += 1

    print('Finished reading file, processing...')
    # Every submitted line yields exactly one item on the output or the error
    # queue, so wait for that many results. Waiting only for the input queue
    # to empty would miss lines that workers are still processing.
    pb = tqdm(total=submitted)
    idle_seconds = 0
    while pb.n < submitted and idle_seconds < MAX_IDLE_SECONDS:
        time.sleep(1)
        finished = output_queue.qsize() + error_queue.qsize()
        idle_seconds = 0 if finished > pb.n else idle_seconds + 1
        pb.update(finished - pb.n)
    pb.close()

    print('Finished processing file, writing to file...')
    write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
    write_queue_to_file(error_queue, ERROR_FILE, input_queue)

    # The workers never leave their loop, so stop them explicitly.
    pool.terminate()
|
29
language_model/utils/download_hf_dataset.py
Normal file
29
language_model/utils/download_hf_dataset.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import os
|
||||
import argparse
|
||||
import datasets
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert_dataset_to_corpus(dataset, output_file):
    """Write the ``sentence`` field of every sample to a plain-text corpus.

    All splits of *dataset* are written to *output_file* in iteration order,
    one sentence per line. The file is overwritten if it already exists and
    its parent directory is created when missing.

    Args:
        dataset: Mapping of split name to an iterable of samples, where each
            sample provides a ``'sentence'`` string.
        output_file: Path of the corpus file to write.

    Raises:
        ValueError: If *output_file* is ``None`` or empty.
    """
    if not output_file:
        raise ValueError("Please provide a valid output file path")

    # The destination does not have to exist beforehand; create its folder.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w') as fw:
        for split in tqdm(dataset.keys(), desc="Writing dataset splits"):
            for sample in tqdm(dataset[split], desc=f"Writing {split} split"):
                fw.write(sample['sentence'] + '\n')
|
||||
|
||||
def main(args):
    """Load the requested dataset and dump it to a text corpus.

    Args:
        args: Parsed command-line namespace providing ``dataset_name``,
            ``dataset_config_name`` and ``output_file``.
    """
    convert_dataset_to_corpus(
        datasets.load_dataset(args.dataset_name, args.dataset_config_name),
        args.output_file,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the export.
    argparser = argparse.ArgumentParser()
    # Required: without a path the export cannot succeed, so let argparse
    # report the problem instead of failing later inside the conversion.
    argparser.add_argument("--output_file", type=str, required=True,
                           help="Path of the text corpus file to write")
    argparser.add_argument("--dataset_name", type=str, default="JanosAudran/financial-reports-sec",
                           help="Name of the dataset to load")
    argparser.add_argument("--dataset_config_name", type=str, default="all",
                           help="Dataset configuration to load")
    args = argparser.parse_args()
    main(args)
|
Reference in New Issue
Block a user