competition update

This commit is contained in:
nckcard
2025-07-02 12:18:09 -07:00
parent 9e17716a4a
commit 77dbcf868f
2615 changed files with 1648116 additions and 125 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,114 @@
import re
import time
from tqdm import tqdm
from num2words import num2words
from multiprocessing import Pool, Queue
def buf_count_newlines_gen(fname):
def _make_gen(reader):
while True:
b = reader(2 ** 16)
if not b: break
yield b
with open(fname, "rb") as f:
count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
return count
def number_to_words(line:str):
def transform(match):
number_str = match.group()
# print('\t' + number_str, end=' ')
number_str = re.sub(r'\.+', '.', number_str)
if number_str.endswith('.'):
number_str = number_str[:-1]
# check if number is a percentage
if '%' in number_str:
new_number_str = num2words(re.sub('\.?[$%\b]+', '', number_str)) + ' percent'
# check if number is a valid year
elif re.match(r'\b^\d{4}\b', number_str) and 1800 <= int(re.sub('[^\d]+', '', number_str)) <= 2100:
new_number_str = num2words(re.sub('[^\d]+', '', number_str), to='year')
# check if multiple '.' in number (e.g. 3.1.1)
elif number_str.count('.') > 0:
new_number_str = ' point '.join([num2words(re.sub('[^\d]+', '', num)) for num in number_str.split('.')])
else:
new_number_str = num2words(re.sub('[^\d.]+', ' ', number_str))
# print(new_number_str)
return ' ' + new_number_str + ' '
new_line = re.sub(r'\$?[\d]+[\d\.]*%?', transform, line)
new_line = re.sub(r'\s+', ' ', new_line)
return new_line.strip()
def process_line(input_queue:Queue, output_queue:Queue, error_queue:Queue):
while True:
line = input_queue.get(True)
line = line.strip()[:-1]
if '...' in line:
error_queue.put(line)
continue
try:
# Replace numbers with words
new_line = number_to_words(line)
new_line = re.sub(r'[^a-zA-z0-9\' ]', '', new_line)
new_line = re.sub(r'\s+', ' ', new_line)
output_queue.put(new_line)
except:
error_queue.put(line)
def write_queue_to_file(queue:Queue, file:str, wait_for_queue:Queue):
while not wait_for_queue.empty():
time.sleep(1)
with open(file, 'a') as f:
while not queue.empty():
f.write(queue.get() + '\n')
if __name__ == '__main__':
SOURCE_FILE = 'financial-reports-sec.txt'
OUTPUT_FILE = 'financial-reports-sec_processed.txt'
ERROR_FILE = 'financial-reports-sec_error.txt'
total_line_count = buf_count_newlines_gen(SOURCE_FILE)
input_queue = Queue()
output_queue = Queue()
error_queue = Queue()
pool = Pool(20, process_line, (input_queue, output_queue, error_queue,))
start_time = time.time()
# Clear files
open(OUTPUT_FILE, 'w').close()
open(ERROR_FILE, 'w').close()
pb = tqdm(total=total_line_count)
with open(SOURCE_FILE, 'r') as fp:
for i, line in enumerate(fp):
input_queue.put(line)
if i % 100 == 0 and i != 0:
pb.update(100)
pb.update(total_line_count % 100)
pb.close()
print('Finished reading file, processing...')
pb = tqdm(total=total_line_count)
while not input_queue.empty():
time.sleep(1)
pb.update(output_queue.qsize() + error_queue.qsize() - pb.n)
pb.update(output_queue.qsize() + error_queue.qsize() - pb.n)
pb.close()
print('Finished processing file, writing to file...')
write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
write_queue_to_file(error_queue, ERROR_FILE, input_queue)
pool.close()

View File

@@ -0,0 +1,29 @@
import os
import argparse
import datasets
from tqdm import tqdm
def convert_dataset_to_corpus(dataset, output_file):
assert output_file is not None and os.path.exists(output_file), "Please provide an valid output file path"
with open(os.path.join('corpus', 'financial-reports-sec.txt'), 'w') as fw:
for split in tqdm(dataset.keys(), desc="Writing dataset splits"):
for sample in tqdm(dataset[split], desc=f"Writing {split} split"):
fw.write(sample['sentence'] + '\n')
def main(args):
# Load the lite configuration of the dataset
raw_dataset = datasets.load_dataset(args.dataset_name, args.dataset_config_name)
convert_dataset_to_corpus(raw_dataset, args.output_file)
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--output_file", type=str, default=None)
argparser.add_argument("--dataset_name", type=str, default="JanosAudran/financial-reports-sec")
argparser.add_argument("--dataset_config_name", type=str, default="all")
args = argparser.parse_args()
# Load the lite configuration of the dataset
main(args)