114 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			114 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import re | ||
|  | import time | ||
|  | 
 | ||
|  | from tqdm import tqdm | ||
|  | from num2words import num2words | ||
|  | 
 | ||
|  | from multiprocessing import Pool, Queue | ||
|  | 
 | ||
def buf_count_newlines_gen(fname, chunk_size=2 ** 16):
    """Count the newline bytes in a file without loading it into memory.

    The file is opened in binary mode and read straight from the raw
    (unbuffered) stream, ``chunk_size`` bytes at a time; the newline bytes
    found in each chunk are summed.  A final line that has no trailing
    newline is therefore not counted.

    Args:
        fname: Path of the file to scan.
        chunk_size: Number of bytes requested per read.  Defaults to 64 KiB.

    Returns:
        The number of newline bytes in the file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # read(0) would return b"" immediately and silently report 0 lines.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    count = 0
    with open(fname, "rb") as f:
        read = f.raw.read
        while True:
            chunk = read(chunk_size)
            if not chunk:
                break
            count += chunk.count(b"\n")
    return count
|  | 
 | ||
def number_to_words(line: str):
    """Replace every numeric token in ``line`` with its spelled-out form.

    A token is an optional ``$``, one or more digits, any further digits or
    dots, and an optional trailing ``%``.  Each token is rewritten by
    ``num2words`` according to the first rule that applies:

    * contains ``%``            -> cardinal followed by ``" percent"``;
    * exactly four leading digits whose digits form a number in
      1800..2100               -> read as a year;
    * contains a ``.``          -> every dot-separated part is spelled out on
      its own and the parts are joined with ``" point "``;
    * otherwise                 -> plain cardinal.

    Runs of whitespace in the result are collapsed to a single space and the
    result is stripped.

    Args:
        line: Text to convert.

    Returns:
        The converted text.

    Raises:
        Whatever ``num2words`` raises for a token it cannot parse; nothing is
        caught here.
    """
    def transform(match):
        # Collapse runs of dots ("3..5" -> "3.5") and drop a trailing dot,
        # which is normally sentence punctuation rather than a decimal point.
        token = re.sub(r'\.+', '.', match.group())
        if token.endswith('.'):
            token = token[:-1]

        digits_only = re.sub(r'[^\d]+', '', token)

        if '%' in token:
            # Strip the currency/percent signs (and a dot left dangling
            # directly before them) before handing the number over.
            words = num2words(re.sub(r'\.?[$%]+', '', token)) + ' percent'
        elif re.match(r'\d{4}\b', token) and 1800 <= int(digits_only) <= 2100:
            words = num2words(digits_only, to='year')
        elif '.' in token:
            # Also covers multi-dot tokens such as "3.1.1".
            words = ' point '.join(
                num2words(re.sub(r'[^\d]+', '', part))
                for part in token.split('.')
            )
        else:
            words = num2words(re.sub(r'[^\d.]+', ' ', token))

        # Pad with spaces so the words never fuse with neighbouring text;
        # surplus whitespace is collapsed below.
        return ' ' + words + ' '

    new_line = re.sub(r'\$?[\d]+[\d\.]*%?', transform, line)
    new_line = re.sub(r'\s+', ' ', new_line)
    return new_line.strip()
|  | 
 | ||
def process_line(input_queue: Queue, output_queue: Queue, error_queue: Queue):
    """Worker loop: normalise lines from ``input_queue`` until the process dies.

    For every line taken from ``input_queue``:

    1. surrounding whitespace is stripped and the last remaining character is
       dropped;
    2. a line containing ``...`` is sent unchanged to ``error_queue``;
    3. otherwise numbers are spelled out with ``number_to_words``, every
       character other than ASCII letters, digits, apostrophes and spaces is
       removed, whitespace is collapsed, and the result is put on
       ``output_queue``;
    4. if step 3 raises, the line from step 1 is put on ``error_queue``.

    Exactly one item is therefore put on ``output_queue`` or ``error_queue``
    for each item consumed.  The function never returns.

    Args:
        input_queue: Source of raw text lines.
        output_queue: Destination for successfully normalised lines.
        error_queue: Destination for lines that were skipped or failed.
    """
    while True:
        line = input_queue.get(True)

        # NOTE(review): the slice drops the final character unconditionally,
        # presumably a sentence-ending period -- confirm against the data.
        line = line.strip()[:-1]
        if '...' in line:
            error_queue.put(line)
            continue

        try:
            # Replace numbers with words
            new_line = number_to_words(line)
            # "A-Z", not "A-z": the latter also matches [ \ ] ^ _ and `.
            new_line = re.sub(r"[^a-zA-Z0-9' ]", '', new_line)
            new_line = re.sub(r'\s+', ' ', new_line)
            output_queue.put(new_line)
        except Exception:
            # Best effort: keep the worker alive and record the offending
            # line, but let KeyboardInterrupt/SystemExit propagate.
            error_queue.put(line)
|  | 
 | ||
def write_queue_to_file(queue: Queue, file: str, wait_for_queue: Queue,
                        drain_timeout: float = 1.0):
    """Append every item of ``queue`` to ``file``, one item per line.

    The function first polls once per second until ``wait_for_queue``
    reports empty, then opens ``file`` in append mode and writes items from
    ``queue`` until no further item arrives within ``drain_timeout`` seconds.

    Args:
        queue: Queue of strings to write.
        file: Path of the file to append to.
        wait_for_queue: Queue that must be empty before writing starts.
        drain_timeout: Seconds to wait for another item before ``queue`` is
            considered drained.

    Returns:
        None.
    """
    # Imported locally because the ``queue`` parameter shadows the module
    # name inside this function.
    from queue import Empty

    while not wait_for_queue.empty():
        time.sleep(1)

    with open(file, 'a') as f:
        while True:
            # ``empty()`` is unreliable for multiprocessing queues: items may
            # still be in transit from a worker's feeder thread, so a
            # momentarily empty pipe does not mean the queue is drained.
            # Waiting with a timeout gives those items time to arrive.
            try:
                item = queue.get(timeout=drain_timeout)
            except Empty:
                break
            f.write(item + '\n')
|  | 
 | ||
|  | 
 | ||
# Script entry point: feed every line of SOURCE_FILE to a pool of workers
# running process_line, wait for all results, then append the normalised
# lines to OUTPUT_FILE and the rejected lines to ERROR_FILE.
if __name__ == '__main__':
    SOURCE_FILE = 'financial-reports-sec.txt'
    OUTPUT_FILE = 'financial-reports-sec_processed.txt'
    ERROR_FILE = 'financial-reports-sec_error.txt'

    # Only used as the progress-bar total for the reading phase.
    total_line_count = buf_count_newlines_gen(SOURCE_FILE)

    input_queue = Queue()
    output_queue = Queue()
    error_queue = Queue()
    # process_line is passed as the pool *initializer*: each of the 20
    # workers enters its endless consume loop as soon as it starts, so no
    # tasks are ever submitted to the pool itself.
    pool = Pool(20, process_line, (input_queue, output_queue, error_queue,))

    # Clear files
    open(OUTPUT_FILE, 'w').close()
    open(ERROR_FILE, 'w').close()

    # Count what is actually submitted; the newline count above can be one
    # short when the last line has no trailing newline.
    submitted = 0
    with open(SOURCE_FILE, 'r') as fp:
        for line in tqdm(fp, total=total_line_count):
            input_queue.put(line)
            submitted += 1

    print('Finished reading file, processing...')
    # Every consumed line yields exactly one item on output_queue or
    # error_queue, so processing is complete only when their combined size
    # reaches the number of submitted lines.  An empty input queue is not
    # enough: workers may still hold lines they have not finished.
    pb = tqdm(total=submitted)
    while True:
        processed = output_queue.qsize() + error_queue.qsize()
        pb.update(processed - pb.n)
        if processed >= submitted:
            break
        time.sleep(1)
    pb.close()

    print('Finished processing file, writing to file...')
    write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
    write_queue_to_file(error_queue, ERROR_FILE, input_queue)

    # The workers never leave their initializer loop, so close() alone would
    # not stop them; terminate them explicitly and reap the processes.
    pool.terminate()
    pool.join()