114 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			114 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import re
 | |
| import time
 | |
| 
 | |
| from tqdm import tqdm
 | |
| from num2words import num2words
 | |
| 
 | |
| from multiprocessing import Pool, Queue
 | |
| 
 | |
def buf_count_newlines_gen(fname):
    """Return the number of newline bytes (``b"\\n"``) contained in *fname*.

    The file is opened in binary mode and consumed through its raw
    (unbuffered) reader in 64 KiB chunks, so the whole file is never held in
    memory at once. A final line that lacks a terminating newline does not
    contribute to the count.
    """
    chunk_size = 2 ** 16
    newline_total = 0
    with open(fname, "rb") as handle:
        read_chunk = handle.raw.read
        chunk = read_chunk(chunk_size)
        # An empty (falsy) chunk signals end of file.
        while chunk:
            newline_total += chunk.count(b"\n")
            chunk = read_chunk(chunk_size)
    return newline_total
 | |
| 
 | |
# Patterns used by number_to_words, compiled once instead of on every match.
# All are raw strings so that regex escapes such as \d and \. are not treated
# as (invalid) Python string escapes.

# A numeric token: optional leading '$', one or more digits, then any mix of
# digits and dots, then an optional trailing '%'.
_NUMBER_TOKEN_RE = re.compile(r'\$?[\d]+[\d\.]*%?')
# A run of one or more dots.
_DOT_RUN_RE = re.compile(r'\.+')
# An optional dot followed by '$', '%' or backspace characters
# (inside a character class, \b means backspace).
_PERCENT_STRIP_RE = re.compile(r'\.?[$%\b]+')
# Token that starts with exactly four digits followed by a word boundary.
_FOUR_DIGIT_PREFIX_RE = re.compile(r'\b^\d{4}\b')
# Everything that is not a digit.
_NON_DIGIT_RE = re.compile(r'[^\d]+')
# Everything that is neither a digit nor a dot.
_NON_DIGIT_OR_DOT_RE = re.compile(r'[^\d.]+')
# A run of whitespace.
_WHITESPACE_RE = re.compile(r'\s+')


def number_to_words(line: str):
    """Return *line* with every numeric token replaced by its spelled-out form.

    A numeric token is an optional ``$``, digits, optional further digits and
    dots, and an optional trailing ``%``. Each token is normalised (runs of
    dots collapsed to one, a single trailing dot dropped) and then converted
    with ``num2words`` according to the first rule that applies:

    * contains ``%``: the ``$``/``%`` signs are removed and the rest is
      converted, followed by the word ``percent``;
    * starts with four digits and its digits form a number between 1800 and
      2100 inclusive: converted with ``to='year'``;
    * contains at least one dot: each dot-separated segment is converted on
      its own (digits only) and the results are joined with ``point``;
    * otherwise: non-digit characters are replaced by spaces and the result
      is converted as a single number.

    The replacement is padded with spaces; afterwards all whitespace runs in
    the line are collapsed to a single space and the line is stripped.

    Exceptions raised by ``num2words`` for tokens it cannot parse are not
    caught here and propagate to the caller.
    """
    def transform(match):
        number_str = _DOT_RUN_RE.sub('.', match.group())
        if number_str.endswith('.'):
            number_str = number_str[:-1]

        if '%' in number_str:
            # Percentage.
            new_number_str = num2words(_PERCENT_STRIP_RE.sub('', number_str)) + ' percent'
        elif (_FOUR_DIGIT_PREFIX_RE.match(number_str)
                and 1800 <= int(_NON_DIGIT_RE.sub('', number_str)) <= 2100):
            # Plausible calendar year.
            new_number_str = num2words(_NON_DIGIT_RE.sub('', number_str), to='year')
        elif number_str.count('.') > 0:
            # One or more dots (e.g. 3.1 or 3.1.1): every segment is read as
            # a separate whole number.
            new_number_str = ' point '.join(
                [num2words(_NON_DIGIT_RE.sub('', num)) for num in number_str.split('.')]
            )
        else:
            new_number_str = num2words(_NON_DIGIT_OR_DOT_RE.sub(' ', number_str))
        # Pad so the words never fuse with neighbouring text; the extra
        # whitespace is collapsed below.
        return ' ' + new_number_str + ' '

    new_line = _NUMBER_TOKEN_RE.sub(transform, line)
    new_line = _WHITESPACE_RE.sub(' ', new_line)
    return new_line.strip()
 | |
| 
 | |
# Any character that is not an ASCII letter, a digit, an apostrophe or a space.
_DISALLOWED_CHAR_RE = re.compile(r"[^a-zA-Z0-9' ]")
# A run of whitespace.
_SPACE_RUN_RE = re.compile(r'\s+')


def process_line(input_queue: Queue, output_queue: Queue, error_queue: Queue):
    """Worker loop: normalise lines from *input_queue* until the process ends.

    For every line taken from *input_queue* exactly one item is produced:

    * the cleaned line on *output_queue*, or
    * the (stripped, shortened) line on *error_queue* when it contains
      ``...`` or when converting it raises an exception.

    Cleaning spells out numbers via ``number_to_words``, removes every
    character other than ASCII letters, digits, apostrophes and spaces, and
    collapses whitespace runs to a single space.

    The loop never returns; the function runs until its process is stopped.
    """
    while True:
        line = input_queue.get(True)

        # Strip surrounding whitespace, then drop the last remaining character
        # (presumably the sentence-final period -- TODO confirm against the
        # input format).
        line = line.strip()[:-1]
        if '...' in line:
            error_queue.put(line)
            continue

        try:
            # Replace numbers with words
            new_line = number_to_words(line)
            new_line = _DISALLOWED_CHAR_RE.sub('', new_line)
            new_line = _SPACE_RUN_RE.sub(' ', new_line)
        except Exception:
            # Conversion failed for this line; record it and carry on.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            error_queue.put(line)
        else:
            output_queue.put(new_line)
 | |
| 
 | |
def write_queue_to_file(queue: Queue, file: str, wait_for_queue: Queue,
                        drain_timeout: float = 0.5):
    """Append every item of *queue* to *file*, one item per line.

    Args:
        queue: Queue of strings to write. Items are removed as they are
            written.
        file: Path of the file to append to.
        wait_for_queue: Writing does not start until this queue reports
            empty; it is polled once per second.
        drain_timeout: Seconds to wait for a further item before treating
            *queue* as exhausted.

    The queue is drained with timed ``get`` calls instead of checking
    ``queue.empty()``: for multiprocessing queues ``empty()`` can report
    True while items put by other processes are still in transit, which
    would end the drain early and lose those items.
    """
    # Imported here so the block does not depend on a file-level import; the
    # parameter named `queue` does not interfere with importing the module.
    from queue import Empty

    while not wait_for_queue.empty():
        time.sleep(1)
    with open(file, 'a') as f:
        while True:
            try:
                item = queue.get(timeout=drain_timeout)
            except Empty:
                break
            f.write(item + '\n')
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    SOURCE_FILE = 'financial-reports-sec.txt'
    OUTPUT_FILE = 'financial-reports-sec_processed.txt'
    ERROR_FILE = 'financial-reports-sec_error.txt'
    WORKER_COUNT = 20
    # Stop waiting for results if none has arrived for this many seconds, so
    # a lost result (e.g. a worker that died mid-line) cannot hang the script.
    STALL_TIMEOUT = 60

    # Newline count of the source; only used as the read progress-bar total.
    total_line_count = buf_count_newlines_gen(SOURCE_FILE)

    input_queue = Queue()
    output_queue = Queue()
    error_queue = Queue()
    # process_line is installed as the pool *initializer*: every worker
    # enters its queue-consuming loop on start-up and no tasks are submitted
    # to the pool itself.
    pool = Pool(WORKER_COUNT, process_line, (input_queue, output_queue, error_queue,))

    # Clear files
    open(OUTPUT_FILE, 'w').close()
    open(ERROR_FILE, 'w').close()

    # Feed every source line to the workers and remember how many were sent.
    submitted = 0
    with open(SOURCE_FILE, 'r') as fp:
        for line in tqdm(fp, total=total_line_count):
            input_queue.put(line)
            submitted += 1

    print('Finished reading file, processing...')
    # process_line emits exactly one item (output or error) per input line,
    # so processing is complete once that many results exist. An empty input
    # queue is not sufficient: lines already taken by workers may still be
    # in progress.
    pb = tqdm(total=submitted)
    last_progress = time.monotonic()
    while True:
        finished = output_queue.qsize() + error_queue.qsize()
        if finished > pb.n:
            pb.update(finished - pb.n)
            last_progress = time.monotonic()
        if finished >= submitted:
            break
        if time.monotonic() - last_progress > STALL_TIMEOUT:
            print('No progress for %d seconds, continuing with %d of %d lines.'
                  % (STALL_TIMEOUT, finished, submitted))
            break
        time.sleep(1)
    pb.close()

    print('Finished processing file, writing to file...')
    write_queue_to_file(output_queue, OUTPUT_FILE, input_queue)
    write_queue_to_file(error_queue, ERROR_FILE, input_queue)

    # The workers never return from their initializer loop, so they have to
    # be stopped explicitly; close() alone would leave them running.
    pool.terminate()
    pool.join()
