29 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import argparse
 | |
| import datasets
 | |
| 
 | |
| from tqdm import tqdm
 | |
| 
 | |
| 
 | |
def convert_dataset_to_corpus(dataset, output_file):
    """Write every sample's ``sentence`` field to a plain-text corpus file.

    Each split of ``dataset`` is visited in turn and one sentence is written
    per line, so the resulting file holds all splits concatenated.

    Args:
        dataset: Mapping of split name to an iterable of samples, where each
            sample is a mapping with a ``'sentence'`` string entry.
        output_file: Path of the text file to create (overwritten if it
            already exists). Missing parent directories are created.

    Raises:
        ValueError: If ``output_file`` is ``None`` or empty.
    """
    # Explicit raise rather than ``assert``: asserts are stripped under ``-O``.
    if not output_file:
        raise ValueError("Please provide an valid output file path")

    # The destination need not exist yet; only make sure its folder does.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as fw:
        for split, samples in tqdm(dataset.items(), desc="Writing dataset splits"):
            for sample in tqdm(samples, desc=f"Writing {split} split"):
                fw.write(sample['sentence'] + '\n')
 | |
| 
 | |
def main(args):
    """Load the requested dataset and hand it to the corpus converter.

    ``args`` must expose ``dataset_name``, ``dataset_config_name`` and
    ``output_file`` attributes.
    """
    # Dataset name and configuration both come from the parsed arguments.
    convert_dataset_to_corpus(
        datasets.load_dataset(args.dataset_name, args.dataset_config_name),
        args.output_file,
    )
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Command-line entry point: parse the options, then run the conversion.
    argparser = argparse.ArgumentParser(
        description="Convert a dataset loaded with datasets.load_dataset into a text corpus."
    )
    # Required: a missing path can never succeed downstream, so let argparse
    # report it immediately with a usage message.
    argparser.add_argument(
        "--output_file",
        type=str,
        required=True,
        help="Output file path passed to the corpus conversion.",
    )
    argparser.add_argument(
        "--dataset_name",
        type=str,
        default="JanosAudran/financial-reports-sec",
        help="Dataset identifier passed to datasets.load_dataset.",
    )
    argparser.add_argument(
        "--dataset_config_name",
        type=str,
        default="all",
        help="Dataset configuration name passed to datasets.load_dataset.",
    )
    args = argparser.parse_args()
    main(args)
