87 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			87 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
#!/usr/bin/env python3
"""
Safely inspect a phoneme dataset file.
Handles large files and possible corruption.
"""
|  | 
 | |||
|  | import pickle | |||
|  | import os | |||
|  | 
 | |||
def _describe_value(value):
    """Return a short description of *value*: its shape, its length, or the value itself."""
    if hasattr(value, 'shape'):
        return f"shape {value.shape}"
    if hasattr(value, '__len__'):
        return f"length {len(value)}"
    return f"{value}"


def _segment_count(segments):
    """Return how many items *segments* holds, or 0 when it is None or has no length."""
    if segments is None:
        return 0
    # Use len() rather than truthiness: array-like containers with more than
    # one element refuse to be evaluated as a boolean.
    try:
        return len(segments)
    except TypeError:
        return 0


def _print_first_segment(data):
    """Print a field-by-field sample of the first segment of the first phoneme in *data*."""
    if not data:
        return

    first_phoneme = next(iter(data))
    segments = data[first_phoneme]
    if _segment_count(segments) == 0:
        return

    try:
        first_segment = segments[0]
    except (TypeError, KeyError, IndexError):
        # The container is sized but cannot be indexed by position
        # (e.g. a set or a dict without key 0); there is no sample to show.
        return

    print(f"\n=== 数据片段示例 (音素: {first_phoneme}) ===")
    if isinstance(first_segment, dict):
        for key, value in first_segment.items():
            print(f"  {key}: {_describe_value(value)}")
    else:
        # A non-dict segment has no fields to enumerate; describe it as a whole
        # instead of failing on a missing .items().
        print(f"  {type(first_segment).__name__}: {_describe_value(first_segment)}")


def safe_inspect_phoneme_dataset(file_path):
    """Print a summary of a pickled phoneme dataset, reporting errors instead of raising.

    The report contains the file size, the type of the unpickled object and,
    when that object is a dict, the number of keys (phonemes), the segment
    count for the first 20 phonemes, the total segment count and a sample of
    the first segment of the first phoneme.

    Note that ``pickle.load`` materialises the whole object, so the complete
    dataset is held in memory while the summary is produced.

    Args:
        file_path: Path of the pickle file to inspect.

    Returns:
        None. All results and error messages are written to standard output.
    """
    max_shown = 20  # cap on per-phoneme lines so the report stays readable

    print(f"检查文件: {file_path}")

    if not os.path.exists(file_path):
        print("文件不存在!")
        return

    # Basic file information.
    file_size = os.path.getsize(file_path)
    print(f"文件大小: {file_size / (1024*1024*1024):.2f} GB")

    try:
        with open(file_path, 'rb') as f:
            print("开始读取pickle文件...")
            # SECURITY: unpickling can execute arbitrary code; only inspect
            # files that come from a trusted source.
            data = pickle.load(f)

        print("文件读取成功!")
        print(f"数据类型: {type(data)}")

        if not isinstance(data, dict):
            print(f"数据不是字典格式: {type(data)}")
            if hasattr(data, '__len__'):
                print(f"数据长度: {len(data)}")
            return

        print("\n=== 数据集统计 ===")
        print(f"音素类型数量: {len(data)}")

        total_segments = 0
        for position, (phoneme, segments) in enumerate(data.items(), start=1):
            segment_count = _segment_count(segments)
            total_segments += segment_count
            # Only list the first few phonemes to avoid overly long output.
            if position <= max_shown:
                print(f"  {phoneme}: {segment_count} 个片段")

        if len(data) > max_shown:
            print(f"  ... 还有 {len(data) - max_shown} 个其他音素")

        print(f"\n总片段数: {total_segments}")

        _print_first_segment(data)

    except EOFError as e:
        # The pickle stream ended early: the file is truncated or empty.
        print(f"文件可能损坏或不完整: {e}")
    except pickle.UnpicklingError as e:
        print(f"Pickle解析错误: {e}")
    except Exception as e:
        # Last-resort boundary: this is a diagnostic tool and must not crash.
        print(f"读取文件时发生错误: {e}")
        print(f"错误类型: {type(e)}")
|  | 
 | |||
if __name__ == "__main__":
    import sys

    # Dataset inspected when no path is given on the command line; the
    # relative path is resolved against the current working directory.
    default_path = "../phoneme_segmented_data/phoneme_dataset_20251007_194413.pkl"

    # Optional first argument overrides the default dataset path.
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    safe_inspect_phoneme_dataset(file_path)