76 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			76 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| 
 | |
| import pickle
 | |
| from pathlib import Path
 | |
| 
 | |
# Default location of the pickled CTC alignment results checked by check_dataset().
_DEFAULT_DATASET_FILE = Path("phoneme_segmented_data/ctc_results_20251009_000024.pkl")


def _load_trials(dataset_file):
    """Read every pickled batch from *dataset_file* and return all trials as one list."""
    trials = []
    # NOTE(security): unpickling can execute arbitrary code. Only run this
    # against dataset files produced by a trusted pipeline.
    with open(dataset_file, "rb") as f:
        # The file holds several pickled batches written back to back, so keep
        # loading until the stream is exhausted.
        while True:
            try:
                batch = pickle.load(f)
            except EOFError:
                break
            trials.extend(batch)
    return trials


def _scan_segments(trials, max_samples=5):
    """Count phoneme segments and those whose end_time precedes start_time.

    Returns a ``(total_segments, temporal_errors, sample_outputs)`` tuple where
    ``sample_outputs`` describes at most *max_samples* of the offending segments.
    """
    total_segments = 0
    temporal_errors = 0
    sample_outputs = []

    for i, trial in enumerate(trials):
        # Trials without alignment data contribute no segments.
        if 'alignment_info' not in trial:
            continue

        for phoneme, start_time, end_time, _confidence in trial['alignment_info']:
            total_segments += 1

            if end_time < start_time:
                temporal_errors += 1
                if len(sample_outputs) < max_samples:
                    sample_outputs.append(f"Trial {i}: '{phoneme}' {start_time}->{end_time}")

    return total_segments, temporal_errors, sample_outputs


def check_dataset(dataset_file=None):
    """Check that every phoneme segment in the dataset ends at or after its start.

    Args:
        dataset_file: Path (``str`` or ``Path``) of the pickle file to check.
            When omitted, the default CTC results file is used.

    Returns:
        ``True`` when the file was loaded and no segment has
        ``end_time < start_time``; ``False`` when the file is missing, cannot
        be loaded, contains malformed alignment entries, or has at least one
        such segment. Progress and results are printed to stdout.
    """
    path = _DEFAULT_DATASET_FILE if dataset_file is None else Path(dataset_file)

    if not path.exists():
        print(f"❌ File not found: {path}")
        return False

    print(f"📁 Dataset: {path}")
    print(f"📏 Size: {path.stat().st_size / (1024*1024):.1f} MB")

    try:
        all_trials = _load_trials(path)
    except Exception as e:
        # Unpickling may raise almost any exception type (UnpicklingError,
        # AttributeError, ImportError, ...), so the broad catch is deliberate
        # at this script-level boundary.
        print(f"❌ Error loading dataset: {e}")
        return False

    print(f"✅ Loaded {len(all_trials)} total trials")

    try:
        total_segments, temporal_errors, sample_outputs = _scan_segments(all_trials)
    except (TypeError, ValueError) as e:
        # A trial or alignment entry does not have the expected
        # (phoneme, start_time, end_time, confidence) shape.
        print(f"❌ Error analyzing dataset: {e}")
        return False

    print("📊 Analysis Results:")
    print(f"  Total phoneme segments: {total_segments}")
    print(f"  Temporal ordering errors: {temporal_errors}")

    if temporal_errors > 0:
        # total_segments is at least temporal_errors here, so it is non-zero.
        error_rate = (temporal_errors / total_segments) * 100
        print(f"  Error rate: {error_rate:.3f}%")
        print("  Sample errors:")
        for error in sample_outputs:
            print(f"    {error}")

    return temporal_errors == 0
 | |
| 
 | |
if __name__ == "__main__":
    # Run the check against the default dataset and report the outcome.
    success = check_dataset()

    if success:
        print("\n🎉 VERIFICATION SUCCESS!")
        print("All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print("\n❌ VERIFICATION FAILED!")
        # check_dataset() also returns False when the file is missing or
        # unreadable, so do not claim ordering issues unconditionally.
        print("Temporal ordering issues detected, or the dataset could not be read (see above)")

    # Propagate the result as the process exit status so shell scripts and CI
    # jobs can detect a failed verification.
    raise SystemExit(0 if success else 1)
