Files
b2txt25/data_analyse/verify_results.py
2025-10-12 09:11:32 +08:00

159 lines
4.9 KiB
Python

#!/usr/bin/env python3
import pickle
import numpy as np
from pathlib import Path
def _count_temporal_errors(data):
    """Tally trials, phoneme segments and reversed segments in *data*.

    An entry counts as a trial when its key (converted with ``str``) contains
    ``'trial_'``.  Only trial values that are dicts holding a
    ``'phoneme_segments'`` entry contribute segments.  A dict segment is an
    error when its ``end_time`` is smaller than its ``start_time``; a missing
    bound is treated as 0.  Non-dict segments are counted but not checked.

    Returns:
        A ``(trial_count, total_segments, temporal_errors)`` tuple.
    """
    trial_count = 0
    total_segments = 0
    temporal_errors = 0
    for key, value in data.items():
        if 'trial_' not in str(key):
            continue
        trial_count += 1
        if not (isinstance(value, dict) and 'phoneme_segments' in value):
            continue
        segments = value['phoneme_segments']
        total_segments += len(segments)
        temporal_errors += sum(
            1
            for seg in segments
            if isinstance(seg, dict)
            and seg.get('end_time', 0) < seg.get('start_time', 0)
        )
    return trial_count, total_segments, temporal_errors


def examine_latest_dataset(data_dir="phoneme_segmented_data",
                           filename="ctc_results_20251009_000024.pkl"):
    """Load a pickled results dataset and check its segments' time ordering.

    Prints a short report (file, size, keys, counts) and returns whether the
    dataset is free of segments whose ``end_time`` precedes ``start_time``.

    Args:
        data_dir: Directory that holds the result pickles.  Accepts a string
            or a ``Path``.
        filename: Name of the pickle inside ``data_dir``.  The default is the
            snapshot name this script originally hard-coded.  Pass ``None``
            to pick the last ``ctc_results_*.pkl`` in name order (this
            assumes the suffix in the file name sorts chronologically, as
            the default name's timestamp-like suffix suggests).

    Returns:
        ``True`` when the file loads as a dict and contains no temporal
        ordering errors; ``False`` when the file is missing, cannot be
        loaded or inspected, is not a dict, or contains at least one error.
    """
    data_dir = Path(data_dir)
    if filename is None:
        candidates = sorted(data_dir.glob("ctc_results_*.pkl"))
        if not candidates:
            print(f"❌ File not found: {data_dir / 'ctc_results_*.pkl'}")
            return False
        latest_file = candidates[-1]
    else:
        latest_file = data_dir / filename

    if not latest_file.exists():
        print(f"❌ File not found: {latest_file}")
        return False

    print(f"📁 Loading dataset: {latest_file}")
    print(f"📏 File size: {latest_file.stat().st_size / (1024*1024):.1f} MB")

    # The broad handler is deliberate: this is a best-effort report, so any
    # failure while loading or inspecting the data is printed and turned
    # into a failed check instead of a traceback.
    try:
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file -- only point this at datasets from a trusted pipeline.
        with open(latest_file, 'rb') as f:
            data = pickle.load(f)
        print("✅ Successfully loaded dataset")

        if not isinstance(data, dict):
            print(f"❌ Unexpected data format: {type(data)}")
            return False

        print(f"📊 Dataset keys: {list(data.keys())}")
        trial_count, total_segments, temporal_errors = (
            _count_temporal_errors(data)
        )
        print(f"🔢 Trials processed: {trial_count}")
        print(f"🔤 Total phoneme segments: {total_segments}")
        print(f"⏰ Temporal ordering errors: {temporal_errors}")

        if temporal_errors == 0:
            print("✅ SUCCESS: No temporal ordering bugs found!")
            return True
        print(f"❌ FAILED: Found {temporal_errors} temporal ordering bugs!")
        return False
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return False
def _build_segments(sequence, path):
    """Collapse aligned (phoneme, time index) pairs into contiguous segments.

    ``sequence`` and ``path`` are walked in lockstep; a new segment starts
    whenever the phoneme differs from the current one.  Each segment is a
    dict with ``'phoneme'``, ``'start_time'`` and ``'end_time'``, where the
    end is the last time index that was actually paired with that phoneme.

    Returns:
        The list of segment dicts, in the order they were encountered.
    """
    segments = []
    current_phoneme = None
    start_time = None
    last_time = None
    for phoneme, time_idx in zip(sequence, path):
        if phoneme != current_phoneme:
            # Close the running segment at the previous aligned time index.
            if current_phoneme is not None:
                segments.append({
                    'phoneme': current_phoneme,
                    'start_time': start_time,
                    'end_time': last_time,
                })
            current_phoneme = phoneme
            start_time = time_idx
        last_time = time_idx
    # Close the final segment at the last *aligned* index, so a path longer
    # than the sequence cannot stretch it past the data zip() consumed.
    if current_phoneme is not None:
        segments.append({
            'phoneme': current_phoneme,
            'start_time': start_time,
            'end_time': last_time,
        })
    return segments


def test_alignment_logic():
    """Run the built-in alignment cases and report whether all of them pass.

    For each case the segments are rebuilt from its ``sequence`` / ``path``
    pair and printed.  A segment is flagged when it ends before it starts or
    when it starts before the previous segment ended (the "segments must not
    overlap" property the cases are meant to exercise).

    Returns:
        ``True`` when no segment in any case is flagged, ``False`` otherwise.
    """
    print("\n=== Testing Alignment Logic ===")

    # Hand-written cases: each phoneme in `sequence` occurs at the matching
    # time index in `path`.
    test_cases = [
        {
            "sequence": [1, 2, 1],
            "path": [0, 1, 2],
            "description": "Simple case",
        },
        {
            "sequence": [1, 1, 2],
            "path": [0, 1, 3],
            "description": "Repeated phoneme",
        },
    ]

    all_valid = True
    for case in test_cases:
        print(f"\nTesting: {case['description']}")
        segments = _build_segments(case['sequence'], case['path'])

        case_valid = True
        previous_end = None
        for seg in segments:
            start = seg['start_time']
            end = seg['end_time']
            reversed_segment = end < start
            overlaps_previous = previous_end is not None and start < previous_end
            ordered = not (reversed_segment or overlaps_previous)
            status = "" if ordered else "❌ BUG!"
            if not ordered:
                case_valid = False
                all_valid = False
            print(f" Phoneme {seg['phoneme']}: {start}-{end} {status}")
            previous_end = end
        print(f" Result: {'✅ PASS' if case_valid else '❌ FAIL'}")
    return all_valid


# This is a script-level self-check that reports through its return value,
# not a pytest test.  Opt out of collection so that importing it into a test
# module does not make pytest run it (pytest objects to test functions that
# return a value).
test_alignment_logic.__test__ = False
if __name__ == "__main__":
    # Script entry point: run the dataset check, then the logic self-test,
    # and print one combined verdict.
    print("=== CTC Alignment Verification ===\n")

    # Dict literals evaluate in order, so the dataset check still runs first.
    outcomes = {
        "Dataset check": examine_latest_dataset(),
        "Logic test": test_alignment_logic(),
    }

    print("\n=== FINAL RESULTS ===")
    for label, passed in outcomes.items():
        verdict = "✅ PASS" if passed else "❌ FAIL"
        print(f"{label}: {verdict}")

    if all(outcomes.values()):
        print("\n🎉 VERIFICATION SUCCESSFUL: Bug fix appears to be working!")
    else:
        print("\n❌ VERIFICATION FAILED: Issues detected")