# b2txt25/data_analyse/pure_python_check.py

#!/usr/bin/env python3

import pickle
from pathlib import Path

def _load_pickled_batches(dataset_file):
    """Read every pickled batch stored back-to-back in *dataset_file*.

    Returns a single flat list containing the items of all batches.
    """
    trials = []
    with open(dataset_file, 'rb') as f:
        while True:
            # Only the load itself may signal end-of-stream; keep the try minimal.
            try:
                batch = pickle.load(f)
            except EOFError:
                break
            trials.extend(batch)
    return trials


def _scan_temporal_order(trials, max_samples=5):
    """Count alignment segments and those whose end_time precedes start_time.

    Returns a tuple ``(total_segments, temporal_errors, sample_outputs)`` where
    ``sample_outputs`` holds descriptions of at most *max_samples* violations.
    """
    total_segments = 0
    temporal_errors = 0
    sample_outputs = []

    for i, trial in enumerate(trials):
        if 'alignment_info' not in trial:
            continue

        alignment_info = trial['alignment_info']
        if alignment_info is None:
            # A trial with an explicit empty marker has nothing to verify.
            continue

        for phoneme, start_time, end_time, _confidence in alignment_info:
            total_segments += 1

            # A segment is invalid only when it ends before it starts;
            # zero-length segments (end == start) are accepted.
            if end_time < start_time:
                temporal_errors += 1
                if len(sample_outputs) < max_samples:
                    sample_outputs.append(
                        f"Trial {i}: '{phoneme}' {start_time}->{end_time}"
                    )

    return total_segments, temporal_errors, sample_outputs


def check_dataset(dataset_file=None):
    """Check that every phoneme segment in a pickled dataset is temporally ordered.

    The file is expected to contain one or more pickled batches written
    back-to-back; each batch is an iterable of trials. Trials that carry an
    ``'alignment_info'`` entry are scanned as
    ``(phoneme, start_time, end_time, confidence)`` tuples and every segment
    with ``end_time < start_time`` is counted as an error. Progress and
    results are printed to stdout.

    Args:
        dataset_file: Path (str or ``Path``) of the pickle file to check.
            When omitted, the historical default
            ``phoneme_segmented_data/ctc_results_20251009_000024.pkl`` is used.

    Returns:
        ``True`` when no temporal ordering error was found, ``False`` when at
        least one was found, the file is missing, or the data could not be
        loaded or analysed.
    """
    if dataset_file is None:
        dataset_file = "phoneme_segmented_data/ctc_results_20251009_000024.pkl"
    dataset_file = Path(dataset_file)

    if not dataset_file.exists():
        print(f"❌ File not found: {dataset_file}")
        return False

    print(f"📁 Dataset: {dataset_file}")
    print(f"📏 Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    # SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    # Only run this check on dataset files from a trusted source.
    try:
        all_trials = _load_pickled_batches(dataset_file)
    except Exception as e:
        # Unpickling can raise almost any exception type, so this boundary
        # stays broad; the function reports the failure instead of raising.
        print(f"❌ Error loading dataset: {e}")
        return False

    print(f"✅ Loaded {len(all_trials)} total trials")

    try:
        total_segments, temporal_errors, sample_outputs = _scan_temporal_order(all_trials)
    except (TypeError, ValueError, KeyError, AttributeError) as e:
        # Malformed trials/segments are an analysis problem, not a load problem.
        print(f"❌ Error analyzing dataset: {e}")
        return False

    print("📊 Analysis Results:")
    print(f"  Total phoneme segments: {total_segments}")
    print(f"  Temporal ordering errors: {temporal_errors}")

    if total_segments == 0:
        # The check passes vacuously; make that visible to the reader.
        print("  ⚠️ No phoneme segments found - nothing was verified")

    if temporal_errors > 0:
        # temporal_errors > 0 implies total_segments > 0, so the division is safe.
        error_rate = (temporal_errors / total_segments) * 100
        print(f"  Error rate: {error_rate:.3f}%")
        print("  Sample errors:")
        for error in sample_outputs:
            print(f"    {error}")

    return temporal_errors == 0

if __name__ == "__main__":
    # check_dataset() returns False both when ordering violations are found
    # and when the dataset file is missing or cannot be loaded, so the
    # failure message below must not claim a specific cause.
    success = check_dataset()

    if success:
        print("\n🎉 VERIFICATION SUCCESS!")
        print("All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print("\n❌ VERIFICATION FAILED!")
        print("Dataset could not be checked or temporal ordering issues were detected (see messages above)")
        # Signal the failure to shells / CI through a non-zero exit status.
        raise SystemExit(1)