#!/usr/bin/env python3
"""
Create a phoneme classification dataset from segmented data (simplified version).

"Simplified" means that no ground-truth validation is performed on the segments.
"""

import pickle
import sys
from pathlib import Path

import numpy as np
import torch

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

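# Inputs and outputs (as implemented below):
#   - reads the newest phoneme_segmented_data/phoneme_dataset_*.pkl
#     (relative to the current working directory)
#   - reads neural features from data/hdf5_data_final/<session>/data_train.hdf5,
#     resolved relative to the parent of this script's directory
#   - writes phoneme_classification_dataset_simple_<timestamp>.pkl and
#     phoneme_classification_split_simple_<timestamp>.pkl into phoneme_segmented_data/
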
def load_neural_data_for_trial(session, trial_metadata):
    """Load neural features for a specific trial."""
    try:
        # Simplified approach - use h5py directly instead of the helper function
        import h5py

        # Locate the training file for this session
        data_dir = Path(__file__).parent.parent / "data" / "hdf5_data_final"
        train_file = data_dir / session / "data_train.hdf5"

        if not train_file.exists():
            return None

        # Load the HDF5 file directly
        with h5py.File(train_file, 'r') as f:
            if 'neural_features' in f:
                neural_features = f['neural_features'][:]

                # Find the matching trial by trial_num
                trial_num = trial_metadata.get('trial_num', -1)
                if 0 <= trial_num < len(neural_features):
                    return neural_features[trial_num]

    except Exception as e:
        print(f"Warning: Could not load neural data for {session}, "
              f"trial {trial_metadata.get('trial_num', 'unknown')}: {e}")

    return None

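# NOTE: load_neural_data_for_trial() reopens the session's HDF5 file and reads the
# full 'neural_features' array once per phoneme segment. If that becomes a
# bottleneck, one option (not used here) would be to cache the loaded array per
# session, e.g. with functools.lru_cache on a small "load whole session" helper.
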
def create_phoneme_classification_dataset():
    """Create a phoneme classification dataset from segmented data without validation"""

    # Load the latest phoneme dataset
    data_dir = Path("phoneme_segmented_data")
    dataset_files = list(data_dir.glob("phoneme_dataset_*.pkl"))

    if not dataset_files:
        print("No phoneme dataset files found!")
        return

    latest_dataset = max(dataset_files, key=lambda x: x.stat().st_mtime)
    print(f"Loading dataset: {latest_dataset.name}")

    with open(latest_dataset, 'rb') as f:
        phoneme_data = pickle.load(f)

    print(f"Loaded {len(phoneme_data)} phoneme types")

    # Create classification dataset
    classification_data = {
        'features': [],      # Neural features for each segment
        'labels': [],        # Phoneme labels
        'phoneme_to_id': {}, # Phoneme to numeric ID mapping
        'id_to_phoneme': {}, # Numeric ID to phoneme mapping
        'metadata': []       # Additional metadata for each sample
    }

    # Create phoneme to ID mapping
    unique_phonemes = sorted(phoneme_data.keys())
    for i, phoneme in enumerate(unique_phonemes):
        classification_data['phoneme_to_id'][phoneme] = i
        classification_data['id_to_phoneme'][i] = phoneme

    print(f"\nPhoneme mapping created for {len(unique_phonemes)} phonemes:")
    for i, phoneme in enumerate(unique_phonemes[:10]):  # Show first 10
        print(f"  {i:2d}: '{phoneme}'")
    if len(unique_phonemes) > 10:
        print(f"  ... and {len(unique_phonemes) - 10} more")

    # Processing statistics
    processing_stats = {
        'total_segments': 0,
        'successful_extractions': 0,
        'failed_extractions': 0,
        'sessions_processed': set(),
        'trials_processed': set()
    }

    print("\nExtracting neural features for each phoneme segment...")

    for phoneme, segments in phoneme_data.items():
        phoneme_id = classification_data['phoneme_to_id'][phoneme]
        print(f"Processing '{phoneme}' ({len(segments)} segments)...")

        for segment in segments:
            processing_stats['total_segments'] += 1

            # Get trial information
            session = segment['session']
            trial_num = segment.get('trial_num', -1)

            processing_stats['sessions_processed'].add(session)
            processing_stats['trials_processed'].add((session, trial_num))

            # Try to load neural data for this trial
            neural_features = load_neural_data_for_trial(session, segment)

            if neural_features is not None:
                # Extract the specific time segment
                start_time = int(segment['start_time'])
                end_time = int(segment['end_time'])

                # Ensure valid time range
                if start_time <= end_time < len(neural_features):
                    # Extract neural features for this time segment (inclusive of end_time)
                    segment_features = neural_features[start_time:end_time + 1]

                    # Convert to numpy array and handle different cases
                    if isinstance(segment_features, torch.Tensor):
                        segment_features = segment_features.numpy()
                    elif isinstance(segment_features, list):
                        segment_features = np.array(segment_features)

                    # For classification we need a fixed-size feature vector.
                    # Option 1: mean across time steps (discards temporal order but
                    # gives every segment the same dimensionality).
                    if len(segment_features.shape) == 2:  # (time, features)
                        feature_vector = np.mean(segment_features, axis=0)
                    elif len(segment_features.shape) == 1:  # Already 1D
                        feature_vector = segment_features
                    else:
                        print(f"Unexpected feature shape: {segment_features.shape}")
                        processing_stats['failed_extractions'] += 1
                        continue

                    # Add to dataset
                    classification_data['features'].append(feature_vector)
                    classification_data['labels'].append(phoneme_id)
                    classification_data['metadata'].append({
                        'phoneme': phoneme,
                        'session': session,
                        'trial_num': trial_num,
                        'trial_idx': segment.get('trial_idx', -1),
                        'start_time': start_time,
                        'end_time': end_time,
                        'duration': end_time - start_time + 1,
                        'confidence': segment.get('confidence', 0.0),
                        'corpus': segment.get('corpus', 'unknown'),
                        'block_num': segment.get('block_num', -1)
                    })

                    processing_stats['successful_extractions'] += 1
                else:
                    processing_stats['failed_extractions'] += 1
            else:
                processing_stats['failed_extractions'] += 1

            # Progress update
            if processing_stats['total_segments'] % 5000 == 0:
                print(f"  Processed {processing_stats['total_segments']} segments, "
                      f"extracted {processing_stats['successful_extractions']} features")

    print("\nDataset creation completed!")
    print(f"Total segments processed: {processing_stats['total_segments']}")
    print(f"Successful feature extractions: {processing_stats['successful_extractions']}")
    print(f"Failed extractions: {processing_stats['failed_extractions']}")
    print(f"Success rate: {processing_stats['successful_extractions'] / processing_stats['total_segments'] * 100:.1f}%")
    print(f"Sessions processed: {len(processing_stats['sessions_processed'])}")
    print(f"Unique trials processed: {len(processing_stats['trials_processed'])}")

    if processing_stats['successful_extractions'] == 0:
        print("No features were extracted. Check neural data availability.")
        return

    # Convert to numpy arrays
    classification_data['features'] = np.array(classification_data['features'])
    classification_data['labels'] = np.array(classification_data['labels'])

    print("\nFinal dataset shape:")
    print(f"Features: {classification_data['features'].shape}")
    print(f"Labels: {classification_data['labels'].shape}")

    # Show class distribution
    print("\nClass distribution:")
    unique_labels, counts = np.unique(classification_data['labels'], return_counts=True)
    for label_id, count in zip(unique_labels, counts):
        phoneme = classification_data['id_to_phoneme'][label_id]
        print(f"  {label_id:2d} ('{phoneme}'): {count:4d} samples")

    # Save the classification dataset
    timestamp = latest_dataset.name.split('_')[-1].replace('.pkl', '')
    output_file = f"phoneme_classification_dataset_simple_{timestamp}.pkl"
    output_path = data_dir / output_file

    # Add processing stats to the dataset
    classification_data['processing_stats'] = processing_stats

    with open(output_path, 'wb') as f:
        pickle.dump(classification_data, f)

    print(f"\nClassification dataset saved to: {output_file}")

    # Create a simple train/test split
    create_train_test_split(classification_data, data_dir, timestamp)

    return classification_data

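# For reference, the pickled classification dataset written above contains:
#   'features'        : np.ndarray, shape (n_samples, n_features), mean-pooled per segment
#   'labels'          : np.ndarray, shape (n_samples,), numeric phoneme IDs
#   'phoneme_to_id'   : dict mapping phoneme string -> ID
#   'id_to_phoneme'   : dict mapping ID -> phoneme string
#   'metadata'        : list of per-sample dicts (session, trial, timing, confidence, ...)
#   'processing_stats': extraction counters plus the session/trial sets
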
def create_train_test_split(data, data_dir, timestamp):
    """Create a train/test split for the classification dataset"""

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print("\nCreating train/test split...")

    X = data['features']
    y = data['labels']
    metadata = data['metadata']

    # Split by session to avoid data leakage
    sessions = [meta['session'] for meta in metadata]
    unique_sessions = sorted(set(sessions))  # sorted for a reproducible split

    print(f"Available sessions: {len(unique_sessions)}")

    if len(unique_sessions) >= 4:
        # Use a session-based split (80% of sessions for training)
        n_train = int(len(unique_sessions) * 0.8)
        train_sessions = unique_sessions[:n_train]
        test_sessions = unique_sessions[n_train:]

        train_indices = [i for i, meta in enumerate(metadata) if meta['session'] in train_sessions]
        test_indices = [i for i, meta in enumerate(metadata) if meta['session'] in test_sessions]

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        print("Session-based split:")
        print(f"  Train sessions: {len(train_sessions)}")
        print(f"  Test sessions: {len(test_sessions)}")
    else:
        # Use a stratified random split (requires at least 2 samples per class)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print("Random split (stratified):")

    print(f"  Train samples: {len(X_train)}")
    print(f"  Test samples: {len(X_test)}")

    # Standardize features (fit on train only, then apply to test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save split data
    split_data = {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'phoneme_to_id': data['phoneme_to_id'],
        'id_to_phoneme': data['id_to_phoneme']
    }

    split_file = f"phoneme_classification_split_simple_{timestamp}.pkl"
    split_path = data_dir / split_file

    with open(split_path, 'wb') as f:
        pickle.dump(split_data, f)

    print(f"Train/test split saved to: {split_file}")

if __name__ == "__main__":
    try:
        classification_data = create_phoneme_classification_dataset()
    except Exception as e:
        print(f"Error creating classification dataset: {e}")
        import traceback
        traceback.print_exc()
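
# Example of consuming the saved split file (a sketch: assumes scikit-learn is
# installed and that a split file with the matching timestamp exists):
#
#   import pickle
#   from sklearn.linear_model import LogisticRegression
#
#   with open("phoneme_segmented_data/phoneme_classification_split_simple_<timestamp>.pkl", "rb") as f:
#       split = pickle.load(f)
#
#   clf = LogisticRegression(max_iter=1000)
#   clf.fit(split['X_train'], split['y_train'])
#   print("Test accuracy:", clf.score(split['X_test'], split['y_test']))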
