#!/usr/bin/env python3
"""
Create phoneme classification dataset from segmented data (simplified version,
without ground-truth validation).
"""

import pickle
import sys
from pathlib import Path

import numpy as np
import torch

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))


def load_neural_data_for_trial(session, trial_metadata):
    """Load neural features for a specific trial."""
    try:
        # Simplified approach: use h5py directly instead of the helper function
        import h5py

        # Locate this session's training data
        data_dir = Path(__file__).parent.parent / "data" / "hdf5_data_final"
        train_file = data_dir / session / "data_train.hdf5"

        if not train_file.exists():
            return None

        # Load the HDF5 file directly
        with h5py.File(train_file, 'r') as f:
            if 'neural_features' in f:
                neural_features = f['neural_features'][:]

                # Find the matching trial by trial_num
                trial_num = trial_metadata.get('trial_num', -1)
                if 0 <= trial_num < len(neural_features):
                    return neural_features[trial_num]

    except Exception as e:
        print(f"Warning: Could not load neural data for {session}, "
              f"trial {trial_metadata.get('trial_num', 'unknown')}: {e}")

    return None
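
# NOTE: the loader above assumes each session folder holds a data_train.hdf5
# whose 'neural_features' dataset is indexable by trial. If a file's layout
# differs, a quick way to inspect it (a sketch, with a placeholder path):
#
#   import h5py
#   with h5py.File("data/hdf5_data_final/<session>/data_train.hdf5", "r") as f:
#       f.visit(print)  # prints every group/dataset path in the file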


def create_phoneme_classification_dataset():
    """Create a phoneme classification dataset from segmented data without validation."""

    # Load the latest phoneme dataset
    data_dir = Path("phoneme_segmented_data")
    dataset_files = list(data_dir.glob("phoneme_dataset_*.pkl"))

    if not dataset_files:
        print("No phoneme dataset files found!")
        return

    latest_dataset = max(dataset_files, key=lambda x: x.stat().st_mtime)
    print(f"Loading dataset: {latest_dataset.name}")

    with open(latest_dataset, 'rb') as f:
        phoneme_data = pickle.load(f)

    print(f"Loaded {len(phoneme_data)} phoneme types")

    # Create classification dataset
    classification_data = {
        'features': [],       # Neural features for each segment
        'labels': [],         # Phoneme labels
        'phoneme_to_id': {},  # Phoneme to numeric ID mapping
        'id_to_phoneme': {},  # Numeric ID to phoneme mapping
        'metadata': []        # Additional metadata for each sample
    }

    # Create phoneme to ID mapping
    unique_phonemes = sorted(phoneme_data.keys())
    for i, phoneme in enumerate(unique_phonemes):
        classification_data['phoneme_to_id'][phoneme] = i
        classification_data['id_to_phoneme'][i] = phoneme

    print(f"\nPhoneme mapping created for {len(unique_phonemes)} phonemes:")
    for i, phoneme in enumerate(unique_phonemes[:10]):  # Show first 10
        print(f"  {i:2d}: '{phoneme}'")
    if len(unique_phonemes) > 10:
        print(f"  ... and {len(unique_phonemes) - 10} more")

    # Processing statistics
    processing_stats = {
        'total_segments': 0,
        'successful_extractions': 0,
        'failed_extractions': 0,
        'sessions_processed': set(),
        'trials_processed': set()
    }

    print("\nExtracting neural features for each phoneme segment...")

    for phoneme, segments in phoneme_data.items():
        phoneme_id = classification_data['phoneme_to_id'][phoneme]
        print(f"Processing '{phoneme}' ({len(segments)} segments)...")

        for segment_idx, segment in enumerate(segments):
            processing_stats['total_segments'] += 1

            # Get trial information
            session = segment['session']
            trial_num = segment.get('trial_num', -1)

            processing_stats['sessions_processed'].add(session)
            processing_stats['trials_processed'].add((session, trial_num))

            # Try to load neural data for this trial
            neural_features = load_neural_data_for_trial(session, segment)

            if neural_features is not None:
                # Extract the specific time segment
                start_time = int(segment['start_time'])
                end_time = int(segment['end_time'])

                # Ensure a valid time range (end_time is inclusive)
                if 0 <= start_time <= end_time < len(neural_features):
                    # Extract neural features for this time segment
                    segment_features = neural_features[start_time:end_time + 1]

                    # Convert to a numpy array if needed
                    if isinstance(segment_features, torch.Tensor):
                        segment_features = segment_features.numpy()
                    elif isinstance(segment_features, list):
                        segment_features = np.array(segment_features)

                    # For classification we need a fixed-size feature vector.
                    # Option 1: mean across time steps
                    if segment_features.ndim == 2:  # (time, features)
                        feature_vector = np.mean(segment_features, axis=0)
                    elif segment_features.ndim == 1:  # already 1D
                        feature_vector = segment_features
                    else:
                        print(f"Unexpected feature shape: {segment_features.shape}")
                        processing_stats['failed_extractions'] += 1
                        continue
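
                    # NOTE: mean-pooling discards temporal structure. An
                    # alternative (a sketch, not used here) is to resample each
                    # segment to a fixed number of time steps and flatten:
                    #   idx = np.linspace(0, len(segment_features) - 1, 8).astype(int)
                    #   feature_vector = segment_features[idx].ravel()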
                    # Add to dataset
                    classification_data['features'].append(feature_vector)
                    classification_data['labels'].append(phoneme_id)
                    classification_data['metadata'].append({
                        'phoneme': phoneme,
                        'session': session,
                        'trial_num': trial_num,
                        'trial_idx': segment.get('trial_idx', -1),
                        'start_time': start_time,
                        'end_time': end_time,
                        'duration': end_time - start_time + 1,
                        'confidence': segment.get('confidence', 0.0),
                        'corpus': segment.get('corpus', 'unknown'),
                        'block_num': segment.get('block_num', -1)
                    })

                    processing_stats['successful_extractions'] += 1
                else:
                    processing_stats['failed_extractions'] += 1
            else:
                processing_stats['failed_extractions'] += 1

            # Progress update
            if processing_stats['total_segments'] % 5000 == 0:
                print(f"  Processed {processing_stats['total_segments']} segments, "
                      f"extracted {processing_stats['successful_extractions']} features")
|  |     print(f"\nDataset creation completed!") | |||
|  |     print(f"Total segments processed: {processing_stats['total_segments']}") | |||
|  |     print(f"Successful feature extractions: {processing_stats['successful_extractions']}") | |||
|  |     print(f"Failed extractions: {processing_stats['failed_extractions']}") | |||
|  |     print(f"Success rate: {processing_stats['successful_extractions']/processing_stats['total_segments']*100:.1f}%") | |||
|  |     print(f"Sessions processed: {len(processing_stats['sessions_processed'])}") | |||
|  |     print(f"Unique trials processed: {len(processing_stats['trials_processed'])}") | |||
|  | 
 | |||
|  |     if processing_stats['successful_extractions'] == 0: | |||
|  |         print("No features were extracted. Check neural data availability.") | |||
|  |         return | |||
|  | 
 | |||
|  |     # Convert to numpy arrays | |||
|  |     classification_data['features'] = np.array(classification_data['features']) | |||
|  |     classification_data['labels'] = np.array(classification_data['labels']) | |||
|  | 
 | |||
|  |     print(f"\nFinal dataset shape:") | |||
|  |     print(f"Features: {classification_data['features'].shape}") | |||
|  |     print(f"Labels: {classification_data['labels'].shape}") | |||
|  | 
 | |||
|  |     # Show class distribution | |||
|  |     print(f"\nClass distribution:") | |||
|  |     unique_labels, counts = np.unique(classification_data['labels'], return_counts=True) | |||
|  |     for label_id, count in zip(unique_labels, counts): | |||
|  |         phoneme = classification_data['id_to_phoneme'][label_id] | |||
|  |         print(f"  {label_id:2d} ('{phoneme}'): {count:4d} samples") | |||
|  | 
 | |||
|  |     # Save the classification dataset | |||
|  |     timestamp = latest_dataset.name.split('_')[-1].replace('.pkl', '') | |||
|  |     output_file = f"phoneme_classification_dataset_simple_{timestamp}.pkl" | |||
|  |     output_path = data_dir / output_file | |||
|  | 
 | |||
|  |     # Add processing stats to the dataset | |||
|  |     classification_data['processing_stats'] = processing_stats | |||
|  | 
 | |||
|  |     with open(output_path, 'wb') as f: | |||
|  |         pickle.dump(classification_data, f) | |||
|  | 
 | |||
|  |     print(f"\nClassification dataset saved to: {output_file}") | |||
|  | 
 | |||
|  |     # Create a simple train/test split | |||
|  |     create_train_test_split(classification_data, data_dir, timestamp) | |||
|  | 
 | |||
|  |     return classification_data | |||
|  | 
 | |||
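
# An alternative to the manual session split below (a sketch, not used here):
# sklearn's GroupShuffleSplit keeps every sample from a given session on one
# side of the split, expressing the same leakage guard more compactly:
#
#   from sklearn.model_selection import GroupShuffleSplit
#   gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
#   train_idx, test_idx = next(gss.split(X, y, groups=sessions))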


def create_train_test_split(data, data_dir, timestamp):
    """Create train/test split for the classification dataset."""

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print("\nCreating train/test split...")

    X = data['features']
    y = data['labels']
    metadata = data['metadata']

    # Split by session to avoid data leakage
    sessions = [meta['session'] for meta in metadata]
    unique_sessions = sorted(set(sessions))  # Sorted for a deterministic split

    print(f"Available sessions: {len(unique_sessions)}")

    if len(unique_sessions) >= 4:
        # Use a session-based split (~80/20 by session)
        n_train = int(len(unique_sessions) * 0.8)
        train_sessions = set(unique_sessions[:n_train])
        test_sessions = set(unique_sessions[n_train:])

        train_indices = [i for i, meta in enumerate(metadata) if meta['session'] in train_sessions]
        test_indices = [i for i, meta in enumerate(metadata) if meta['session'] in test_sessions]

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        print("Session-based split:")
        print(f"  Train sessions: {len(train_sessions)}")
        print(f"  Test sessions: {len(test_sessions)}")
    else:
        # Use a random split; stratification assumes every class has >= 2 samples
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        print("Random split (stratified):")

    print(f"  Train samples: {len(X_train)}")
    print(f"  Test samples: {len(X_test)}")

    # Standardize features; fit on the training set only to avoid leakage
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save split data
    split_data = {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'phoneme_to_id': data['phoneme_to_id'],
        'id_to_phoneme': data['id_to_phoneme']
    }

    split_file = f"phoneme_classification_split_simple_{timestamp}.pkl"
    split_path = data_dir / split_file

    with open(split_path, 'wb') as f:
        pickle.dump(split_data, f)

    print(f"Train/test split saved to: {split_file}")


if __name__ == "__main__":
    try:
        classification_data = create_phoneme_classification_dataset()
    except Exception as e:
        print(f"Error creating classification dataset: {e}")
        import traceback
        traceback.print_exc()
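
# Example (a sketch): loading the saved split and fitting a quick baseline.
# Assumes scikit-learn is installed and a split file has been produced; the
# filename below is illustrative.
#
#   from sklearn.linear_model import LogisticRegression
#   with open("phoneme_segmented_data/phoneme_classification_split_simple_<timestamp>.pkl", "rb") as f:
#       split = pickle.load(f)
#   clf = LogisticRegression(max_iter=1000).fit(split['X_train'], split['y_train'])
#   print("Test accuracy:", clf.score(split['X_test'], split['y_test']))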