150 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
			
		
		
	
	
			150 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Bash
		
	
	
	
	
	
| #!/bin/bash
 | |
| # Setup script for TensorFlow Brain-to-Text training on TPU v5e-8
 | |
| #
 | |
| # Usage: ./setup_tensorflow_tpu.sh
 | |
| #
 | |
| # This script prepares the environment for training the brain-to-text model
 | |
| # using TensorFlow on TPU v5e-8 hardware.
 | |
| 
 | |
| set -e  # Exit on any error
 | |
| 
 | |
| echo "=== TensorFlow TPU v5e-8 Setup Script ==="
 | |
| echo "Setting up environment for brain-to-text training..."
 | |
| 
 | |
| # Check if we're in a TPU environment
 | |
| if [[ -z "${TPU_NAME}" ]] && [[ -z "${COLAB_TPU_ADDR}" ]]; then
 | |
|     echo "Warning: TPU environment variables not detected."
 | |
|     echo "Make sure you're running on a TPU v5e-8 instance."
 | |
| fi
 | |
| 
 | |
| # Create conda environment for TensorFlow TPU
 | |
| ENV_NAME="b2txt_tf"
 | |
| echo "Creating conda environment: ${ENV_NAME}"
 | |
| 
 | |
| if conda env list | grep -q "^${ENV_NAME} "; then
 | |
|     echo "Environment ${ENV_NAME} already exists. Activating..."
 | |
|     conda activate ${ENV_NAME}
 | |
| else
 | |
|     echo "Creating new environment..."
 | |
|     conda create -n ${ENV_NAME} python=3.10 -y
 | |
|     conda activate ${ENV_NAME}
 | |
| fi
 | |
| 
 | |
| # Install TensorFlow with TPU support
 | |
| echo "Installing TensorFlow with TPU support..."
 | |
| pip install tensorflow[and-cuda]>=2.15.0
 | |
| 
 | |
| # Install additional requirements
 | |
| echo "Installing additional requirements..."
 | |
| pip install -r requirements_tf.txt
 | |
| 
 | |
| # Set up TPU environment variables
 | |
| echo "Configuring TPU environment variables..."
 | |
| 
 | |
| # Create or update .bashrc with TPU optimizations
 | |
| cat >> ~/.bashrc << 'EOF'
 | |
| 
 | |
| # TPU v5e-8 Environment Variables
 | |
| export TPU_ML_PLATFORM="TensorFlow"
 | |
| export XLA_USE_BF16=1
 | |
| export TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit"
 | |
| export TPU_MEGACORE=1
 | |
| export LIBTPU_INIT_ARGS="--xla_tpu_spmd_threshold_for_allgather_cse=10000"
 | |
| 
 | |
| # Disable TensorFlow warnings for cleaner output
 | |
| export TF_CPP_MIN_LOG_LEVEL=2
 | |
| 
 | |
| # Memory optimizations
 | |
| export TF_FORCE_GPU_ALLOW_GROWTH=true
 | |
| export TF_GPU_THREAD_MODE=gpu_private
 | |
| 
 | |
| EOF
 | |
| 
 | |
| # Source the updated .bashrc
 | |
| source ~/.bashrc
 | |
| 
 | |
| # Test TPU connectivity
 | |
| echo "Testing TPU connectivity..."
 | |
| python3 << 'EOF'
 | |
| import tensorflow as tf
 | |
| print("TensorFlow version:", tf.__version__)
 | |
| 
 | |
| try:
 | |
|     resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
 | |
|     tf.config.experimental_connect_to_cluster(resolver)
 | |
|     tf.tpu.experimental.initialize_tpu_system(resolver)
 | |
|     strategy = tf.distribute.TPUStrategy(resolver)
 | |
|     print(f"TPU cluster initialized successfully!")
 | |
|     print(f"Number of TPU cores: {strategy.num_replicas_in_sync}")
 | |
|     print(f"TPU devices: {tf.config.list_logical_devices('TPU')}")
 | |
| except Exception as e:
 | |
|     print(f"TPU initialization failed: {e}")
 | |
|     print("You may be running on CPU/GPU instead of TPU")
 | |
| 
 | |
| # Test mixed precision
 | |
| policy = tf.keras.mixed_precision.Policy('mixed_bfloat16')
 | |
| tf.keras.mixed_precision.set_global_policy(policy)
 | |
| print(f"Mixed precision policy: {policy.name}")
 | |
| EOF
 | |
| 
 | |
| # Verify data directory exists
 | |
| DATA_DIR="../data/hdf5_data_final"
 | |
| if [ -d "$DATA_DIR" ]; then
 | |
|     echo "Data directory found: $DATA_DIR"
 | |
|     # Count available sessions
 | |
|     SESSION_COUNT=$(ls -d $DATA_DIR/t*.20* 2>/dev/null | wc -l)
 | |
|     echo "Available sessions: $SESSION_COUNT"
 | |
| else
 | |
|     echo "Warning: Data directory not found at $DATA_DIR"
 | |
|     echo "Please ensure the dataset is available before training."
 | |
| fi
 | |
| 
 | |
| # Create output directories
 | |
| echo "Creating output directories..."
 | |
| mkdir -p trained_models/tensorflow_tpu
 | |
| mkdir -p logs/tensorflow_tpu
 | |
| mkdir -p eval_output
 | |
| 
 | |
| # Make scripts executable
 | |
| echo "Setting script permissions..."
 | |
| chmod +x train_model_tf.py
 | |
| chmod +x evaluate_model_tf.py
 | |
| 
 | |
| # Display system information
 | |
| echo "=== System Information ==="
 | |
| echo "Python version: $(python --version)"
 | |
| echo "Conda environment: $CONDA_DEFAULT_ENV"
 | |
| echo "Available memory: $(free -h | grep '^Mem:' | awk '{print $7}')"
 | |
| echo "CPU cores: $(nproc)"
 | |
| 
 | |
| # Check for GPU/TPU
 | |
| echo "=== Hardware Information ==="
 | |
| if nvidia-smi &> /dev/null; then
 | |
|     echo "NVIDIA GPUs detected:"
 | |
|     nvidia-smi --list-gpus
 | |
| else
 | |
|     echo "No NVIDIA GPUs detected"
 | |
| fi
 | |
| 
 | |
| if [[ -n "${TPU_NAME}" ]]; then
 | |
|     echo "TPU Name: $TPU_NAME"
 | |
| elif [[ -n "${COLAB_TPU_ADDR}" ]]; then
 | |
|     echo "Colab TPU Address: $COLAB_TPU_ADDR"
 | |
| else
 | |
|     echo "No TPU environment variables detected"
 | |
| fi
 | |
| 
 | |
| echo ""
 | |
| echo "=== Setup Complete ==="
 | |
| echo "Environment '$ENV_NAME' is ready for TensorFlow TPU training."
 | |
| echo ""
 | |
| echo "To activate the environment:"
 | |
| echo "  conda activate $ENV_NAME"
 | |
| echo ""
 | |
| echo "To start training:"
 | |
| echo "  python train_model_tf.py --config_path rnn_args.yaml"
 | |
| echo ""
 | |
| echo "To run evaluation:"
 | |
| echo "  python evaluate_model_tf.py --model_path path/to/checkpoint --config_path rnn_args.yaml"
 | |
| echo ""
 | |
| echo "For more options, use --help with any script." | 
