#!/bin/bash
# Setup script for TensorFlow Brain-to-Text training on TPU v5e-8
#
# Usage: ./setup_tensorflow_tpu.sh
#
# This script prepares the environment for training the brain-to-text model
# using TensorFlow on TPU v5e-8 hardware. It:
#   1. creates (or reuses) the conda environment named by ENV_NAME,
#   2. installs TensorFlow and the packages in requirements_tf.txt,
#   3. appends TPU-related exports to ~/.bashrc (only once) and applies
#      them to the current shell.

set -e  # Exit on any error

echo "=== TensorFlow TPU v5e-8 Setup Script ==="
echo "Setting up environment for brain-to-text training..."

# Check if we're in a TPU environment
if [[ -z "${TPU_NAME:-}" && -z "${COLAB_TPU_ADDR:-}" ]]; then
  echo "Warning: TPU environment variables not detected."
  echo "Make sure you're running on a TPU v5e-8 instance."
fi

# `conda activate` is a shell function that only exists after conda's shell
# integration has been loaded. A script started with ./script.sh runs in a
# fresh non-interactive shell, so load the integration explicitly.
if ! command -v conda > /dev/null 2>&1; then
  echo "Error: conda was not found on PATH." >&2
  exit 1
fi
# shellcheck disable=SC1091
source "$(conda info --base)/etc/profile.d/conda.sh"

# Create conda environment for TensorFlow TPU
ENV_NAME="b2txt_tf"
echo "Creating conda environment: ${ENV_NAME}"

# Compare the first column of `conda env list` exactly against ENV_NAME.
# awk reads the whole listing, so conda never sees a closed pipe.
if conda env list \
  | awk -v name="${ENV_NAME}" '$1 == name { found = 1 } END { exit !found }'; then
  echo "Environment ${ENV_NAME} already exists. Activating..."
  conda activate "${ENV_NAME}"
else
  echo "Creating new environment..."
  conda create -n "${ENV_NAME}" python=3.10 -y
  conda activate "${ENV_NAME}"
fi

# Install TensorFlow with TPU support
echo "Installing TensorFlow with TPU support..."
# The requirement must be quoted: unquoted, the shell parses `>=2.15.0` as a
# redirection into a file named "=2.15.0" (so pip never sees the version
# constraint) and treats `[and-cuda]` as a glob pattern.
# NOTE(review): the `and-cuda` extra pulls in NVIDIA CUDA libraries; confirm
# this is the intended package set for a TPU host.
pip install "tensorflow[and-cuda]>=2.15.0"

# Install additional requirements
echo "Installing additional requirements..."
pip install -r requirements_tf.txt

# Set up TPU environment variables
echo "Configuring TPU environment variables..."

BASHRC="${HOME}/.bashrc"
TPU_ENV_MARKER="# TPU v5e-8 Environment Variables"

# Stage the export block in a temporary file so the same text can be both
# appended to ~/.bashrc and sourced into this shell.
tpu_env_file="$(mktemp)"
trap 'rm -f -- "${tpu_env_file}"' EXIT

cat > "${tpu_env_file}" << 'EOF'

# TPU v5e-8 Environment Variables
export TPU_ML_PLATFORM="TensorFlow"
export XLA_USE_BF16=1
export TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit"
export TPU_MEGACORE=1
export LIBTPU_INIT_ARGS="--xla_tpu_spmd_threshold_for_allgather_cse=10000"

# Disable TensorFlow warnings for cleaner output
export TF_CPP_MIN_LOG_LEVEL=2

# Memory optimizations
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_THREAD_MODE=gpu_private
EOF

# Append only when the marker line is absent, so re-running this script does
# not keep duplicating the block in ~/.bashrc.
if [[ -f "${BASHRC}" ]] && grep -qF -- "${TPU_ENV_MARKER}" "${BASHRC}"; then
  echo "TPU environment variables already present in ${BASHRC}; not appending again."
else
  cat "${tpu_env_file}" >> "${BASHRC}"
fi

# Apply the exports to the current shell directly instead of sourcing the
# whole ~/.bashrc: a user's ~/.bashrc may return early in non-interactive
# shells (skipping the appended block) or contain commands that fail under
# `set -e`.
# shellcheck disable=SC1090
source "${tpu_env_file}"

# Test TPU connectivity
echo "Testing TPU connectivity..."
# Smoke-test TensorFlow: print the version, try to connect to and initialize
# the TPU system, then set the global Keras mixed-precision policy. A TPU
# initialization failure is reported but does not raise; the mixed-precision
# step runs outside the try block.
python3 << 'EOF'
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("TPU cluster initialized successfully!")
    print(f"Number of TPU cores: {strategy.num_replicas_in_sync}")
    print(f"TPU devices: {tf.config.list_logical_devices('TPU')}")
except Exception as e:
    print(f"TPU initialization failed: {e}")
    print("You may be running on CPU/GPU instead of TPU")

# Test mixed precision
policy = tf.keras.mixed_precision.Policy('mixed_bfloat16')
tf.keras.mixed_precision.set_global_policy(policy)
print(f"Mixed precision policy: {policy.name}")
EOF

#######################################
# Count the entries directly under a directory whose names match t*.20*.
# Uses a glob array instead of parsing `ls`, so directory paths containing
# spaces or glob characters are handled and "no match" yields 0.
# Arguments: $1 - directory to inspect
# Outputs:   the number of matching entries on stdout
#######################################
count_sessions() (
  # Subshell body: the nullglob setting does not leak into the caller.
  shopt -s nullglob
  sessions=("$1"/t*.20*)
  printf '%s\n' "${#sessions[@]}"
)

# Verify data directory exists
DATA_DIR="../data/hdf5_data_final"
if [[ -d "${DATA_DIR}" ]]; then
  echo "Data directory found: $DATA_DIR"
  # Count available sessions
  SESSION_COUNT="$(count_sessions "${DATA_DIR}")"
  echo "Available sessions: $SESSION_COUNT"
else
  echo "Warning: Data directory not found at $DATA_DIR"
  echo "Please ensure the dataset is available before training."
fi

# Create output directories
echo "Creating output directories..."
mkdir -p trained_models/tensorflow_tpu
mkdir -p logs/tensorflow_tpu
mkdir -p eval_output

# Make scripts executable
echo "Setting script permissions..."
# Mark the training and evaluation entry points as executable.
for script in train_model_tf.py evaluate_model_tf.py; do
  chmod +x "${script}"
done

# Summarize the interpreter, active conda environment, memory and CPU count.
# The substitutions stay inline so a failing probe only yields an empty
# field, exactly as with a plain echo.
printf '%s\n' "=== System Information ==="
printf '%s\n' "Python version: $(python --version)"
printf '%s\n' "Conda environment: $CONDA_DEFAULT_ENV"
# Field 7 of the "Mem:" row printed by `free -h`.
printf '%s\n' "Available memory: $(free -h | awk '/^Mem:/ {print $7}')"
printf '%s\n' "CPU cores: $(nproc)"

# Report accelerators: NVIDIA GPUs first, then TPU environment variables.
printf '%s\n' "=== Hardware Information ==="
if ! nvidia-smi > /dev/null 2>&1; then
  printf '%s\n' "No NVIDIA GPUs detected"
else
  printf '%s\n' "NVIDIA GPUs detected:"
  nvidia-smi --list-gpus
fi

# TPU_NAME takes precedence over COLAB_TPU_ADDR when both are set.
if [[ -n "${TPU_NAME}" ]]; then
  tpu_summary="TPU Name: $TPU_NAME"
elif [[ -n "${COLAB_TPU_ADDR}" ]]; then
  tpu_summary="Colab TPU Address: $COLAB_TPU_ADDR"
else
  tpu_summary="No TPU environment variables detected"
fi
printf '%s\n' "${tpu_summary}"

# Closing instructions; the unquoted delimiter lets $ENV_NAME expand.
cat << EOF

=== Setup Complete ===
Environment '$ENV_NAME' is ready for TensorFlow TPU training.

To activate the environment:
 conda activate $ENV_NAME

To start training:
 python train_model_tf.py --config_path rnn_args.yaml

To run evaluation:
 python evaluate_model_tf.py --model_path path/to/checkpoint --config_path rnn_args.yaml

For more options, use --help with any script.
EOF