From 426b72ef259226a036acd49b6eba2650211eeeba Mon Sep 17 00:00:00 2001
From: Zchen <161216199+ZH-CEN@users.noreply.github.com>
Date: Thu, 16 Oct 2025 21:26:00 +0800
Subject: [PATCH] fix: pre-build optimizer state under the TPU strategy and
 guard apply_gradients

Pre-build the AdamW optimizer state explicitly via optimizer.build() in both
the strategy and fallback paths, log the strategy type when the optimizer is
created, and guard apply_gradients in the train step so a lost distribution
strategy context ("'NoneType' object has no attribute 'extended'") surfaces
as a clear RuntimeError instead of an opaque AttributeError.
---
 model_training_nnn_tpu/trainer_tf.py | 33 +++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/model_training_nnn_tpu/trainer_tf.py b/model_training_nnn_tpu/trainer_tf.py
index 2264fc1..3e27fa4 100644
--- a/model_training_nnn_tpu/trainer_tf.py
+++ b/model_training_nnn_tpu/trainer_tf.py
@@ -102,18 +102,25 @@ class BrainToTextDecoderTrainerTF:
         try:
             # Check if strategy is properly initialized before applying gradients
             if hasattr(self.strategy, 'merge_call') and callable(getattr(self.strategy, 'merge_call')):
+                print("✅ Strategy has merge_call, building optimizer properly...")
+
+                # Build optimizer state explicitly for the model's trainable variables
+                self.optimizer.build(self.model.trainable_variables)
+                print("✅ Optimizer built with model variables")
+
+                # Apply dummy gradients once so slot variables exist under the strategy
                 dummy_grads = [tf.zeros_like(w) for w in self.model.trainable_variables]
                 self.optimizer.apply_gradients(zip(dummy_grads, self.model.trainable_variables))
                 print("✅ Optimizer state pre-built successfully with TPU strategy")
             else:
                 # Fallback: just build optimizer variables without applying gradients
-                print("⚠️ Strategy not fully initialized, skipping optimizer pre-build")
-                # Alternative: trigger optimizer variable creation
-                _ = self.optimizer.iterations
-                print("✅ Optimizer state initialized (fallback mode)")
+                print("⚠️ Strategy not fully initialized, using fallback optimizer build")
+                # Force-build the optimizer with the model variables
+                self.optimizer.build(self.model.trainable_variables)
+                print("✅ Optimizer built in fallback mode")
         except Exception as e:
             print(f"⚠️ Warning: Could not pre-build optimizer state: {e}")
-            print("✅ Continuing without optimizer pre-build")
+            print("✅ Continuing without optimizer pre-build - optimizer will build during first training step")
 
         print("📅 Setting up learning rate scheduler...")
         self.lr_scheduler = self._create_lr_scheduler()
@@ -414,6 +421,9 @@ class BrainToTextDecoderTrainerTF:
         """Create AdamW optimizer with parameter groups"""
         # Note: TensorFlow doesn't have the same parameter group functionality as PyTorch
         # We'll use a single optimizer and handle different learning rates in the scheduler
+
+        # Create optimizer within strategy scope to ensure proper initialization
+        print(f"Creating optimizer with strategy: {type(self.strategy).__name__}")
         optimizer = tf.keras.optimizers.AdamW(
             learning_rate=self.args['lr_max'],
             beta_1=self.args['beta0'],
@@ -565,7 +575,18 @@ class BrainToTextDecoderTrainerTF:
 
         # Apply gradients (only for variables that have gradients)
         if len(filtered_gradients) > 0:
-            self.optimizer.apply_gradients(zip(filtered_gradients, filtered_variables))
+            # apply_gradients must run under the distribution strategy; a lost strategy
+            # context surfaces as "'NoneType' object has no attribute 'extended'"
+            try:
+                self.optimizer.apply_gradients(zip(filtered_gradients, filtered_variables))
+            except AttributeError as e:
+                if "'NoneType' object has no attribute 'extended'" in str(e):
+                    # Strategy context was lost; this should not happen inside a @tf.function
+                    tf.print(f"ERROR: Strategy context lost during gradient application: {e}")
+                    tf.print("This indicates a serious issue with the distributed training setup")
+                    raise RuntimeError(f"Strategy context lost during training: {e}")
+                else:
+                    raise
 
         return loss, grad_norm
 
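
Note: the pre-build pattern from the first hunk, reduced to a standalone
sketch. The Sequential model and tf.distribute.get_strategy() below are
stand-ins for the trainer's model and its TPU strategy, not code from this
repository.

    import tensorflow as tf

    strategy = tf.distribute.get_strategy()  # stand-in for a TPUStrategy

    with strategy.scope():
        model = tf.keras.Sequential([tf.keras.Input(shape=(8,)),
                                     tf.keras.layers.Dense(4)])
        optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3)

        # Create the optimizer's slot variables (AdamW momenta) up front,
        # instead of lazily during the first real training step
        optimizer.build(model.trainable_variables)

        # Apply all-zero gradients once to confirm apply_gradients works
        # under the strategy before training starts
        dummy_grads = [tf.zeros_like(v) for v in model.trainable_variables]
        optimizer.apply_gradients(zip(dummy_grads, model.trainable_variables))

Pre-building this way moves optimizer-variable creation to startup, where a
strategy misconfiguration fails immediately instead of mid-training. One side
effect: the zero-gradient apply advances optimizer.iterations by one.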
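Note: a minimal sketch of the guarded train step from the last hunk. The toy
model and squared-error loss are placeholders; the real trainer computes a
CTC loss and also returns a gradient norm.

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.Input(shape=(8,)),
                                 tf.keras.layers.Dense(1)])
    optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3)

    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
        grads = tape.gradient(loss, model.trainable_variables)
        # Keep only (grad, var) pairs that actually received a gradient,
        # mirroring the trainer's filtered_gradients/filtered_variables
        pairs = [(g, v) for g, v in zip(grads, model.trainable_variables)
                 if g is not None]
        try:
            optimizer.apply_gradients(pairs)
        except AttributeError as e:
            # A lost distribution-strategy context shows up as this exact
            # AttributeError; re-raise it with a more actionable message
            if "'NoneType' object has no attribute 'extended'" in str(e):
                raise RuntimeError(f"Strategy context lost during training: {e}")
            raise
        return loss

    loss = train_step(tf.random.normal([2, 8]), tf.zeros([2, 1]))

Because train_step is a @tf.function, the except branch can only fire while
the function is being traced; once the graph is built, apply_gradients has
already been lowered to ops and no Python AttributeError is raised at run time.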