Merge pull request #6 from kakuteki/main

Fix GPU device ID mismatch causing CUDA invalid device error
2025-07-28 21:40:21 -07:00
parent e93cff1e2e 9a146102ff
commit 980a5b7e96
1 changed file with 24 additions and 3 deletions
--- a/model_training/rnn_trainer.py
+++ b/model_training/rnn_trainer.py
@@ -83,12 +83,33 @@ class BrainToTextDecoder_Trainer:
        # Configure device pytorch will use 
        if torch.cuda.is_available():
-            self.device = f"cuda:{self.args['gpu_number']}"
+            gpu_num = self.args.get('gpu_number', 0)
-        else: 
+            try:
-            self.device = "cpu"
+                gpu_num = int(gpu_num)
            except ValueError:
                self.logger.warning(f"Invalid gpu_number value: {gpu_num}. Using 0 instead.")
                gpu_num = 0
            max_gpu_index = torch.cuda.device_count() - 1
            if gpu_num > max_gpu_index:
                self.logger.warning(f"Requested GPU {gpu_num} not available. Using GPU 0 instead.")
                gpu_num = 0
            try:
                self.device = torch.device(f"cuda:{gpu_num}")
                test_tensor = torch.tensor([1.0]).to(self.device)
                test_tensor = test_tensor * 2
            except Exception as e:
                self.logger.error(f"Error initializing CUDA device {gpu_num}: {str(e)}")
                self.logger.info("Falling back to CPU")
                self.device = torch.device("cpu")
        else:
            self.device = torch.device("cpu")
        self.logger.info(f'Using device: {self.device}')
        # Set seed if provided 
        if self.args['seed'] != -1:
            np.random.seed(self.args['seed'])