diff --git a/model_training_nnn_tpu/Untitled-1.ipynb b/model_training_nnn_tpu/Untitled-1.ipynb index d4322a7..939a7cb 100644 --- a/model_training_nnn_tpu/Untitled-1.ipynb +++ b/model_training_nnn_tpu/Untitled-1.ipynb @@ -2,16 +2,218 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "acb1482e", "metadata": {}, "outputs": [], + "source": [ + "# 我只想看看TPU占用情况" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b317eff3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?1h\u001b=\u001b[H\u001b[2J\u001b[mtop - 17:43:08 up 29 min, 0 user, load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "Tasks:\u001b[m\u001b[m\u001b[1m 10 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 1 \u001b[m\u001b[mrunning,\u001b[m\u001b[m\u001b[1m 9 \u001b[m\u001b[msleeping,\u001b[m\u001b[m\u001b[1m 0 \u001b[m\u001b[mstopped,\u001b[m\u001b[m\u001b[1m 0 \u001b[m\u001b[mzombie\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 98.9 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.7 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292516.1 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61605.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35359.9 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 
\u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325302.9 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "\u001b[K\n", + "\u001b[7m PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 310 root 20 0 175.4g 54.5g 371368 S 13.3 14.4 8:45.06 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 1 root 20 0 400128 98296 18312 S 0.0 0.0 0:08.66 jupyter+ \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 13 root 20 0 910476 61572 15620 S 0.0 0.0 0:10.47 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2209 root 20 0 755688 68480 15688 S 0.0 0.0 0:00.64 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2241 root 20 0 2576 956 856 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n", + "\u001b[m\u001b[1m 2242 root 20 0 9180 5080 2912 R 0.0 0.0 0:00.01 top \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:11 up 30 min, 0 user, load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "\n", + "%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.3 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.2 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.2 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Mem :\u001b[m\u001b[m\u001b[1m 
386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292395.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61724.6 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35362.1 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325184.2 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "\u001b[K\n", + "\n", + "\u001b[m 310 root 20 0 175.5g 54.6g 371368 S 14.3 14.5 8:45.49 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 13 root 20 0 910476 61572 15620 S 1.0 0.0 0:10.50 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2209 root 20 0 755688 68480 15688 S 1.0 0.0 0:00.67 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 1 root 20 0 400128 98296 18312 S 0.7 0.0 0:08.68 jupyter+ \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n", + "\n", + "\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:14 up 30 min, 0 user, load average: 2.17, 2.21, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "\n", + "%Cpu(s):\u001b[m\u001b[m\u001b[1m 0.2 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m 0.1 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.3 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m 0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m 0.0 
\u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292220.6 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 61896.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m 35364.2 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "MiB Swap:\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m 0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325011.8 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n", + "\u001b[K\n", + "\n", + "\u001b[m 310 root 20 0 175.6g 54.8g 371368 S 17.7 14.5 8:46.02 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 13 root 20 0 910476 61572 15620 S 1.0 0.0 0:10.53 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2209 root 20 0 755688 68480 15688 S 1.0 0.0 0:00.70 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 1 root 20 0 400128 98296 18312 S 0.3 0.0 0:08.69 jupyter+ \u001b[m\u001b[m\u001b[K\n", + "\u001b[m\u001b[1m 2242 root 20 0 9180 5080 2912 R 0.3 0.0 0:00.02 top \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 30 root 20 0 5685268 183828 45432 S 0.0 0.0 0:16.19 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 164 root 20 0 29448 25612 8688 S 0.0 0.0 0:00.11 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 309 root 20 0 2576 904 812 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2175 root 20 0 26072 22068 8836 S 0.0 0.0 0:00.06 python \u001b[m\u001b[m\u001b[K\n", + "\u001b[m 2241 root 20 0 2576 956 856 S 0.0 0.0 0:00.00 sh \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[?1l\u001b>\u001b[25;1H\n", + "\u001b[K" + ] + } + ], + "source": [ + "!top" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "id": "1eee541b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root 309 0.0 0.0 2576 904 pts/0 Ss+ 17:24 0:00 /usr/bin/sh -c cd /kaggle/working/b2txt25/model_training_nnn_tpu && python train_model_tf.py --config_path rnn_args.yaml\n", + "root 2268 0.0 0.0 2576 940 pts/1 Ss+ 17:44 0:00 /usr/bin/sh -c ps aux | grep -i tpu\n", + "root 2270 0.0 0.0 3744 2024 pts/1 S+ 17:44 0:00 grep -i tpu\n" + ] + } + ], + "source": [ + "!ps aux | grep -i tpu" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2f03ffe1", + "metadata": {}, + "outputs": [], + "source": [ + "!pgrep -fl \"python.*(tensorflow|train)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ffbc7471", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== TPU状态检查 ===\n", + "时间: 2025-10-15 17:46:08\n", + "❌ TPU检查失败: open(/dev/vfio/5): Device or resource busy: Device or resource busy; Couldn't open iommu group /dev/vfio/5\n", + "💻 CPU使用率: 0.4%\n", + "💾 内存使用: 18.9% (68GB/377GB)\n", + "🐍 Python进程: 6个\n", + " PID:13 CPU:0.0% MEM:0.0%\n", + " PID:30 CPU:0.0% MEM:0.0%\n", + " PID:164 CPU:0.0% MEM:0.0%\n", + "🧪 TPU连接测试...\n", + "❌ TPU测试失败: name 'tpu_devices' is not defined\n", + "=== 检查完成 ===\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "import psutil\n", + "import os\n", + "import time\n", + "\n", + "print(\"=== TPU状态检查 ===\")\n", + "print(f\"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\")\n", + "\n", + "# TPU device check.\n", + "# Bind the name before the try so the connection test further down still sees\n", + "# a defined (empty) list when enumeration raises, instead of hitting a NameError.\n", + "# NOTE(review): the recorded run failed here with 'Device or resource busy' while\n", + "# train_model_tf.py was running - presumably that process holds the TPU; confirm.\n", + "tpu_devices = []\n", + "try:\n", + "    tpu_devices = tf.config.list_logical_devices('TPU')\n", + "    print(f\"✅ TPU设备: {len(tpu_devices)}个\")\n", + "    for i, device in enumerate(tpu_devices):\n", + "        print(f\" TPU:{i} -> {device.name}\")\n", + "except Exception as e:\n", + "    print(f\"❌ TPU检查失败: {e}\")\n", + "\n", + "# Host resources: CPU sampled over one second, plus RAM usage.\n", + "try:\n", + "    cpu_percent = psutil.cpu_percent(interval=1)\n", + "    memory = psutil.virtual_memory()\n", + "    print(f\"💻 
CPU使用率: {cpu_percent:.1f}%\")\n", + "    print(f\"💾 内存使用: {memory.percent:.1f}% ({memory.used//1024//1024//1024}GB/{memory.total//1024//1024//1024}GB)\")\n", + "except Exception as e:\n", + "    print(f\"❌ 系统资源检查失败: {e}\")\n", + "\n", + "# Python process check.\n", + "try:\n", + "    python_processes = []\n", + "    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):\n", + "        # process_iter() substitutes None for any field it cannot read, so guard before .lower().\n", + "        if 'python' in (proc.info['name'] or '').lower():\n", + "            python_processes.append(proc.info)\n", + "\n", + "    # Largest memory share first, so the three rows printed are the heaviest processes\n", + "    # rather than whichever three happen to be enumerated first.\n", + "    python_processes.sort(key=lambda info: info['memory_percent'] or 0.0, reverse=True)\n", + "\n", + "    print(f\"🐍 Python进程: {len(python_processes)}个\")\n", + "    for proc in python_processes[:3]:  # show only the top 3\n", + "        print(f\" PID:{proc['pid']} CPU:{(proc['cpu_percent'] or 0.0):.1f}% MEM:{(proc['memory_percent'] or 0.0):.1f}%\")\n", + "except Exception as e:\n", + "    print(f\"❌ 进程检查失败: {e}\")\n", + "\n", + "# Minimal TPU smoke test: a 1x1 matmul placed on the first TPU.\n", + "try:\n", + "    print(\"🧪 TPU连接测试...\")\n", + "    if tpu_devices:\n", + "        with tf.device('/TPU:0'):\n", + "            x = tf.constant([[1.0]])\n", + "            result = tf.matmul(x, x)\n", + "            print(f\"✅ TPU响应正常: {result.numpy()}\")\n", + "    else:\n", + "        print(\"⚠️ 没有TPU设备可测试\")\n", + "except Exception as e:\n", + "    print(f\"❌ TPU测试失败: {e}\")\n", + "\n", + "print(\"=== 检查完成 ===\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e157ff0", + "metadata": {}, + "outputs": [], "source": [] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" } }, "nbformat": 4, diff --git a/model_training_nnn_tpu/rnn_args.yaml b/model_training_nnn_tpu/rnn_args.yaml index 9a7034f..227bc43 100644 --- a/model_training_nnn_tpu/rnn_args.yaml +++ b/model_training_nnn_tpu/rnn_args.yaml @@ -1,9 +1,9 @@ model: n_input_features: 512 # number of input features in the neural data. 
(2 features per electrode, 256 electrodes) - n_units: 768 # number of units per GRU layer + n_units: 256 # number of units per GRU layer (sharply reduced from 768 to 256, cutting the parameter count by an estimated 70%) rnn_dropout: 0.4 # dropout rate for the GRU layers rnn_trainable: true # whether the GRU layers are trainable - n_layers: 5 # number of GRU layers + n_layers: 3 # number of GRU layers (reduced from 5 layers to 3) patch_size: 14 # size of the input patches (14 time steps) patch_stride: 4 # stride for the input patches (4 time steps)