222 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			222 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 1,
 | |
|    "id": "acb1482e",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "# 我只想看看TPU占用情况"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 2,
 | |
|    "id": "b317eff3",
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "\u001b[?1h\u001b=\u001b[H\u001b[2J\u001b[mtop - 17:43:08 up 29 min,  0 user,  load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "Tasks:\u001b[m\u001b[m\u001b[1m  10 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m   1 \u001b[m\u001b[mrunning,\u001b[m\u001b[m\u001b[1m   9 \u001b[m\u001b[msleeping,\u001b[m\u001b[m\u001b[1m   0 \u001b[m\u001b[mstopped,\u001b[m\u001b[m\u001b[1m   0 \u001b[m\u001b[mzombie\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "%Cpu(s):\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m  0.4 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 98.9 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m  0.7 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292516.1 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m  61605.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m  35359.9 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m    \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Swap:\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325302.9 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[K\n",
 | |
|       "\u001b[7m    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND  \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    310 root      20   0  175.4g  54.5g 371368 S  13.3  14.4   8:45.06 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m      1 root      20   0  400128  98296  18312 S   0.0   0.0   0:08.66 jupyter+ \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     13 root      20   0  910476  61572  15620 S   0.0   0.0   0:10.47 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     30 root      20   0 5685268 183828  45432 S   0.0   0.0   0:16.19 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    164 root      20   0   29448  25612   8688 S   0.0   0.0   0:00.11 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    309 root      20   0    2576    904    812 S   0.0   0.0   0:00.00 sh       \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2175 root      20   0   26072  22068   8836 S   0.0   0.0   0:00.06 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2209 root      20   0  755688  68480  15688 S   0.0   0.0   0:00.64 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2241 root      20   0    2576    956    856 S   0.0   0.0   0:00.00 sh       \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m\u001b[1m   2242 root      20   0    9180   5080   2912 R   0.0   0.0   0:00.01 top      \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:11 up 30 min,  0 user,  load average: 2.18, 2.22, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "\n",
 | |
|       "%Cpu(s):\u001b[m\u001b[m\u001b[1m  0.3 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m  0.2 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.2 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m  0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292395.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m  61724.6 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m  35362.1 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m    \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Swap:\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325184.2 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[K\n",
 | |
|       "\n",
 | |
|       "\u001b[m    310 root      20   0  175.5g  54.6g 371368 S  14.3  14.5   8:45.49 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     13 root      20   0  910476  61572  15620 S   1.0   0.0   0:10.50 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2209 root      20   0  755688  68480  15688 S   1.0   0.0   0:00.67 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m      1 root      20   0  400128  98296  18312 S   0.7   0.0   0:08.68 jupyter+ \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     30 root      20   0 5685268 183828  45432 S   0.0   0.0   0:16.19 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    164 root      20   0   29448  25612   8688 S   0.0   0.0   0:00.11 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    309 root      20   0    2576    904    812 S   0.0   0.0   0:00.00 sh       \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2175 root      20   0   26072  22068   8836 S   0.0   0.0   0:00.06 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\n",
 | |
|       "\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[H\u001b[mtop - 17:43:14 up 30 min,  0 user,  load average: 2.17, 2.21, 2.19\u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "\n",
 | |
|       "%Cpu(s):\u001b[m\u001b[m\u001b[1m  0.2 \u001b[m\u001b[mus,\u001b[m\u001b[m\u001b[1m  0.1 \u001b[m\u001b[msy,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mni,\u001b[m\u001b[m\u001b[1m 99.3 \u001b[m\u001b[mid,\u001b[m\u001b[m\u001b[1m  0.4 \u001b[m\u001b[mwa,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mhi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[msi,\u001b[m\u001b[m\u001b[1m  0.0 \u001b[m\u001b[mst\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Mem :\u001b[m\u001b[m\u001b[1m 386908.8 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m 292220.6 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m  61896.9 \u001b[m\u001b[mused,\u001b[m\u001b[m\u001b[1m  35364.2 \u001b[m\u001b[mbuff/cache\u001b[m\u001b[m\u001b[m \u001b[m\u001b[m\u001b[m    \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "MiB Swap:\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mtotal,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mfree,\u001b[m\u001b[m\u001b[1m      0.0 \u001b[m\u001b[mused.\u001b[m\u001b[m\u001b[1m 325011.8 \u001b[m\u001b[mavail Mem \u001b[m\u001b[m\u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[K\n",
 | |
|       "\n",
 | |
|       "\u001b[m    310 root      20   0  175.6g  54.8g 371368 S  17.7  14.5   8:46.02 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     13 root      20   0  910476  61572  15620 S   1.0   0.0   0:10.53 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2209 root      20   0  755688  68480  15688 S   1.0   0.0   0:00.70 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m      1 root      20   0  400128  98296  18312 S   0.3   0.0   0:08.69 jupyter+ \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m\u001b[1m   2242 root      20   0    9180   5080   2912 R   0.3   0.0   0:00.02 top      \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m     30 root      20   0 5685268 183828  45432 S   0.0   0.0   0:16.19 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    164 root      20   0   29448  25612   8688 S   0.0   0.0   0:00.11 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m    309 root      20   0    2576    904    812 S   0.0   0.0   0:00.00 sh       \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2175 root      20   0   26072  22068   8836 S   0.0   0.0   0:00.06 python   \u001b[m\u001b[m\u001b[K\n",
 | |
|       "\u001b[m   2241 root      20   0    2576    956    856 S   0.0   0.0   0:00.00 sh       \u001b[m\u001b[m\u001b[K\u001b[18;1H\u001b[K\u001b[19;1H\u001b[K\u001b[20;1H\u001b[K\u001b[21;1H\u001b[K\u001b[22;1H\u001b[K\u001b[23;1H\u001b[K\u001b[24;1H\u001b[K\u001b[?1l\u001b>\u001b[25;1H\n",
 | |
|       "\u001b[K"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "!top"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 3,
 | |
|    "id": "1eee541b",
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "root         309  0.0  0.0   2576   904 pts/0    Ss+  17:24   0:00 /usr/bin/sh -c cd /kaggle/working/b2txt25/model_training_nnn_tpu && python train_model_tf.py --config_path rnn_args.yaml\n",
 | |
|       "root        2268  0.0  0.0   2576   940 pts/1    Ss+  17:44   0:00 /usr/bin/sh -c ps aux | grep -i tpu\n",
 | |
|       "root        2270  0.0  0.0   3744  2024 pts/1    S+   17:44   0:00 grep -i tpu\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "!ps aux | grep -i tpu"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 5,
 | |
|    "id": "2f03ffe1",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "!pgrep -fl \"python.*tensorflow\\|python.*train\""
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 6,
 | |
|    "id": "ffbc7471",
 | |
|    "metadata": {},
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "=== TPU状态检查 ===\n",
 | |
|       "时间: 2025-10-15 17:46:08\n",
 | |
|       "❌ TPU检查失败: open(/dev/vfio/5): Device or resource busy: Device or resource busy; Couldn't open iommu group /dev/vfio/5\n",
 | |
|       "💻 CPU使用率: 0.4%\n",
 | |
|       "💾 内存使用: 18.9% (68GB/377GB)\n",
 | |
|       "🐍 Python进程: 6个\n",
 | |
|       "   PID:13 CPU:0.0% MEM:0.0%\n",
 | |
|       "   PID:30 CPU:0.0% MEM:0.0%\n",
 | |
|       "   PID:164 CPU:0.0% MEM:0.0%\n",
 | |
|       "🧪 TPU连接测试...\n",
 | |
|       "❌ TPU测试失败: name 'tpu_devices' is not defined\n",
 | |
|       "=== 检查完成 ===\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "import tensorflow as tf\n",
 | |
|     "import psutil\n",
 | |
|     "import os\n",
 | |
|     "import time\n",
 | |
|     "\n",
 | |
|     "print(\"=== TPU状态检查 ===\")\n",
 | |
|     "print(f\"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\")\n",
 | |
|     "\n",
 | |
|     "# TPU设备检查\n",
 | |
|     "try:\n",
 | |
|     "    tpu_devices = tf.config.list_logical_devices('TPU')\n",
 | |
|     "    print(f\"✅ TPU设备: {len(tpu_devices)}个\")\n",
 | |
|     "    for i, device in enumerate(tpu_devices):\n",
 | |
|     "        print(f\"   TPU:{i} -> {device.name}\")\n",
 | |
|     "except Exception as e:\n",
 | |
|     "    print(f\"❌ TPU检查失败: {e}\")\n",
 | |
|     "\n",
 | |
|     "# 系统资源\n",
 | |
|     "try:\n",
 | |
|     "    cpu_percent = psutil.cpu_percent(interval=1)\n",
 | |
|     "    memory = psutil.virtual_memory()\n",
 | |
|     "    print(f\"💻 CPU使用率: {cpu_percent:.1f}%\")\n",
 | |
|     "    print(f\"💾 内存使用: {memory.percent:.1f}% ({memory.used//1024//1024//1024}GB/{memory.total//1024//1024//1024}GB)\")\n",
 | |
|     "except Exception as e:\n",
 | |
|     "    print(f\"❌ 系统资源检查失败: {e}\")\n",
 | |
|     "\n",
 | |
|     "# Python进程检查\n",
 | |
|     "try:\n",
 | |
|     "    python_processes = []\n",
 | |
|     "    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):\n",
 | |
|     "        if 'python' in proc.info['name'].lower():\n",
 | |
|     "            python_processes.append(proc.info)\n",
 | |
|     "    \n",
 | |
|     "    print(f\"🐍 Python进程: {len(python_processes)}个\")\n",
 | |
|     "    for proc in python_processes[:3]:  # 只显示前3个\n",
 | |
|     "        print(f\"   PID:{proc['pid']} CPU:{proc['cpu_percent']:.1f}% MEM:{proc['memory_percent']:.1f}%\")\n",
 | |
|     "except Exception as e:\n",
 | |
|     "    print(f\"❌ 进程检查失败: {e}\")\n",
 | |
|     "\n",
 | |
|     "# TPU简单测试\n",
 | |
|     "try:\n",
 | |
|     "    print(\"🧪 TPU连接测试...\")\n",
 | |
|     "    if tpu_devices:\n",
 | |
|     "        with tf.device('/TPU:0'):\n",
 | |
|     "            x = tf.constant([[1.0]])\n",
 | |
|     "            result = tf.matmul(x, x)\n",
 | |
|     "            print(f\"✅ TPU响应正常: {result.numpy()}\")\n",
 | |
|     "    else:\n",
 | |
|     "        print(\"⚠️  没有TPU设备可测试\")\n",
 | |
|     "except Exception as e:\n",
 | |
|     "    print(f\"❌ TPU测试失败: {e}\")\n",
 | |
|     "\n",
 | |
|     "print(\"=== 检查完成 ===\")"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": null,
 | |
|    "id": "2e157ff0",
 | |
|    "metadata": {},
 | |
|    "outputs": [],
 | |
|    "source": []
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3 (ipykernel)",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.10.13"
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 5
 | |
| }
 | 
