# %%
# I only want to see how busy the TPU is.

# %%
# Take a single snapshot of the process table.
#
# A notebook cell has no terminal, so plain `top` starts in interactive mode:
# it floods the output with ANSI cursor-control escape sequences and keeps
# refreshing until the cell is interrupted. Batch mode (-b) prints plain text
# and `-n 1` exits after one iteration.
#
# NOTE(review): `top` reports host CPU and memory only; it says nothing about
# the TPU itself.
# !top -b -n 1
# %%
# List every process whose command line mentions "tpu".
# The `[t]` character class keeps grep (and the `sh -c` wrapper that runs the
# pipeline) from matching its own command line, so only real hits are shown.
# !ps aux | grep -i '[t]pu'

# %%
# Find running Python training / TensorFlow processes.
# pgrep patterns are extended regular expressions: alternation is a bare `|`
# inside a group, whereas the escaped `\|` form matches nothing here.
# `-a` prints the full command line (with `-f`, `-l` only prints the short
# process name), and `[p]` again avoids matching the wrapper shell.
# !pgrep -fa '[p]ython.*(tensorflow|train)'

# %%
import os
import time

import psutil
import tensorflow as tf

# Tunables for the report below.
BYTES_PER_GIB = 1024 ** 3
TOP_N_PROCESSES = 3           # how many Python processes to list
PROCESS_SAMPLE_SECONDS = 0.5  # window used to measure per-process CPU usage

print("=== TPU状态检查 ===")
print(f"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}")

# --- TPU devices -------------------------------------------------------------
# Bind the name before the try block: if the listing raises, the connectivity
# test further down must still find a (then empty) list instead of hitting a
# NameError.
# NOTE(review): the recorded run failed here with "Device or resource busy",
# presumably because the training process already holds the TPU - confirm.
tpu_devices = []
try:
    tpu_devices = tf.config.list_logical_devices('TPU')
    print(f"✅ TPU设备: {len(tpu_devices)}个")
    for i, device in enumerate(tpu_devices):
        print(f"   TPU:{i} -> {device.name}")
except Exception as e:
    print(f"❌ TPU检查失败: {e}")

# --- Host CPU and memory -----------------------------------------------------
try:
    cpu_percent = psutil.cpu_percent(interval=1)  # blocks for one second
    memory = psutil.virtual_memory()
    used_gib = memory.used // BYTES_PER_GIB
    total_gib = memory.total // BYTES_PER_GIB
    print(f"💻 CPU使用率: {cpu_percent:.1f}%")
    print(f"💾 内存使用: {memory.percent:.1f}% ({used_gib}GB/{total_gib}GB)")
except Exception as e:
    print(f"❌ 系统资源检查失败: {e}")

# --- Python processes --------------------------------------------------------
try:
    # Pass 1: find the Python processes and prime their CPU counters. The first
    # cpu_percent() call on a process only records a baseline and returns 0.0.
    candidates = []
    for proc in psutil.process_iter(['pid', 'name']):
        # `name` is None when psutil could not read it (e.g. access denied).
        if 'python' not in (proc.info['name'] or '').lower():
            continue
        try:
            proc.cpu_percent(None)
        except psutil.Error:
            continue  # the process exited or is inaccessible
        candidates.append(proc)

    time.sleep(PROCESS_SAMPLE_SECONDS)

    # Pass 2: read the usage accumulated over the sampling window.
    python_processes = []
    for proc in candidates:
        try:
            python_processes.append({
                'pid': proc.pid,
                'cpu_percent': proc.cpu_percent(None),
                'memory_percent': proc.memory_percent(),
            })
        except psutil.Error:
            continue  # the process exited during the sampling window

    # Show the busiest processes first rather than the lowest PIDs, so the
    # process that actually consumes resources is part of the short list.
    python_processes.sort(
        key=lambda info: (info['cpu_percent'], info['memory_percent']),
        reverse=True,
    )

    print(f"🐍 Python进程: {len(python_processes)}个")
    for info in python_processes[:TOP_N_PROCESSES]:
        print(f"   PID:{info['pid']} CPU:{info['cpu_percent']:.1f}% MEM:{info['memory_percent']:.1f}%")
except Exception as e:
    print(f"❌ 进程检查失败: {e}")

# --- TPU smoke test ----------------------------------------------------------
try:
    print("🧪 TPU连接测试...")
    if tpu_devices:
        # A 1x1 matmul is enough to prove the device accepts and runs work.
        with tf.device('/TPU:0'):
            x = tf.constant([[1.0]])
            result = tf.matmul(x, x)
        print(f"✅ TPU响应正常: {result.numpy()}")
    else:
        print("⚠️ 没有TPU设备可测试")
except Exception as e:
    print(f"❌ TPU测试失败: {e}")

print("=== 检查完成 ===")