#!/usr/bin/env python3
"""
TPU memory monitoring tool, intended for use during training.

Works around the fact that tf.config.experimental.get_memory_info()
does not work on TPU devices.
"""

import tensorflow as tf
import psutil


class TPUMemoryMonitor:
    """Tracks TPU availability and estimates tensor memory usage."""

    def __init__(self):
        self.tpu_devices = tf.config.list_logical_devices('TPU')
        self.baseline_memory = None
        self.peak_allocations = {}

    def get_tpu_status(self) -> str:
        """Report TPU status. Practical version that avoids get_memory_info()."""
        try:
            if not self.tpu_devices:
                return "TPU: No devices"
            num_cores = len(self.tpu_devices)

            # Probe TPU responsiveness with a tiny computation.
            try:
                with tf.device('/TPU:0'):
                    test_tensor = tf.constant([1.0, 2.0, 3.0])
                    result = tf.reduce_sum(test_tensor)
                    _ = result.numpy()  # Force execution.
                activity = "active"
            except Exception:
                activity = "inactive"

            # Report host memory as a reference point.
            try:
                memory = psutil.virtual_memory()
                host_mem = f"Host:{memory.percent:.1f}%"
            except Exception:
                host_mem = "Host:unknown"

            return f"TPU: {num_cores}cores {activity} {host_mem}"
        except Exception as e:
            return f"TPU: error({str(e)[:20]})"

    def estimate_tensor_memory(self, tensor_shape, dtype=tf.float32):
        """Estimate the memory footprint of a tensor, in MB."""
        if dtype in (tf.float32, tf.int32):
            bytes_per_element = 4
        elif dtype in (tf.float16, tf.bfloat16):
            bytes_per_element = 2
        elif dtype == tf.int64:
            bytes_per_element = 8
        else:
            bytes_per_element = 4  # Default.

        total_elements = 1
        for dim in tensor_shape:
            total_elements *= dim
        total_bytes = total_elements * bytes_per_element
        return total_bytes / (1024 * 1024)  # Return MB.

    def track_allocation(self, name: str, tensor_shape, dtype=tf.float32):
        """Record an allocation under `name` and return its estimated MB."""
        mb = self.estimate_tensor_memory(tensor_shape, dtype)
        self.peak_allocations[name] = self.peak_allocations.get(name, 0) + mb
        return mb

    def get_allocation_summary(self) -> str:
        """Summarize tracked allocations: total plus the top three entries."""
        if not self.peak_allocations:
            return "No allocations tracked"
        total_mb = sum(self.peak_allocations.values())
        top_3 = sorted(self.peak_allocations.items(),
                       key=lambda x: x[1], reverse=True)[:3]
        top_str = ", ".join(f"{name}:{mb:.1f}MB" for name, mb in top_3)
        return f"Tracked:{total_mb:.1f}MB Top:({top_str})"

    def test_memory_allocation_across_cores(self):
        """Test memory allocation on every TPU core."""
        print("🧪 Testing memory allocation on all TPU cores")
        print("=" * 40)

        allocations_per_core = []
        for i, device in enumerate(self.tpu_devices):
            print(f"Core {i+1}: {device.name}")
            try:
                with tf.device(device.name):
                    # Allocate test tensors of increasing size.
                    test_sizes = [
                        ([1000, 1000], "1K×1K"),
                        ([3000, 3000], "3K×3K"),
                        ([5000, 5000], "5K×5K"),
                        ([7000, 7000], "7K×7K"),
                    ]
                    core_total = 0
                    successful_allocs = []
                    for shape, desc in test_sizes:
                        try:
                            tensor = tf.ones(shape, dtype=tf.float32)
                            mb = self.estimate_tensor_memory(shape)
                            core_total += mb
                            successful_allocs.append(f"{desc}({mb:.1f}MB)")
                            # Actually use the tensor so it is not optimized away.
                            _ = tf.reduce_mean(tensor)
                        except Exception as e:
                            print(f"   {desc} failed: {str(e)[:30]}")
                            break
                    allocations_per_core.append(core_total)
                    print(f"   Successful allocations: {' + '.join(successful_allocs)}")
                    print(f"   Core total: {core_total:.1f}MB")
            except Exception as e:
                print(f"   Core {i+1} failed: {e}")
                allocations_per_core.append(0)

        # Aggregate the results.
        total_all_cores = sum(allocations_per_core)
        avg_per_core = (total_all_cores / len(self.tpu_devices)
                        if self.tpu_devices else 0)
        print("\n📊 Summary:")
        print(f"   Total allocated: {total_all_cores:.1f}MB ({total_all_cores/1024:.2f}GB)")
        print(f"   Average per core: {avg_per_core:.1f}MB ({avg_per_core/1024:.2f}GB)")

        # Guess the memory configuration from what we could allocate.
        if avg_per_core > 8000:  # > 8GB
            print("   Guess: ≥16GB per core (high-end configuration)")
        elif avg_per_core > 4000:  # > 4GB
            print("   Guess: 8-16GB per core (standard configuration)")
        elif avg_per_core > 1000:  # > 1GB
            print("   Guess: 2-8GB per core (restricted or shared)")
        else:
            print("   Guess: <2GB per core (severely restricted)")

        return allocations_per_core
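
# Illustrative helper (an assumption, not part of the original tool):
# tf.config.experimental.get_memory_info() does exist in TensorFlow, but it
# commonly raises on TPU runtimes, which is the gap this script works around.
# The default device string here is a guess and may need adjusting.
def try_native_memory_info(device: str = "TPU:0"):
    """Attempt the native memory query; return None where unsupported."""
    try:
        # On supported devices this returns {'current': ..., 'peak': ...} in bytes.
        return tf.config.experimental.get_memory_info(device)
    except Exception:
        return None
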
def test_training_memory_pattern():
    """Simulate the memory pattern of a training run."""
    print("\n🏋️ Simulated training memory pattern test")
    print("=" * 30)

    monitor = TPUMemoryMonitor()

    # Simulate the memory usage of a typical brain-to-text model.
    with tf.device('/TPU:0'):
        print("Creating simulated model components...")

        # 1. Input data (batch_size=32, seq_len=1000, features=512).
        batch_size, seq_len, features = 32, 1000, 512
        input_data = tf.random.normal([batch_size, seq_len, features])
        input_mb = monitor.track_allocation("input_data", [batch_size, seq_len, features])
        print(f"   Input data: {input_mb:.1f}MB")

        # 2. GRU weights (assume 3 layers, 256 units each).
        n_layers, n_units = 3, 256
        for layer in range(n_layers):
            # A GRU has 3 gates, each needing its own weight matrix.
            weight_shape = [features if layer == 0 else n_units, n_units * 3]
            weights = tf.random.normal(weight_shape)
            weight_mb = monitor.track_allocation(f"gru_layer_{layer}", weight_shape)
            print(f"   GRU layer {layer+1} weights: {weight_mb:.1f}MB")

        # 3. Output projection (n_units -> n_classes=41).
        n_classes = 41
        output_weights = tf.random.normal([n_units, n_classes])
        output_mb = monitor.track_allocation("output_projection", [n_units, n_classes])
        print(f"   Output projection: {output_mb:.1f}MB")

        # 4. Intermediate activations (forward pass).
        hidden_states = tf.random.normal([batch_size, seq_len, n_units])
        hidden_mb = monitor.track_allocation("hidden_states", [batch_size, seq_len, n_units])
        print(f"   Hidden states: {hidden_mb:.1f}MB")

        # 5. Gradients (the backward pass roughly doubles parameter memory).
        total_params_mb = sum(v for k, v in monitor.peak_allocations.items()
                              if 'layer' in k or 'projection' in k)
        gradient_mb = total_params_mb  # Gradient memory ~ parameter memory.
        print(f"   Gradient memory: {gradient_mb:.1f}MB (estimated)")

        print(f"\nEstimated total model memory: {monitor.get_allocation_summary()}")

        # Run a real computation so the memory is actually allocated.
        result = tf.reduce_mean(input_data) + tf.reduce_mean(hidden_states)
        print(f"Verification result: {result.numpy():.4f}")


if __name__ == "__main__":
    print("🚀 TPU memory monitoring tool starting")

    monitor = TPUMemoryMonitor()

    # Basic status check.
    print(f"Current TPU status: {monitor.get_tpu_status()}")

    # Test all cores.
    print("\n" + "=" * 50)
    core_allocations = monitor.test_memory_allocation_across_cores()

    # Training memory pattern test.
    print("\n" + "=" * 50)
    test_training_memory_pattern()

    print("\n🎯 Key findings:")
    if core_allocations:
        nonzero = [x for x in core_allocations if x > 0]
        max_core = max(core_allocations)
        min_core = min(nonzero) if nonzero else 0  # Guard: all cores may have failed.
        print(f"   Largest single-core allocation: {max_core:.1f}MB")
        print(f"   Smallest single-core allocation: {min_core:.1f}MB")
        if max_core > 9000:  # Your earlier test reached 9.4GB.
            print("   ✅ Memory is plentiful; large-model training should fit")
        elif max_core > 5000:
            print("   ⚠️ Memory is moderate; consider shrinking the model")
        else:
            print("   ❌ Memory is insufficient; cut model parameters substantially")

    print("\n💡 Regarding your training stall issue:")
    print("   - SetPriority errors are usually an XLA compilation problem, not memory")
    print("   - Your 9.4GB test shows TPU memory is working normally")
    print("   - Check the model for ops that stall XLA compilation")
    print("   - Consider simpler ops or disabling some XLA optimizations")
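
# --- Usage sketch (illustrative; `train_step` and `dataset` are hypothetical
# placeholders, not defined in this file) ---
# How the monitor might be wired into an existing training loop: track the
# footprint of each incoming batch and log TPU status every 100 steps.
#
#   monitor = TPUMemoryMonitor()
#   for step, batch in enumerate(dataset):
#       monitor.track_allocation("batch", batch.shape, batch.dtype)
#       loss = train_step(batch)
#       if step % 100 == 0:
#           print(f"step {step}: loss={loss:.4f} | {monitor.get_tpu_status()} "
#                 f"| {monitor.get_allocation_summary()}")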