#!/usr/bin/env python3
"""
TPU memory monitoring tool, intended for use during training.

Works around the fact that tf.config.experimental.get_memory_info()
does not work on TPU devices.
"""

import tensorflow as tf
import psutil


class TPUMemoryMonitor:
    """Tracks TPU availability and estimates tensor memory usage."""

    def __init__(self):
        self.tpu_devices = tf.config.list_logical_devices('TPU')
        self.baseline_memory = None
        self.peak_allocations = {}

    def get_tpu_status(self) -> str:
        """Report TPU status. Practical version that avoids get_memory_info()."""
        try:
            if not self.tpu_devices:
                return "TPU: No devices"
            num_cores = len(self.tpu_devices)

            # Probe TPU responsiveness with a tiny computation.
            try:
                with tf.device('/TPU:0'):
                    test_tensor = tf.constant([1.0, 2.0, 3.0])
                    result = tf.reduce_sum(test_tensor)
                    _ = result.numpy()  # Force execution.
                activity = "active"
            except Exception:
                activity = "inactive"

            # Report host memory as a reference point.
            try:
                memory = psutil.virtual_memory()
                host_mem = f"Host:{memory.percent:.1f}%"
            except Exception:
                host_mem = "Host:unknown"

            return f"TPU: {num_cores}cores {activity} {host_mem}"
        except Exception as e:
            return f"TPU: error({str(e)[:20]})"

    def estimate_tensor_memory(self, tensor_shape, dtype=tf.float32):
        """Estimate the memory footprint of a tensor, in MB."""
        if dtype in (tf.float32, tf.int32):
            bytes_per_element = 4
        elif dtype in (tf.float16, tf.bfloat16):
            bytes_per_element = 2
        elif dtype == tf.int64:
            bytes_per_element = 8
        else:
            bytes_per_element = 4  # Default.

        total_elements = 1
        for dim in tensor_shape:
            total_elements *= dim
        total_bytes = total_elements * bytes_per_element
        return total_bytes / (1024 * 1024)  # Return MB.

    def track_allocation(self, name: str, tensor_shape, dtype=tf.float32):
        """Record an allocation under `name` and return its estimated MB."""
        mb = self.estimate_tensor_memory(tensor_shape, dtype)
        self.peak_allocations[name] = self.peak_allocations.get(name, 0) + mb
        return mb

    def get_allocation_summary(self) -> str:
        """Summarize tracked allocations: total plus the top three entries."""
        if not self.peak_allocations:
            return "No allocations tracked"
        total_mb = sum(self.peak_allocations.values())
        top_3 = sorted(self.peak_allocations.items(),
                       key=lambda x: x[1], reverse=True)[:3]
        top_str = ", ".join(f"{name}:{mb:.1f}MB" for name, mb in top_3)
        return f"Tracked:{total_mb:.1f}MB Top:({top_str})"

    def test_memory_allocation_across_cores(self):
        """Test memory allocation on every TPU core."""
        print("🧪 Testing memory allocation on all TPU cores")
        print("=" * 40)

        allocations_per_core = []
        for i, device in enumerate(self.tpu_devices):
            print(f"Core {i+1}: {device.name}")
            try:
                with tf.device(device.name):
                    # Allocate test tensors of increasing size.
                    test_sizes = [
                        ([1000, 1000], "1K×1K"),
                        ([3000, 3000], "3K×3K"),
                        ([5000, 5000], "5K×5K"),
                        ([7000, 7000], "7K×7K"),
                    ]
                    core_total = 0
                    successful_allocs = []
                    for shape, desc in test_sizes:
                        try:
                            tensor = tf.ones(shape, dtype=tf.float32)
                            mb = self.estimate_tensor_memory(shape)
                            core_total += mb
                            successful_allocs.append(f"{desc}({mb:.1f}MB)")
                            # Actually use the tensor so it is not optimized away.
                            _ = tf.reduce_mean(tensor)
                        except Exception as e:
                            print(f"   {desc} failed: {str(e)[:30]}")
                            break
                    allocations_per_core.append(core_total)
                    print(f"   Successful allocations: {' + '.join(successful_allocs)}")
                    print(f"   Core total: {core_total:.1f}MB")
            except Exception as e:
                print(f"   Core {i+1} failed: {e}")
                allocations_per_core.append(0)

        # Aggregate the results.
        total_all_cores = sum(allocations_per_core)
        avg_per_core = (total_all_cores / len(self.tpu_devices)
                        if self.tpu_devices else 0)
        print("\n📊 Summary:")
        print(f"   Total allocated: {total_all_cores:.1f}MB ({total_all_cores/1024:.2f}GB)")
        print(f"   Average per core: {avg_per_core:.1f}MB ({avg_per_core/1024:.2f}GB)")

        # Guess the memory configuration from what we could allocate.
        if avg_per_core > 8000:  # > 8GB
            print("   Guess: ≥16GB per core (high-end configuration)")
        elif avg_per_core > 4000:  # > 4GB
            print("   Guess: 8-16GB per core (standard configuration)")
        elif avg_per_core > 1000:  # > 1GB
            print("   Guess: 2-8GB per core (restricted or shared)")
        else:
            print("   Guess: <2GB per core (severely restricted)")

        return allocations_per_core
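
# Illustrative helper (an assumption, not part of the original tool):
# tf.config.experimental.get_memory_info() does exist in TensorFlow, but it
# commonly raises on TPU runtimes, which is the gap this script works around.
# The default device string here is a guess and may need adjusting.
def try_native_memory_info(device: str = "TPU:0"):
    """Attempt the native memory query; return None where unsupported."""
    try:
        # On supported devices this returns {'current': ..., 'peak': ...} in bytes.
        return tf.config.experimental.get_memory_info(device)
    except Exception:
        return None
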
def test_training_memory_pattern():
    """Simulate the memory pattern of a training run."""
    print("\n🏋️ Simulated training memory pattern test")
    print("=" * 30)

    monitor = TPUMemoryMonitor()

    # Simulate the memory usage of a typical brain-to-text model.
    with tf.device('/TPU:0'):
        print("Creating simulated model components...")

        # 1. Input data (batch_size=32, seq_len=1000, features=512).
        batch_size, seq_len, features = 32, 1000, 512
        input_data = tf.random.normal([batch_size, seq_len, features])
        input_mb = monitor.track_allocation("input_data", [batch_size, seq_len, features])
        print(f"   Input data: {input_mb:.1f}MB")

        # 2. GRU weights (assume 3 layers, 256 units each).
        n_layers, n_units = 3, 256
        for layer in range(n_layers):
            # A GRU has 3 gates, each needing its own weight matrix.
            weight_shape = [features if layer == 0 else n_units, n_units * 3]
            weights = tf.random.normal(weight_shape)
            weight_mb = monitor.track_allocation(f"gru_layer_{layer}", weight_shape)
            print(f"   GRU layer {layer+1} weights: {weight_mb:.1f}MB")

        # 3. Output projection (n_units -> n_classes=41).
        n_classes = 41
        output_weights = tf.random.normal([n_units, n_classes])
        output_mb = monitor.track_allocation("output_projection", [n_units, n_classes])
        print(f"   Output projection: {output_mb:.1f}MB")

        # 4. Intermediate activations (forward pass).
        hidden_states = tf.random.normal([batch_size, seq_len, n_units])
        hidden_mb = monitor.track_allocation("hidden_states", [batch_size, seq_len, n_units])
        print(f"   Hidden states: {hidden_mb:.1f}MB")

        # 5. Gradients (the backward pass roughly doubles parameter memory).
        total_params_mb = sum(v for k, v in monitor.peak_allocations.items()
                              if 'layer' in k or 'projection' in k)
        gradient_mb = total_params_mb  # Gradient memory ~ parameter memory.
        print(f"   Gradient memory: {gradient_mb:.1f}MB (estimated)")

        print(f"\nEstimated total model memory: {monitor.get_allocation_summary()}")

        # Run a real computation so the memory is actually allocated.
        result = tf.reduce_mean(input_data) + tf.reduce_mean(hidden_states)
        print(f"Verification result: {result.numpy():.4f}")


if __name__ == "__main__":
    print("🚀 TPU memory monitoring tool starting")

    monitor = TPUMemoryMonitor()

    # Basic status check.
    print(f"Current TPU status: {monitor.get_tpu_status()}")

    # Test all cores.
    print("\n" + "=" * 50)
    core_allocations = monitor.test_memory_allocation_across_cores()

    # Training memory pattern test.
    print("\n" + "=" * 50)
    test_training_memory_pattern()

    print("\n🎯 Key findings:")
    if core_allocations:
        nonzero = [x for x in core_allocations if x > 0]
        max_core = max(core_allocations)
        min_core = min(nonzero) if nonzero else 0  # Guard: all cores may have failed.
        print(f"   Largest single-core allocation: {max_core:.1f}MB")
        print(f"   Smallest single-core allocation: {min_core:.1f}MB")
        if max_core > 9000:  # Your earlier test reached 9.4GB.
            print("   ✅ Memory is plentiful; large-model training should fit")
        elif max_core > 5000:
            print("   ⚠️ Memory is moderate; consider shrinking the model")
        else:
            print("   ❌ Memory is insufficient; cut model parameters substantially")

    print("\n💡 Regarding your training stall issue:")
    print("   - SetPriority errors are usually an XLA compilation problem, not memory")
    print("   - Your 9.4GB test shows TPU memory is working normally")
    print("   - Check the model for ops that stall XLA compilation")
    print("   - Consider simpler ops or disabling some XLA optimizations")
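
# --- Usage sketch (illustrative; `train_step` and `dataset` are hypothetical
# placeholders, not defined in this file) ---
# How the monitor might be wired into an existing training loop: track the
# footprint of each incoming batch and log TPU status every 100 steps.
#
#   monitor = TPUMemoryMonitor()
#   for step, batch in enumerate(dataset):
#       monitor.track_allocation("batch", batch.shape, batch.dtype)
#       loss = train_step(batch)
#       if step % 100 == 0:
#           print(f"step {step}: loss={loss:.4f} | {monitor.get_tpu_status()} "
#                 f"| {monitor.get_allocation_summary()}")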