|
|
@@ -1,12 +1,15 @@
|
|
|
+import os
|
|
|
import psutil
|
|
|
import time
|
|
|
from datetime import datetime
|
|
|
-import os
|
|
|
import json
|
|
|
import csv
|
|
|
-
|
|
|
+from numba import cuda
|
|
|
+import numpy as np
|
|
|
+import pynvml
|
|
|
class Monitor:
|
|
|
def __init__(self, pid:int, alias:str=''):
|
|
|
+ self.cuda_tensor = cuda.to_device(np.arange(10000).astype(np.float32))
|
|
|
self.pid = pid
|
|
|
self.process = None
|
|
|
self.curr_disk_io_counter = None # 磁盘读写计数器
|
|
|
@@ -108,7 +111,56 @@ class Monitor:
|
|
|
self.usage['sys_net_recv'] = f'{sys_net_recv_speed:.2f} MB/s'
|
|
|
self.curr_system_net_io_counter = new_net_io_counter
|
|
|
|
|
|
-
|
|
|
def get_gpu_usage_info(self):
    """Record GPU metrics for the monitored process into ``self.usage``.

    Every NVML device is scanned for a compute process whose PID equals
    ``self.pid``. For each device where one is found, these keys are
    written (``{i}`` is the NVML device index):

    * ``gpu_{i}_mem``         -- GPU memory used by the process, in MB
      (0 when NVML does not report a value).
    * ``gpu_{i}_rate_gpu``    -- device GPU utilization as reported by
      NVML (0 when the query fails).
    * ``gpu_{i}_temperature`` -- GPU temperature reading
      (0 when the query fails).
    * ``gpu_{i}_power``       -- NVML power reading divided by 1000
      (0 when the query fails or reports nothing).

    Utilization, temperature and power are device-wide readings, not
    per-process values. Keys for devices where the process is not found
    are left untouched.

    The method is best-effort: it returns silently if the PID no longer
    exists, and any other failure is printed with a traceback instead of
    being raised. It always returns ``None``.
    """
    nvml_initialized = False
    try:
        # Nothing to sample if the monitored process is gone.
        if not psutil.pid_exists(self.pid):
            return

        # Query the GPUs through NVML (nvidia-ml-py).
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for process in processes:
                if process.pid != self.pid:
                    continue

                # Per-process GPU memory, converted from bytes to MB.
                # usedGpuMemory may be None/0 when NVML cannot report it.
                if process.usedGpuMemory:
                    memory_mb = process.usedGpuMemory / (1024 * 1024)
                else:
                    memory_mb = 0

                # Each metric below is optional: a device that does not
                # support one query must not prevent recording the others.
                try:
                    gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                except pynvml.NVMLError:
                    gpu_util = None

                try:
                    temperature = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU
                    )
                except pynvml.NVMLError:
                    temperature = 0

                try:
                    power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
                    power_watts = power_usage / 1000.0 if power_usage else 0
                except pynvml.NVMLError:
                    power_watts = 0

                device_id = f'gpu_{i}_'
                self.usage[device_id + 'mem'] = memory_mb
                self.usage[device_id + 'rate_gpu'] = gpu_util.gpu if gpu_util else 0
                self.usage[device_id + 'temperature'] = temperature
                self.usage[device_id + 'power'] = power_watts
    except Exception as e:
        print(f"GPU监控出错: {e}")
        # Print the full traceback to make GPU monitoring failures diagnosable.
        import traceback
        traceback.print_exc()
    finally:
        # Balance every successful nvmlInit() with nvmlShutdown(), even when
        # a query above failed, so NVML is not left initialised.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError as e:
                print(f"GPU监控出错: {e}")
|
|
|
def update(self, output_file=None):
|
|
|
self.usage['now_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
self.usage['status'] = self.process.status()
|
|
|
@@ -120,6 +172,8 @@ class Monitor:
|
|
|
self.get_disk_io_info()
|
|
|
# 刷新网络上下行情况
|
|
|
self.get_net_io_info()
|
|
|
+ # 获取GPU使用情况
|
|
|
+ self.get_gpu_usage_info()
|
|
|
if output_file:
|
|
|
self.data_points.append(self.usage.copy())
|
|
|
if len(self.data_points) >= self.max_count: # 如果达到100个数据点,则保存到CSV文件中,并清空数据点
|
|
|
@@ -149,8 +203,10 @@ class Monitor:
|
|
|
def main():
|
|
|
import argparse
|
|
|
parser = argparse.ArgumentParser(description='Py进程监视器')
|
|
|
- parser.add_argument('--pid', '-p', type=int, required=True, help='请输入需要监控的进程id')
|
|
|
- parser.add_argument('--output', '-o', type=str, required=True, help='请输入需要监控的进程id')
|
|
|
+ # parser.add_argument('--pid', '-p', type=int, required=True, help='请输入需要监控的进程id')
|
|
|
+ # parser.add_argument('--output', '-o', type=str, required=True, help='请输入需要监控的进程id')
|
|
|
+ parser.add_argument('--pid', '-p', type=int, default=os.getpid(), help='请输入需要监控的进程id')
|
|
|
+ parser.add_argument('--output', '-o', type=str, default='./results/psutil_output.csv', help='请输入输出文件路径')
|
|
|
args = parser.parse_args()
|
|
|
# 删除已经存在的结果
|
|
|
if os.path.exists(args.output):
|