随着区块链、人工智能的盛行,越来越多的场景开始使用GPU,而其监控也随之受到重视。目前生产环境中大部分GPU为NVIDIA厂商,今天就聊聊NVIDIA如何进行GPU的监控。

NVML

NVIDIA Management Library 是英伟达提供用于监控、管理GPU的API,底层是用C实现。我们常用的nvidia-smi命令也是基于该库进行的封装。官方同时也提供了perl、python版本的库。

[root@host104722317 ~]# nvidia-smi
Tue Sep  8 10:16:02 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P40           Off  | 00000000:84:00.0 Off |                    0 |
| N/A   34C    P0    49W / 250W |   7083MiB / 22912MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P40           Off  | 00000000:88:00.0 Off |                    0 |
| N/A   31C    P0    50W / 250W |  12843MiB / 22912MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      8336      C   ./image_ds                                   193MiB |
|    0     13531      C   ./image_ds                                   193MiB |
|    0     17306      C   ./image_ds                                   193MiB |
|    0     36354      C   ...entCS/imagesearch/cs/cs01_self/image_cs  1809MiB |
|    0     37691      C   ...rc/trandeploy/imagesearch/as00/image_as  1561MiB |
|    0     37714      C   ...rc/trandeploy/imagesearch/as01/image_as  1561MiB |
|    0     37738      C   ...rc/trandeploy/imagesearch/as02/image_as  1561MiB |
|    1      8336      C   ./image_ds                                   411MiB |
|    1     13531      C   ./image_ds                                   411MiB |
|    1     17306      C   ./image_ds                                   411MiB |
|    1     36376      C   ...entCS/imagesearch/cs/cs11_self/image_cs  1809MiB |
|    1     37622      C   ...rc/trandeploy/imagesearch/as10/image_as  1561MiB |
|    1     37645      C   ...rc/trandeploy/imagesearch/as11/image_as  4401MiB |
|    1     37668      C   ...rc/trandeploy/imagesearch/as12/image_as  1561MiB |
|    1     38160      C   ...entDS/commentVideoDS/ds01_self/image_ds  1131MiB |
|    1     38181      C   ...entDS/commentVideoDS/ds11_self/image_ds  1131MiB |
+-----------------------------------------------------------------------------+

监控项

常规有如下几个监控项,基本可以覆盖大部分应用场景。

  • 温度
  • GPU利用率
  • 内存总量
  • 已分配内存
  • 内存利用率 内存利用率跟已分配内存是有区别的,我理解只要有任务被分配到GPU就会分配内存,而分配的内存究竟有没有被使用才是内存利用率。根据官方的描述:内存利用率是指在每个采集周期内(可能在1/6至1秒)读写占用的时间百分比。
  • 电源使用情况
  • 风扇速度 风扇并不是所有GPU都有
  • GPU数量
  • GPU平均利用率
  • GPU平均内存利用率

实现

需要借助第三方库github.com/mindprince/gonvml,利用cgo的特性直接使用官方的API。

这个方案是在Linux环境下,依赖libnvidia-ml.so.1库。这个库主要包含2个文件

  • nvml.h的存在让我们不依赖于构建环境中存在的NVML。
  • bindings.go是调用NVML函数的cgo桥。 bindings.go中的cgo前导代码使用dlopen动态加载NVML并使其功能可用。

详细的代码直接到github看就可以,不在这里贴出了。

Agent相关代码

package funcs

import (
    "log"

    "github.com/mindprince/gonvml"
    "github.com/open-falcon/falcon-plus/common/model"
)

// 需要load libnvidia-ml.so.1库
func GpuMetrics() (L []*model.MetricValue) {

    if err := gonvml.Initialize(); err != nil {
        log.Println("Initialize error: ", err)
        return
    }

    defer gonvml.Shutdown()

    count, err := gonvml.DeviceCount()
    if err != nil {
        log.Println("DeviceCount error: ", err)
        return
    }

    if count == 0 {
        return
    }

    temperature := uint(0)
    totalMemory := uint64(0)
    usedMemory := uint64(0)
    gpuUtilization := uint(0)
    memoryUtilization := uint(0)
    powerUsage := uint(0)
    allUtilization := uint(0)
    allMemoryUtilization := uint(0)

    for i := 0; i < int(count); i++ {
        dev, err := gonvml.DeviceHandleByIndex(uint(i))
        if err != nil {
            log.Println("DeviceHandleByIndex error:", err)
            continue
        }

        uuid, err := dev.UUID()
        if err != nil {
            log.Println("dev.UUID error", err)
        }

        tag := "uuid=" + uuid

        // 不是所有gpu都有风扇
        fanSpeed, err := dev.FanSpeed()
        if err != nil {
            log.Println("dev.FanSpeed error: ", err)
        } else {
            L = append(L, GaugeValue("gpu.fan.speed", fanSpeed, tag))
        }

        temperature, err = dev.Temperature()
        if err != nil {
            log.Println("dev.Temperature error: ", err)
            continue
        }

        totalMemory, usedMemory, err = dev.MemoryInfo()
        if err != nil {
            log.Println("dev.MemoryInfo error: ", err)
            continue
        }

        // 单位换算为兆
        totalBillion := float64(totalMemory / 1024 / 1024)
        usedBillion := float64(usedMemory / 1024 / 1024)

        gpuUtilization, memoryUtilization, err = dev.UtilizationRates()
        if err != nil {
            log.Println("dev.UtilizationRates error: ", err)
            continue
        }

        allUtilization += gpuUtilization
        allMemoryUtilization += memoryUtilization

        powerUsage, err = dev.PowerUsage()
        if err != nil {
            log.Println("dev.PowerUsage error: ", err)
        }

        // 单位换算为瓦特
        powerWatt := float64(powerUsage / 1000)

        L = append(L, GaugeValue("gpu.temperature", temperature, tag))
        L = append(L, GaugeValue("gpu.memory.total", totalBillion, tag))
        L = append(L, GaugeValue("gpu.memory.used", usedBillion, tag))
        L = append(L, GaugeValue("gpu.memory.util", memoryUtilization, tag))
        L = append(L, GaugeValue("gpu.util", gpuUtilization, tag))
        L = append(L, GaugeValue("gpu.power.usage", powerWatt, tag))
    }

    L = append(L, GaugeValue("gpu.count", count))
    L = append(L, GaugeValue("gpu.util.avg", allUtilization/count))
    L = append(L, GaugeValue("gpu.memory.util.avg", allMemoryUtilization/count))
    return L
}