- install docker
- install nvidia_gpu_exporter
From https://github.com/utkuozdemir/nvidia_gpu_exporter
# Launch the GPU exporter as a detached container named nvidia_smi_exporter.
# The container is given the NVIDIA control device and GPU 0, the host's
# libnvidia-ml libraries and nvidia-smi binary (bind-mounted at the same
# paths), and its port 9835 is published on the host.
docker run --detach \
  --name nvidia_smi_exporter \
  --restart unless-stopped \
  --device /dev/nvidiactl:/dev/nvidiactl \
  --device /dev/nvidia0:/dev/nvidia0 \
  --volume /usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so \
  --volume /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \
  --volume /usr/bin/nvidia-smi:/usr/bin/nvidia-smi \
  --publish 9835:9835 \
  utkuozdemir/nvidia_gpu_exporter:1.2.0
- install prometheus
From https://prometheus.io/docs/prometheus/latest/installation/
# Create a named Docker volume, prometheus-data, to hold Prometheus' data;
# the `docker run` command for Prometheus further down mounts it at /prometheus.
docker volume create prometheus-data
# Prometheus configuration: save as /root/monitor/prometheus.yml, the host path
# that the `docker run` command for Prometheus bind-mounts into the container.
# Indentation is significant: nested keys must stay indented under their parent.
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# Scrape configurations: Prometheus itself and the NVIDIA GPU exporter.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  # GPU exporter, reached through the Docker gateway IP on the port the
  # exporter container publishes on the host (9835).
  - job_name: "GPU"
    static_configs:
      - targets: ["172.17.0.1:9835"]
# Start the Prometheus container in the background, published on host port 9090.
# It is named and given the same restart policy as the exporter container, so it
# can be managed by name (docker logs/stop/rm prometheus) and comes back after
# a Docker daemon or host restart unless it was stopped explicitly.
# NOTE: /root/monitor/prometheus.yml must already exist as a file. With -v,
# Docker creates a missing host path as an empty *directory*, and Prometheus
# then fails to start because its config path is not a file.
docker run -d \
  --name prometheus \
  --restart unless-stopped \
  -p 9090:9090 \
  -v /root/monitor/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v prometheus-data:/prometheus \
  prom/prometheus
- install Grafana
From https://grafana.com/docs/grafana/latest/setup-grafana/installation/docker/
# Create a named Docker volume, grafana-storage, so the data Grafana keeps in
# /var/lib/grafana outlives the container.
docker volume create grafana-storage
# Start Grafana in the background, published on host port 3000. The restart
# policy matches the exporter container, so Grafana comes back after a Docker
# daemon or host restart unless it was stopped explicitly.
docker run -d -p 3000:3000 --name=grafana \
  --restart unless-stopped \
  --volume grafana-storage:/var/lib/grafana \
  grafana/grafana-enterprise
- Grafana settings
First, log in with the default credentials admin/admin.
Second, add Prometheus as a data source and set the "Prometheus server URL" field
to http://172.17.0.1:9090
(the Docker gateway IP).
Third, import a dashboard by entering 14574 in the ID field.