-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
35 lines (29 loc) · 797 Bytes
/
config.yaml
File metadata and controls
35 lines (29 loc) · 797 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# MyGPU Configuration
# Cluster identity and the list of nodes to monitor.
cluster:
  # NOTE(review): this key was previously defined twice ("My GPU Cluster",
  # then "Prototype Cluster Monitor"). Duplicate keys are invalid YAML and
  # most parsers silently keep the last one, so the last value is retained.
  name: "Prototype Cluster Monitor"
  nodes: []  # Add nodes for multi-node monitoring
  # Example for future multi-node setup (replace the `[]` above):
  # nodes:
  #   - hostname: gpu-server-01
  #     ssh_user: admin
  #   - hostname: gpu-server-02
  #     ssh_user: admin
# Sampling cadence and how long collected history is kept.
monitoring:
  interval_seconds: 5
  # 168 h = 7 * 24 h = 1 week; the original note marks this value as
  # subject to change.
  history_retention_hours: 168
# Alert thresholds.
alerts:
  # GPU alerts
  gpu_temperature_warn: 80      # Warning at 80°C
  gpu_temperature_critical: 90  # Critical at 90°C
  gpu_memory_usage_warn: 90     # Warning at 90% memory
  # Alert if GPU idle; presumably a utilization percentage - TODO confirm.
  gpu_utilization_low: 10

  # System alerts (percentages)
  cpu_usage_warn: 90
  memory_usage_warn: 90
  disk_usage_warn: 90
# Web server bind address and port.
web:
  # "0.0.0.0" is the IPv4 wildcard address: the server listens on every
  # network interface. NOTE(review): use "127.0.0.1" for local-only access
  # if the dashboard should not be reachable from the network.
  host: "0.0.0.0"
  port: 8090
# Metrics storage backend.
storage:
  type: sqlite
  # Relative path; presumably resolved against the process working
  # directory - TODO confirm, or switch to an absolute path.
  path: ./metrics.db