Skip to content

Commit 0dae8a2

Browse files
committed
feat: use monitoring script that lists memory usage along with used models
1 parent b19554f commit 0dae8a2

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

nvidia-monitor.Dockerfile

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
FROM nvcr.io/nvidia/cuda:12.4.1-runtime-ubuntu22.04
22

3-
RUN wget -qO nerdctl.tar.gz "https://github.com/containerd/nerdctl/releases/download/v1.7.6/nerdctl-1.7.6-linux-amd64.tar.gz" \
4-
&& tar xvzf /usr/local/bin nerdctl.tar.gz \
5-
&& rm nerdctl.tar.gz
3+
# Work from /root and copy the monitoring script there; the relative COPY
# destination resolves against WORKDIR, so the script lands at /root/monitor.sh
# and can be started as `./monitor.sh` from the default working directory.
# NOTE(review): COPY keeps the source file's permission bits — confirm that
# scripts/monitor.sh is committed with the executable bit set.
WORKDIR /root
4+
COPY scripts/monitor.sh monitor.sh
65

7-
# for i in $(nerdctl -a /host/run/containerd/containerd.sock -n k8s.io container ls --format "{{.ID}}"); do nerdctl -a /host/run/containerd/containerd.sock -n k8s.io inspect -f '{{.State.Pid}} {{index .Config.Labels "io.kubernetes.pod.name"}}' $i; done | grep gooey-gpu
6+
# ./monitor.sh
87
# nvidia-smi

scripts/monitor.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
# Stop at the first command that exits non-zero instead of continuing with
# partial data.
set -e
4+
5+
# Text searched for in each GPU process's environment: every whitespace-separated
# token containing "MODEL_IDS=" is matched by the grep in the main loop and the
# part after the first "=" is reported in the "Model IDs" column.
ENV_VAR_FILTER="MODEL_IDS"
6+
7+
# print3 FIRST MIDDLE LAST
# Write one fixed-width table row to stdout:
#   FIRST   left-aligned, padded to 8 characters
#   MIDDLE  left-aligned, padded to 80 characters, with one trailing comma removed
#   LAST    right-aligned, padded to 12 characters
print3() {
  local first_col="$1"
  local middle_col="${2%,}"
  local last_col="$3"

  printf "%-8s %-80s %12s\n" "$first_col" "$middle_col" "$last_col"
}
10+
11+
# Table header.
print3 "PID" "Model IDs" "GPU Mem"

# One row per compute process reported by nvidia-smi. Each input line is
# "pid, used_memory"; with `nounits` the memory field is a bare number that is
# divided by 1024 below and labelled GiB.
nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits \
  | while IFS=',' read -r pid mem; do
      # Strip the padding around the CSV fields (and any stray comma in the
      # memory field) without spawning extra processes.
      pid="${pid//[[:space:]]/}"
      mem="${mem//[[:space:],]/}"

      # Ignore blank lines rather than handing an empty PID to ps.
      [ -n "$pid" ] || continue

      # `ps e` appends the process environment to its command line. The process
      # may already have exited, or may not be visible from this PID namespace;
      # in that case keep going with an empty string so `set -e` does not abort
      # the whole listing and the GPU memory row is still printed.
      cmd_env=$(ps eww -p "$pid" -o command= 2>/dev/null) || cmd_env=""

      # Collect the value of every token containing "<ENV_VAR_FILTER>=",
      # comma-separated. The match is deliberately a substring match so that
      # prefixed variable names are picked up as well.
      val=$(printf '%s\n' "$cmd_env" | tr ' ' '\n' | grep "$ENV_VAR_FILTER=" | cut -d= -f2- | tr '\n' ',')
      if [ -z "$val" ]; then
        # Fallback: use the celery worker name from "celery@<name>:MainProcess".
        val=$(printf '%s\n' "$cmd_env" | sed -nE 's/.*celery@([^:]+):MainProcess.*/\1/p')
      fi

      # Pass the number to awk as data, not as program text, and only when it
      # really is a number; nvidia-smi can report placeholders such as "[N/A]".
      if [[ "$mem" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        mem_gib="$(awk -v mib="$mem" 'BEGIN { printf "%.2f", mib / 1024 }')GiB"
      else
        mem_gib="N/A"
      fi

      print3 "$pid" "${val%,}" "$mem_gib"
    done

0 commit comments

Comments
 (0)