Skip to content

Commit 2858a94

Browse files
authored
Add Prometheus rules unit tests (#3868)
This PR adds unit tests for existing alerts. Signed-off-by: Shirly Radco <[email protected]>
1 parent b787cdc commit 2858a94

File tree

5 files changed

+336
-1
lines changed

5 files changed

+336
-1
lines changed

Makefile

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ build-functest: ## Build the functional tests (content of tests/ subdirectory)
106106
test: test-unit test-functional test-lint ## execute all tests (_NOTE:_ 'WHAT' is expected to match the go cli pattern for paths e.g. './pkg/...'. This differs slightly from rest of the 'make' targets)
107107

108108
test-unit: WHAT = ./pkg/... ./cmd/...
109-
test-unit: ## Run unit tests.
109+
test-unit: prom-rules-verify ## Run unit tests.
110110
${DO} "ACK_GINKGO_DEPRECATIONS=${ACK_GINKGO_DEPRECATIONS} ./hack/build/run-unit-tests.sh ${WHAT}"
111111

112112
test-functional: WHAT = ./tests/...
@@ -137,6 +137,20 @@ builder-push: ## Build and push the builder container image, declared in docker/
137137
openshift-ci-image-push: ## Build and push the OpenShift CI build+test container image, declared in hack/ci/Dockerfile.ci
138138
./hack/build/osci-image-builder.sh
139139

140+
# Name of the helper binary that dumps the Prometheus rule spec; it is built
# into ./hack. The value is deliberately unquoted: make keeps quotes as part
# of the value, and the recipes below already quote the full path.
rule-spec-dumper-executable := rule-spec-dumper

# None of these targets produce a file named after the target.
.PHONY: build-prom-spec-dumper clean-prom-spec-dumper prom-rules-verify

build-prom-spec-dumper: ## Build the Prometheus rule spec dumper used by prom-rules-verify
	${DO} "go build -o ./hack/${rule-spec-dumper-executable} ./hack/prom-rule-ci/rule-spec-dumper.go"

clean-prom-spec-dumper: ## Remove the Prometheus rule spec dumper binary
	rm -f ./hack/${rule-spec-dumper-executable}

# The whole recipe runs in one shell so the dumper binary is removed even when
# verification fails, while the verification exit status is still propagated.
prom-rules-verify: build-prom-spec-dumper ## Lint and unit-test the Prometheus rules
	status=0; \
	./hack/prom-rule-ci/verify-rules.sh \
		"./hack/${rule-spec-dumper-executable}" \
		"./hack/prom-rule-ci/prom-rules-tests.yaml" || status=$$?; \
	rm -f "./hack/${rule-spec-dumper-executable}"; \
	exit $$status
153+
140154
##@ Local cluster management
141155
cluster-up: ## Start a default Kubernetes or Open Shift cluster. set KUBEVIRT_PROVIDER environment variable to either 'k8s-1.18' or 'os-3.11.0' to select the type of cluster. set KUBEVIRT_NUM_NODES to something higher than 1 to have more than one node.
142156
./cluster-up/up.sh

hack/prom-rule-ci/BUILD.bazel

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")

# Library target wrapping rule-spec-dumper.go; only visible inside this package.
go_library(
    name = "go_default_library",
    srcs = ["rule-spec-dumper.go"],
    importpath = "kubevirt.io/kubevirt/hack/prom-rule-ci",
    visibility = ["//visibility:private"],
    deps = ["//pkg/monitoring/rules:go_default_library"],
)

# Executable target that embeds the library above.
go_binary(
    name = "prom-rule-ci",
    embed = [":go_default_library"],
    visibility = ["//visibility:public"],
)
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
---
2+
rule_files:
3+
- /tmp/rules.verify
4+
5+
group_eval_order:
6+
- recordingRules.rules
7+
- alerts.rules
8+
# Information about this format can be found at: https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/
9+
tests:
10+
# CDINotReady should fire when kubevirt_cdi_cr_ready == 0 for 5m
11+
- interval: 1m
12+
input_series:
13+
- series: 'kubevirt_cdi_cr_ready'
14+
values: "0 0 0 0 0 0 0 1 1"
15+
alert_rule_test:
16+
# must not trigger before the 5m for-window
17+
- eval_time: 4m
18+
alertname: CDINotReady
19+
exp_alerts: []
20+
# must trigger after the for-window elapses with continuous zeros
21+
- eval_time: 6m
22+
alertname: CDINotReady
23+
exp_alerts:
24+
- exp_labels:
25+
severity: warning
26+
operator_health_impact: critical
27+
kubernetes_operator_part_of: kubevirt
28+
kubernetes_operator_component: containerized-data-importer
29+
exp_annotations:
30+
summary: CDI is not available to use
31+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDINotReady
32+
# must not trigger when healthy (value 1)
33+
- eval_time: 8m
34+
alertname: CDINotReady
35+
exp_alerts: []
36+
37+
38+
39+
# CDIDataImportCronOutdated should fire when any cron is outdated (pending="false") for 15 minutes
40+
- interval: 1m
41+
input_series:
42+
- series: 'kubevirt_cdi_dataimportcron_outdated{pending="false", ns="user-ns", cron_name="cron-a"}'
43+
values: "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
44+
alert_rule_test:
45+
# Must not trigger before the 15m for-window elapses
46+
- eval_time: 14m
47+
alertname: CDIDataImportCronOutdated
48+
exp_alerts: []
49+
# Must trigger after the for-window elapses
50+
- eval_time: 16m
51+
alertname: CDIDataImportCronOutdated
52+
exp_alerts:
53+
- exp_labels:
54+
severity: info
55+
operator_health_impact: warning
56+
kubernetes_operator_part_of: kubevirt
57+
kubernetes_operator_component: containerized-data-importer
58+
ns: user-ns
59+
cron_name: cron-a
60+
exp_annotations:
61+
summary: DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule
62+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIDataImportCronOutdated
63+
64+
# CDIDataImportCronOutdated must NOT fire for pending="true"
65+
- interval: 1m
66+
input_series:
67+
- series: 'kubevirt_cdi_dataimportcron_outdated{pending="true", ns="user-ns", cron_name="cron-b"}'
68+
values: "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
69+
alert_rule_test:
70+
- eval_time: 20m
71+
alertname: CDIDataImportCronOutdated
72+
exp_alerts: []
73+
74+
# CDIOperatorDown should fire when kubevirt_cdi_operator_up == 0 for 5m
75+
- interval: 1m
76+
input_series:
77+
- series: 'kubevirt_cdi_operator_up'
78+
values: "0 0 0 0 0 0 0"
79+
alert_rule_test:
80+
- eval_time: 6m
81+
alertname: CDIOperatorDown
82+
exp_alerts:
83+
- exp_labels:
84+
severity: warning
85+
operator_health_impact: critical
86+
kubernetes_operator_part_of: kubevirt
87+
kubernetes_operator_component: containerized-data-importer
88+
exp_annotations:
89+
summary: CDI operator is down
90+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIOperatorDown
91+
92+
# CDIDataVolumeUnusualRestartCount should fire when any population restart metric > 0 for 5m
93+
- interval: 1m
94+
input_series:
95+
- series: 'kube_pod_container_status_restarts_total{pod="importer-1",container="importer"}'
96+
values: "4 4 4 4 4 4"
97+
alert_rule_test:
98+
- eval_time: 6m
99+
alertname: CDIDataVolumeUnusualRestartCount
100+
exp_alerts:
101+
- exp_labels:
102+
severity: warning
103+
operator_health_impact: none
104+
kubernetes_operator_part_of: kubevirt
105+
kubernetes_operator_component: containerized-data-importer
106+
exp_annotations:
107+
summary: Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated
108+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIDataVolumeUnusualRestartCount
109+
110+
# CDIStorageProfilesIncomplete should fire when any storageprofile is incomplete for 5m
111+
- interval: 1m
112+
input_series:
113+
- series: 'kubevirt_cdi_storageprofile_info{complete="false",storageclass="sc1",provisioner="prov"}'
114+
values: "1 1 1 1 1 1"
115+
alert_rule_test:
116+
- eval_time: 6m
117+
alertname: CDIStorageProfilesIncomplete
118+
exp_alerts:
119+
- exp_labels:
120+
severity: info
121+
operator_health_impact: none
122+
kubernetes_operator_part_of: kubevirt
123+
kubernetes_operator_component: containerized-data-importer
124+
storageclass: sc1
125+
provisioner: prov
126+
exp_annotations:
127+
summary: Incomplete StorageProfile sc1, accessMode/volumeMode cannot be inferred by CDI for PVC population request
128+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIStorageProfilesIncomplete
129+
130+
# CDINoDefaultStorageClass should fire when there is no default or virtdefault and a DV pending for one (the missing default is simulated by providing no kubevirt_cdi_storageprofile_info series)
131+
- interval: 1m
132+
input_series:
133+
- series: 'kubevirt_cdi_datavolume_pending'
134+
values: "1 1 1 1 1 1"
135+
alert_rule_test:
136+
- eval_time: 6m
137+
alertname: CDINoDefaultStorageClass
138+
exp_alerts:
139+
- exp_labels:
140+
severity: warning
141+
operator_health_impact: none
142+
kubernetes_operator_part_of: kubevirt
143+
kubernetes_operator_component: containerized-data-importer
144+
exp_annotations:
145+
summary: No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one
146+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDINoDefaultStorageClass
147+
148+
# CDIMultipleDefaultVirtStorageClasses should fire when more than one virtdefault=true exists for 5m
149+
- interval: 1m
150+
input_series:
151+
- series: 'kubevirt_cdi_storageprofile_info{virtdefault="true",storageclass="sc-a"}'
152+
values: "1 1 1 1 1 1"
153+
- series: 'kubevirt_cdi_storageprofile_info{virtdefault="true",storageclass="sc-b"}'
154+
values: "1 1 1 1 1 1"
155+
alert_rule_test:
156+
- eval_time: 6m
157+
alertname: CDIMultipleDefaultVirtStorageClasses
158+
exp_alerts:
159+
- exp_labels:
160+
severity: warning
161+
operator_health_impact: none
162+
kubernetes_operator_part_of: kubevirt
163+
kubernetes_operator_component: containerized-data-importer
164+
exp_annotations:
165+
summary: More than one default virtualization StorageClass detected
166+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIMultipleDefaultVirtStorageClasses
167+
168+
# CDIDefaultStorageClassDegraded should fire when default/virtdefault degraded or missing (simulated with a default="true" storage profile series that carries no other labels)
169+
- interval: 1m
170+
input_series:
171+
- series: 'kubevirt_cdi_storageprofile_info{default="true"}'
172+
values: "1 1 1 1 1 1"
173+
alert_rule_test:
174+
- eval_time: 6m
175+
alertname: CDIDefaultStorageClassDegraded
176+
exp_alerts:
177+
- exp_labels:
178+
severity: warning
179+
operator_health_impact: none
180+
kubernetes_operator_part_of: kubevirt
181+
kubernetes_operator_component: containerized-data-importer
182+
exp_annotations:
183+
summary: Default storage class has no smart clone or ReadWriteMany
184+
runbook_url: https://kubevirt.io/monitoring/runbooks/CDIDefaultStorageClassDegraded
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"os"
7+
8+
rules "kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
9+
)
10+
11+
// verifyArgs checks that args holds exactly one command-line argument in
// addition to the program name at args[0].
//
// It inspects the slice it is given rather than the global os.Args, so the
// caller decides what is validated. It returns an error stating how many
// arguments were found when that number is not 1.
func verifyArgs(args []string) error {
	// An empty slice has no program name either; report it as zero arguments.
	numOfArgs := 0
	if len(args) > 0 {
		numOfArgs = len(args) - 1
	}
	if numOfArgs != 1 {
		return fmt.Errorf("expected exactly 1 argument, got: %d", numOfArgs)
	}
	return nil
}
18+
19+
func main() {
20+
if err := verifyArgs(os.Args); err != nil {
21+
fmt.Printf("ERROR: %v\n", err)
22+
os.Exit(1)
23+
}
24+
25+
targetFile := os.Args[1]
26+
27+
if err := rules.SetupRules("ci"); err != nil {
28+
panic(err)
29+
}
30+
31+
promRule, err := rules.BuildPrometheusRule("ci")
32+
if err != nil {
33+
panic(err)
34+
}
35+
b, err := json.Marshal(promRule.Spec)
36+
if err != nil {
37+
panic(err)
38+
}
39+
40+
err = os.WriteFile(targetFile, b, 0600)
41+
if err != nil {
42+
panic(err)
43+
}
44+
45+
// Make the file world-readable so the promtool container (running as a different UID)
46+
// can read it when mounted read-only.
47+
if err := os.Chmod(targetFile, 0644); err != nil {
48+
panic(err)
49+
}
50+
}

hack/prom-rule-ci/verify-rules.sh

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/bin/bash -e
2+
#
3+
# This file is part of the KubeVirt project
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# Copyright 2020 Red Hat, Inc.
17+
#
18+
19+
# Container runtime used to run promtool. An explicitly set KUBEVIRT_CRI wins;
# otherwise podman is preferred, with docker as the fallback.
# The trailing "|| true" matters: this script runs with "bash -e", and a bare
# assignment takes the exit status of its command substitution. Without it the
# script would abort silently here when neither runtime is installed, and the
# explicit error below would never be printed.
KUBEVIRT_CRI=${KUBEVIRT_CRI:-$(command -v podman || command -v docker || true)}
if [ -z "$KUBEVIRT_CRI" ]; then
    echo "ERROR: neither podman nor docker found in PATH" >&2
    exit 1
fi

# Prometheus image whose /bin/promtool is used as the container entrypoint.
readonly PROM_IMAGE="quay.io/prometheus/prometheus:v2.44.0"
26+
27+
# Remove the given files; files that do not exist are ignored (rm -f).
#
# Arguments:
#   $@ - one or more file paths; calling with no arguments is an error
function cleanup() {
    local cleanup_files=("${@:?}")
    # Keep the loop variable local so it does not overwrite a global "file".
    local file

    for file in "${cleanup_files[@]}"; do
        # "--" stops option parsing so a path starting with "-" is still a path.
        rm -f -- "$file"
    done
}
34+
35+
# Run "promtool check rules" from the Prometheus image against a rules file.
#
# Arguments:
#   $1 - host path of the rules file; mounted read-only at /tmp/rules.verify
function lint() {
    local rules_path="${1:?}"
    local -a run_args=(
        run --rm
        --entrypoint=/bin/promtool
        -v "${rules_path}:/tmp/rules.verify:ro,Z"
        "$PROM_IMAGE"
        check rules /tmp/rules.verify
    )

    ${KUBEVIRT_CRI} "${run_args[@]}"
}
42+
43+
# Run "promtool test rules" from the Prometheus image.
#
# Arguments:
#   $1 - host path of the rules file; mounted read-only at /tmp/rules.verify
#   $2 - host path of the unit-tests file; mounted read-only at /tmp/rules.test
function unit_test() {
    local rules_path="${1:?}"
    local tests_path="${2:?}"
    local -a run_args=(
        run --rm
        --entrypoint=/bin/promtool
        -v "${tests_path}:/tmp/rules.test:ro,Z"
        -v "${rules_path}:/tmp/rules.verify:ro,Z"
        "$PROM_IMAGE"
        test rules /tmp/rules.test
    )

    ${KUBEVIRT_CRI} "${run_args[@]}"
}
53+
54+
# Entry point: dump the rules with the given dumper, print them, then lint and
# unit-test them with promtool. The temporary rules file is removed on return,
# exit or interrupt.
#
# Arguments:
#   $1 - path of the rule spec dumper executable; it is invoked with the path
#        of the file it should write
#   $2 - path of the promtool unit-tests file
function main() {
    local prom_spec_dumper="${1:?}"
    local tests_file="${2:?}"
    local target_file
    local cleanup_cmd

    # Let mktemp create the file atomically rather than only generating a name
    # (-u), so nobody else can claim the path before the dumper writes to it.
    target_file="$(mktemp --tmpdir tmp.prom_rules.XXXXX)"

    # The trap string is re-parsed by the shell when it fires, so the path is
    # shell-quoted to survive a TMPDIR containing spaces or glob characters.
    printf -v cleanup_cmd 'cleanup %q' "$target_file"
    trap "$cleanup_cmd" RETURN EXIT INT

    "$prom_spec_dumper" "$target_file"

    echo "INFO: Rules file content:"
    cat "$target_file"
    echo

    lint "$target_file"
    unit_test "$target_file" "$tests_file"
}

main "$@"

0 commit comments

Comments
 (0)