Skip to content

Commit f9cbc3b

Browse files
authored
have both 7 and 30 day slo error budgets (#1187)
Signed-off-by: Kenny Leung <[email protected]>
1 parent 8ee99ce commit f9cbc3b

File tree

1 file changed

+42
-29
lines changed

1 file changed

+42
-29
lines changed

modules/slo/main.tf

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
locals {
2-
region_filter = "one_of(${join(", ", formatlist("%q", var.regions))})"
2+
rolling_periods = {
3+
"07" = 7
4+
"30" = 30
5+
}
6+
region_rolling_period_map = {
7+
for pair in setproduct(var.regions, keys(local.rolling_periods)) :
8+
"${pair[0]}-${pair[1]}" => {
9+
region = pair[0]
10+
rolling_period_key = pair[1]
11+
rolling_period_days = local.rolling_periods[pair[1]]
12+
}
13+
}
314
}
415

516
resource "google_monitoring_custom_service" "this" {
@@ -8,13 +19,15 @@ resource "google_monitoring_custom_service" "this" {
819
}
920

1021
resource "google_monitoring_slo" "success_cr" {
22+
for_each = local.rolling_periods
23+
1124
service = google_monitoring_custom_service.this.service_id
12-
slo_id = "${var.service_name}-success-multi-region"
13-
display_name = "${var.service_name} - Multi-region Success Cloud Run SLO"
25+
slo_id = "${var.service_name}-success-multi-region-${each.key}d"
26+
display_name = "${var.service_name} - Multi-region Success ${each.key}d Cloud Run SLO"
1427
project = var.project_id
1528

1629
goal = var.slo.success.multi_region_goal
17-
rolling_period_days = 30
30+
rolling_period_days = each.value
1831

1932
request_based_sli {
2033
good_total_ratio {
@@ -37,15 +50,15 @@ resource "google_monitoring_slo" "success_cr" {
3750
}
3851

3952
resource "google_monitoring_slo" "success_cr_per_region" {
40-
for_each = toset(var.regions)
53+
for_each = local.region_rolling_period_map
4154

4255
service = google_monitoring_custom_service.this.service_id
43-
slo_id = "${var.service_name}-success-${each.value}"
44-
display_name = "${var.service_name} - ${each.key} Success Cloud Run SLO"
56+
slo_id = "${var.service_name}-success-${each.value.region}-${each.value.rolling_period_key}d"
57+
display_name = "${var.service_name} - ${each.value.region} Success ${each.value.rolling_period_key}d Cloud Run SLO"
4558
project = var.project_id
4659

4760
goal = var.slo.success.per_region_goal
48-
rolling_period_days = 30
61+
rolling_period_days = each.value.rolling_period_days
4962

5063
request_based_sli {
5164
good_total_ratio {
@@ -69,15 +82,15 @@ resource "google_monitoring_slo" "success_cr_per_region" {
6982
}
7083

7184
resource "google_monitoring_slo" "success_gclb" {
72-
count = var.slo.monitor_gclb ? 1 : 0
85+
for_each = var.slo.monitor_gclb ? local.rolling_periods : {}
7386

7487
service = google_monitoring_custom_service.this.service_id
75-
slo_id = "${var.service_name}-success-gclb-multi-region"
76-
display_name = "${var.service_name} - Multi-region Success GCLB SLO"
88+
slo_id = "${var.service_name}-success-gclb-multi-region-${each.key}d"
89+
display_name = "${var.service_name} - Multi-region Success ${each.key}d GCLB SLO"
7790
project = var.project_id
7891

7992
goal = var.slo.success.multi_region_goal
80-
rolling_period_days = 30
93+
rolling_period_days = each.value
8194

8295
request_based_sli {
8396
good_total_ratio {
@@ -101,18 +114,18 @@ resource "google_monitoring_slo" "success_gclb" {
101114

102115
# Alert policy for multi-region success SLO burn rate
103116
resource "google_monitoring_alert_policy" "slo_burn_rate_multi_region" {
104-
count = var.slo.enable_alerting ? 1 : 0
117+
for_each = var.slo.enable_alerting ? local.rolling_periods : {}
105118

106-
display_name = "${var.service_name} - Multi-region SLO Burn Rate Alert"
119+
display_name = "${var.service_name} - Multi-region ${each.key}d SLO Burn Rate Alert"
107120
project = var.project_id
108121
combiner = "OR"
109122

110123
conditions {
111-
display_name = "Multi-region SLO burn rate too high"
124+
display_name = "Multi-region ${each.key}d SLO burn rate too high"
112125

113126
condition_threshold {
114127
filter = <<-EOT
115-
select_slo_burn_rate("${google_monitoring_slo.success_cr.id}", "60m")
128+
select_slo_burn_rate("${google_monitoring_slo.success_cr[each.key].id}", "60m")
116129
EOT
117130

118131
duration = "0s"
@@ -136,7 +149,7 @@ resource "google_monitoring_alert_policy" "slo_burn_rate_multi_region" {
136149
content = <<-EOT
137150
The multi-region SLO for ${var.service_name} is burning through its error budget too quickly.
138151
139-
Current SLO target: ${var.slo.success.multi_region_goal * 100}% success over 30 days
152+
Current SLO target: ${var.slo.success.multi_region_goal * 100}% success over ${each.value} days
140153
Monitored regions: us-central1, us-west1, us-east1
141154
142155
Please investigate the Cloud Run service for errors or performance issues across regions.
@@ -147,18 +160,18 @@ resource "google_monitoring_alert_policy" "slo_burn_rate_multi_region" {
147160

148161
# Alert policies for per-region success SLO burn rates
149162
resource "google_monitoring_alert_policy" "slo_burn_rate_per_region" {
150-
for_each = var.slo.enable_alerting ? toset(var.regions) : []
163+
for_each = var.slo.enable_alerting ? local.region_rolling_period_map : {}
151164

152-
display_name = "${var.service_name} - ${each.value} SLO Burn Rate Alert"
165+
display_name = "${var.service_name} - ${each.value.region} ${each.value.rolling_period_key}d SLO Burn Rate Alert"
153166
project = var.project_id
154167
combiner = "OR"
155168

156169
conditions {
157-
display_name = "${each.value} SLO burn rate too high"
170+
display_name = "${var.service_name} ${each.value.region} ${each.value.rolling_period_key}d SLO burn rate too high"
158171

159172
condition_threshold {
160173
filter = <<-EOT
161-
select_slo_burn_rate("${google_monitoring_slo.success_cr_per_region[each.value].id}", "60m")
174+
select_slo_burn_rate("${google_monitoring_slo.success_cr_per_region[each.key].id}", "60m")
162175
EOT
163176

164177
duration = "0s"
@@ -180,30 +193,30 @@ resource "google_monitoring_alert_policy" "slo_burn_rate_per_region" {
180193

181194
documentation {
182195
content = <<-EOT
183-
The SLO for ${var.service_name} in ${each.value} is burning through its error budget too quickly.
196+
The SLO for ${var.service_name} in ${each.value.region} is burning through its error budget too quickly.
184197
185-
Current SLO target: ${var.slo.success.per_region_goal * 100}% success over 30 days
198+
Current SLO target: ${var.slo.success.per_region_goal * 100}% success over ${each.value.rolling_period_days} days
186199
187-
Please investigate the Cloud Run service in ${each.value} for errors or performance issues.
200+
Please investigate the Cloud Run service in ${each.value.region} for errors or performance issues.
188201
EOT
189202
mime_type = "text/markdown"
190203
}
191204
}
192205

193206
# Alert policy for success GCLB SLO burn rate
194207
resource "google_monitoring_alert_policy" "slo_burn_rate_gclb" {
195-
count = var.slo.enable_alerting && var.slo.monitor_gclb ? 1 : 0
208+
for_each = var.slo.enable_alerting && var.slo.monitor_gclb ? local.rolling_periods : {}
196209

197-
display_name = "${var.service_name} - GCLB SLO Burn Rate Alert"
210+
display_name = "${var.service_name} - GCLB ${each.key}d SLO Burn Rate Alert"
198211
project = var.project_id
199212
combiner = "OR"
200213

201214
conditions {
202-
display_name = "GCLB SLO burn rate too high"
215+
display_name = "GCLB ${each.key}d SLO burn rate too high"
203216

204217
condition_threshold {
205218
filter = <<-EOT
206-
select_slo_burn_rate("${google_monitoring_slo.success_gclb[0].id}", "60m")
219+
select_slo_burn_rate("${google_monitoring_slo.success_gclb[each.key].id}", "60m")
207220
EOT
208221

209222
duration = "0s"
@@ -227,7 +240,7 @@ resource "google_monitoring_alert_policy" "slo_burn_rate_gclb" {
227240
content = <<-EOT
228241
The GCLB SLO for ${var.service_name} is burning through its error budget too quickly.
229242
230-
Current SLO target: ${var.slo.success.multi_region_goal * 100}% success over 30 days
243+
Current SLO target: ${var.slo.success.multi_region_goal * 100}% success over ${each.value} days
231244
232245
Please investigate the Cloud Run service for errors or performance issues across regions.
233246
EOT

0 commit comments

Comments
 (0)