Skip to content

Commit 37c8c44

Browse files
hkfgo and tomkerkhove authored
feat: Add Data Freshness Check for Self-Healing (#2694)
Co-authored-by: Tom Kerkhove <[email protected]>
1 parent b3c636e commit 37c8c44

File tree

17 files changed

+569
-18
lines changed

17 files changed

+569
-18
lines changed

.github/workflows/templates-build-push-image.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ jobs:
5050
context: ./src/
5151
file: ./src/${{ inputs.project_name }}/Dockerfile.linux
5252
tags: ${{ env.image_commit_uri }},${{ env.image_latest_uri }}
53-
push: true
53+
push: true

changelog/content/experimental/unreleased.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ version:
88

99
- {{% tag fixed %}} Azure Monitor Scraper: batch based on aggregation in addition to existing criteria
1010
- {{% tag feature %}} Azure Monitor Scraper: make query lookback range configurable in minutes
11+
- {{% tag feature %}} Azure Monitor Scraper: add health check based on data freshness
1112

1213
#### Resource Discovery
1314

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
namespace Promitor.Agents.Scraper.Configuration
{
    /// <summary>
    ///     Runtime configuration for the scraper agent's health checks.
    /// </summary>
    public class HealthCheckConfiguration
    {
        /// <summary>
        ///     Indicates whether the scraper freshness health check is enabled.
        ///     This health check monitors whether scrapes have been completed recently.
        ///     Defaults to true.
        /// </summary>
        public bool EnableScraperFreshnessHealthCheck { get; set; } = true;
    }
}

src/Promitor.Agents.Scraper/Configuration/ScraperRuntimeConfiguration.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@ public class ScraperRuntimeConfiguration : RuntimeConfiguration
1212
public MetricsConfiguration MetricsConfiguration { get; set; } = new();
1313
public MetricSinkConfiguration MetricSinks { get; set; } = new();
1414
public ResourceDiscoveryConfiguration ResourceDiscovery { get; set; }
15+
public HealthCheckConfiguration HealthCheck { get; set; } = new();
1516
}
1617
}

src/Promitor.Agents.Scraper/Docs/Open-Api.xml

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/Promitor.Agents.Scraper/Extensions/IHealthChecksBuilderExtensions.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,25 @@ public static IHealthChecksBuilder AddResourceDiscoveryHealthCheck(this IHealthC
2929

3030
return healthChecksBuilder;
3131
}
32+
33+
/// <summary>
34+
/// Add health check to validate the scraper has completed a scrape recently
35+
/// </summary>
36+
/// <param name="healthChecksBuilder">Builder for adding health checks</param>
37+
/// <param name="configuration">Configuration of Promitor</param>
38+
public static IHealthChecksBuilder AddScraperFreshnessHealthCheck(this IHealthChecksBuilder healthChecksBuilder, IConfiguration configuration)
39+
{
40+
Guard.NotNull(healthChecksBuilder, nameof(healthChecksBuilder));
41+
Guard.NotNull(configuration, nameof(configuration));
42+
43+
var healthCheckConfiguration = configuration.GetSection("healthCheck").Get<HealthCheckConfiguration>() ?? new HealthCheckConfiguration();
44+
45+
if (healthCheckConfiguration.EnableScraperFreshnessHealthCheck)
46+
{
47+
healthChecksBuilder.AddCheck<ScraperResultFreshnessHealthCheck>("Promitor Scraper Data Freshness", HealthStatus.Unhealthy);
48+
}
49+
50+
return healthChecksBuilder;
51+
}
3252
}
3353
}

src/Promitor.Agents.Scraper/Extensions/IServiceCollectionExtensions.cs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
using Promitor.Agents.Scraper.Configuration.Sinks;
1818
using Promitor.Agents.Scraper.Discovery;
1919
using Promitor.Agents.Scraper.Discovery.Interfaces;
20+
using Promitor.Agents.Scraper.Health;
2021
using Promitor.Agents.Scraper.Scheduling;
2122
using Promitor.Agents.Scraper.Usability;
2223
using Promitor.Agents.Scraper.Validation.Steps;
@@ -31,6 +32,7 @@
3132
using Promitor.Core.Scraping.Configuration.Serialization.v1.Core;
3233
using Promitor.Core.Scraping.Configuration.Serialization.v1.Model;
3334
using Promitor.Core.Scraping.Factories;
35+
using Promitor.Agents.Scraper.Runtime;
3436
using Promitor.Integrations.Azure.Authentication.Configuration;
3537
using Promitor.Integrations.AzureMonitor.Configuration;
3638
using Promitor.Integrations.Sinks.Atlassian.Statuspage;
@@ -63,7 +65,7 @@ public static IServiceCollection AddResourceDiscoveryClient(this IServiceCollect
6365

6466
var resourceDiscoveryConfiguration = configuration.Get<ScraperRuntimeConfiguration>();
6567

66-
if(resourceDiscoveryConfiguration?.ResourceDiscovery?.IsConfigured == true)
68+
if (resourceDiscoveryConfiguration?.ResourceDiscovery?.IsConfigured == true)
6769
{
6870
services.AddHttpClient<ResourceDiscoveryClient>(client =>
6971
{
@@ -110,12 +112,14 @@ public static IServiceCollection AddAtlassianStatuspageClient(this IServiceColle
110112
public static IServiceCollection DefineDependencies(this IServiceCollection services)
111113
{
112114
Guard.NotNull(services, nameof(services));
113-
115+
114116
services.AddTransient<IMetricsDeclarationProvider, MetricsDeclarationProvider>();
115117
services.AddTransient<IAzureScrapingSystemMetricsPublisher, AzureScrapingSystemMetricsPublisher>();
116118
services.AddTransient<MetricScraperFactory>();
117119
services.AddTransient<ConfigurationSerializer>();
118120
services.AddSingleton<AzureMonitorClientFactory>();
121+
services.AddSingleton<ILastSuccessfulScrapeStore, LastSuccessfulScrapeStore>();
122+
services.AddSingleton<IScrapeScheduleProvider, ScrapeScheduleProvider>();
119123

120124
services.AddSingleton<IDeserializer<MetricsDeclarationV1>, V1Deserializer>();
121125
services.AddSingleton<IDeserializer<AzureMetadataV1>, AzureMetadataDeserializer>();
@@ -245,7 +249,7 @@ private static void AddOpenTelemetryCollectorMetricSink(string collectorUri, str
245249
}
246250

247251
private static void AddStatsdMetricSink(IServiceCollection services, StatsdSinkConfiguration statsdConfiguration, Table metricSinkAsciiTable)
248-
{
252+
{
249253
metricSinkAsciiTable.AddRow("StatsD", $"Url: {statsdConfiguration.Host}:{statsdConfiguration.Port}.");
250254
metricSinkAsciiTable.AddRow("", $"Format: {statsdConfiguration.MetricFormat}.");
251255

@@ -309,9 +313,9 @@ public static IServiceCollection AddScrapingMutex(this IServiceCollection servic
309313
}
310314

311315
var serverConfiguration = configuration.GetSection("server").Get<ServerConfiguration>();
312-
316+
313317
services.TryAdd(ServiceDescriptor.Singleton<IScrapingMutex, ScrapingMutex>(_ => ScrapingMutexBuilder(serverConfiguration)));
314-
318+
315319
return services;
316320
}
317321

@@ -333,6 +337,7 @@ public static IServiceCollection ConfigureYamlConfiguration(this IServiceCollect
333337
services.Configure<TelemetryConfiguration>(configuration.GetSection("telemetry"));
334338
services.Configure<ServerConfiguration>(configuration.GetSection("server"));
335339
services.Configure<AuthenticationConfiguration>(configuration.GetSection("authentication"));
340+
services.Configure<HealthCheckConfiguration>(configuration.GetSection("healthCheck"));
336341
services.Configure<PrometheusScrapingEndpointSinkConfiguration>(configuration.GetSection("metricSinks:prometheusScrapingEndpoint"));
337342
services.Configure<StatsdSinkConfiguration>(configuration.GetSection("metricSinks:statsd"));
338343
services.Configure<AtlassianStatusPageSinkConfiguration>(configuration.GetSection("metricSinks:atlassianStatuspage"));
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
using System;

namespace Promitor.Agents.Scraper.Health
{
    /// <summary>
    ///     Provides information about the configured scrape schedules
    /// </summary>
    public interface IScrapeScheduleProvider
    {
        /// <summary>
        ///     Gets the minimum scrape interval across all configured metrics
        /// </summary>
        TimeSpan GetMinimumScrapeInterval();
    }
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
using System;
using System.Linq;
using Cronos;
using GuardNet;
using Microsoft.Extensions.Logging;
using Promitor.Core.Scraping.Configuration.Providers.Interfaces;

namespace Promitor.Agents.Scraper.Health
{
    /// <summary>
    ///     Provides information about the configured scrape schedules
    /// </summary>
    public class ScrapeScheduleProvider : IScrapeScheduleProvider
    {
        private readonly ILogger<ScrapeScheduleProvider> _logger;
        private readonly IMetricsDeclarationProvider _metricsDeclarationProvider;

        // Cached on first successful calculation. Fallback paths deliberately do not cache,
        // so the interval is recalculated once metrics/schedules become available.
        private TimeSpan? _cachedMinimumInterval;

        public ScrapeScheduleProvider(IMetricsDeclarationProvider metricsDeclarationProvider, ILogger<ScrapeScheduleProvider> logger)
        {
            Guard.NotNull(metricsDeclarationProvider, nameof(metricsDeclarationProvider));
            Guard.NotNull(logger, nameof(logger));

            _metricsDeclarationProvider = metricsDeclarationProvider;
            _logger = logger;
        }

        /// <summary>
        ///     Gets the minimum scrape interval across all configured metrics
        /// </summary>
        public TimeSpan GetMinimumScrapeInterval()
        {
            if (_cachedMinimumInterval.HasValue)
            {
                return _cachedMinimumInterval.Value;
            }

            var metricsDeclaration = _metricsDeclarationProvider.Get(applyDefaults: true);
            var configuredMetrics = metricsDeclaration?.Metrics;
            if (configuredMetrics == null || configuredMetrics.Any() == false)
            {
                _logger.LogWarning("No metrics configured, using default interval of 5 minutes");
                return TimeSpan.FromMinutes(5);
            }

            var calculatedIntervals = configuredMetrics.Select(metric => metric.Scraping?.Schedule)
                                                       .Where(schedule => string.IsNullOrWhiteSpace(schedule) == false)
                                                       .Distinct()
                                                       .Select(CalculateIntervalFromCronExpression)
                                                       .Where(interval => interval != null)
                                                       .Select(interval => interval.Value)
                                                       .ToList();
            if (calculatedIntervals.Any() == false)
            {
                _logger.LogWarning("Unable to calculate intervals from cron expressions, using default interval of 5 minutes");
                return TimeSpan.FromMinutes(5);
            }

            _cachedMinimumInterval = calculatedIntervals.Min();
            _logger.LogInformation("Calculated minimum scrape interval: {MinimumInterval}", _cachedMinimumInterval.Value);

            return _cachedMinimumInterval.Value;
        }

        // NOTE(review): the interval is derived from the gap between the next two occurrences
        // relative to "now", so schedules with non-uniform gaps (e.g. business-hours crons)
        // yield whichever gap happens to come up next — confirm this approximation is acceptable.
        private TimeSpan? CalculateIntervalFromCronExpression(string cronExpression)
        {
            try
            {
                // Promitor scrape schedules use 6-field cron expressions (seconds included)
                var parsedExpression = CronExpression.Parse(cronExpression, CronFormat.IncludeSeconds);
                var referenceTime = DateTimeOffset.UtcNow;

                var firstOccurrence = parsedExpression.GetNextOccurrence(referenceTime, TimeZoneInfo.Utc);
                if (firstOccurrence == null)
                {
                    _logger.LogWarning("Unable to calculate next occurrence for cron expression: {CronExpression}", cronExpression);
                    return null;
                }

                var secondOccurrence = parsedExpression.GetNextOccurrence(firstOccurrence.Value, TimeZoneInfo.Utc);
                if (secondOccurrence == null)
                {
                    _logger.LogWarning("Unable to calculate second occurrence for cron expression: {CronExpression}", cronExpression);
                    return null;
                }

                var calculatedInterval = secondOccurrence.Value - firstOccurrence.Value;
                _logger.LogDebug("Calculated interval {Interval} for cron expression: {CronExpression}", calculatedInterval, cronExpression);

                return calculatedInterval;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to parse cron expression: {CronExpression}", cronExpression);
                return null;
            }
        }
    }
}

0 commit comments

Comments
 (0)