
Commit bbcfd74

fix: correctly log job timeout (#2629)
1 parent fa5561e commit bbcfd74

File tree

3 files changed: +13 -8 lines changed


changelog/content/experimental/unreleased.md

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ version:
 #### Scraper
 
 None.
+
 #### Resource Discovery
 
 None.

src/Promitor.Agents.Core/Configuration/Defaults.cs

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ public static class Concurrency
         /// On rare occasions, scrape jobs can hang and subsequent jobs cannot run, as they cannot acquire a mutex.
         /// MutexTimeoutSeconds setting ensures scraping mutex is released.
         /// </summary>
-        public static int MutexTimeoutSeconds { get; } = 30;
+        public static int MutexTimeoutSeconds { get; } = 90;
     }
 
     public class Telemetry
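
For context, a minimal sketch of how a default like this usually reaches the scraping job. The diff for ResourcesScrapingJob.cs below reads the value through `_concurrencyConfiguration.Value.MutexTimeoutSeconds`, which suggests an options-pattern wrapper; the `ConcurrencyConfiguration` and `MutexTimeoutResolver` types shown here are illustrative assumptions, not Promitor's actual classes.

```csharp
using System;
using Microsoft.Extensions.Options;

// Sketch of the nested-defaults pattern from Defaults.cs; only the member touched
// by this commit is reproduced, with the new 90-second value.
public static class Defaults
{
    public static class Concurrency
    {
        public static int MutexTimeoutSeconds { get; } = 90;
    }
}

// Illustrative options class (assumption): runtime configuration falls back to the
// library default when no explicit value is supplied.
public class ConcurrencyConfiguration
{
    public int MutexTimeoutSeconds { get; set; } = Defaults.Concurrency.MutexTimeoutSeconds;
}

// Illustrative consumer mirroring how the scraping job reads the setting.
public class MutexTimeoutResolver
{
    private readonly IOptions<ConcurrencyConfiguration> _concurrencyConfiguration;

    public MutexTimeoutResolver(IOptions<ConcurrencyConfiguration> concurrencyConfiguration)
    {
        _concurrencyConfiguration = concurrencyConfiguration;
    }

    public TimeSpan MutexTimeout =>
        TimeSpan.FromSeconds(_concurrencyConfiguration.Value.MutexTimeoutSeconds);
}
```

Raising the default from 30 to 90 seconds gives long-running scrape cycles more headroom before the mutex-timeout cancellation kicks in.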

src/Promitor.Agents.Scraper/Scheduling/ResourcesScrapingJob.cs

Lines changed: 11 additions & 7 deletions

@@ -142,14 +142,11 @@ public async Task ExecuteAsync(CancellationToken cancellationToken)
         {
             var mutexReleasedAfterSeconds = _concurrencyConfiguration.Value.MutexTimeoutSeconds;
             var timeoutCancellationTokenSource = new CancellationTokenSource();
+            Logger.LogInformation("Timeout after {Timeout}", mutexReleasedAfterSeconds);
+
             timeoutCancellationTokenSource.CancelAfter(mutexReleasedAfterSeconds * 1000);
-            timeoutCancellationTokenSource.Token.Register(() => {
-                Logger.LogError("Scrape job {JobName} was cancelled due to timeout. However, dangling async tasks " +
-                                "may be running for an unbounded amount of time. In the rare case where " +
-                                "many such timeouts occur, consider restarting the Scraper Agent.", Name);
-            });
 
-            Logger.LogWarning("Init timeout token");
+            Logger.LogInformation("Init timeout token");
             // to enforce timeout in addition to cancellationToken passed down by .NET
             var composedCancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCancellationTokenSource.Token);
             var scrapeDefinitions = await GetAllScrapeDefinitions(composedCancellationTokenSource.Token);
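
The hunk above now logs the configured timeout up front and drops the `Token.Register` error callback, which could fire once the timer elapsed even when no scrape work was actually interrupted. For reference, a standalone sketch of the linked-token wiring the job relies on; names outside the diff (`RunWithTimeoutAsync`, `hostCancellationToken`) are illustrative, not Promitor's.

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;

// Standalone sketch: run a scrape delegate under both the host's cancellation token
// and a mutex-timeout token, as ExecuteAsync does above.
public static class TimeoutWiringSketch
{
    public static async Task RunWithTimeoutAsync(Func<CancellationToken, Task> scrape,
                                                 int mutexTimeoutSeconds,
                                                 CancellationToken hostCancellationToken)
    {
        // Timeout-only token: requests cancellation once the configured time has passed.
        using var timeoutCancellationTokenSource = new CancellationTokenSource();
        timeoutCancellationTokenSource.CancelAfter(TimeSpan.FromSeconds(mutexTimeoutSeconds));

        // Either host shutdown or the timeout cancels the composed token.
        using var composedCancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(
            hostCancellationToken, timeoutCancellationTokenSource.Token);

        await scrape(composedCancellationTokenSource.Token);
    }
}
```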

@@ -377,7 +374,14 @@ private async Task WorkWrapper(Func<Task> work, CancellationToken cancellationTo
                cancellationToken.Register(() => {
                    tcs.TrySetResult(null);
                });
-                await Task.WhenAny(work(), tcs.Task);
+                var completedTask = await Task.WhenAny(work(), tcs.Task);
+                if (completedTask == tcs.Task)
+                {
+                    Logger.LogError("Scrape job {JobName} was cancelled due to timeout. However, dangling async tasks " +
+                                    "may be running for an unbounded amount of time. In the rare case where " +
+                                    "many such timeouts occur, consider restarting the Scraper Agent or tuning the mutex timeout configuration.", Name);
+                    throw new OperationCanceledException(cancellationToken);
+                }
            }
            finally
            {
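
This second hunk is where the logging becomes correct: instead of an always-armed token callback, `WorkWrapper` now inspects which task won the `Task.WhenAny` race and only reports the timeout (and throws `OperationCanceledException`) when cancellation actually pre-empted the work. A minimal self-contained sketch of that pattern, assuming a generic work delegate and an injected `ILogger` rather than Promitor's base-class `Logger`:

```csharp
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;

public static class CancellationAwareWorker
{
    // Mirrors the WorkWrapper change: race the work against a sentinel task that
    // completes when cancellation is requested, and only log/throw when the
    // sentinel wins. The abandoned work task may keep running in the background,
    // which is what the warning about dangling async tasks refers to.
    public static async Task RunAsync(Func<Task> work, string jobName,
                                      ILogger logger, CancellationToken cancellationToken)
    {
        var tcs = new TaskCompletionSource<object?>();

        // Complete the sentinel when cancellation is requested.
        using var registration = cancellationToken.Register(() => tcs.TrySetResult(null));

        var completedTask = await Task.WhenAny(work(), tcs.Task);
        if (completedTask == tcs.Task)
        {
            // Reached only when cancellation won the race, so this log is accurate.
            logger.LogError("Scrape job {JobName} was cancelled due to timeout; dangling async tasks may still be running.", jobName);
            throw new OperationCanceledException(cancellationToken);
        }

        // Re-await the finished work task so its exceptions, if any, propagate.
        await completedTask;
    }
}
```

A scrape task that completes before the token fires never reaches the error branch, so the "cancelled due to timeout" message is emitted only for genuine timeouts, which is what the commit title refers to.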
