Skip to content

Commit fb84fb2

Browse files
authored
feat: resolve manual incidents automatically (keephq#3927)
1 parent 73216cd commit fb84fb2

File tree

3 files changed

+74
-45
lines changed

3 files changed

+74
-45
lines changed

keep/api/bl/incidents_bl.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,22 @@
1919
get_all_alerts_by_fingerprints,
2020
get_incident_by_id,
2121
get_incident_unique_fingerprint_count,
22+
is_all_alerts_resolved,
23+
is_first_incident_alert_resolved,
24+
is_last_incident_alert_resolved,
2225
remove_alerts_to_incident_by_incident_id,
2326
update_incident_from_dto_by_id,
2427
update_incident_severity,
2528
)
2629
from keep.api.core.elastic import ElasticClient
27-
from keep.api.models.alert import IncidentDto, IncidentDtoIn, IncidentSeverity
30+
from keep.api.models.alert import (
31+
IncidentDto,
32+
IncidentDtoIn,
33+
IncidentSeverity,
34+
IncidentStatus,
35+
)
2836
from keep.api.models.db.alert import ActionType, Incident
37+
from keep.api.models.db.rule import ResolveOn
2938
from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts
3039
from keep.workflowmanager.workflowmanager import WorkflowManager
3140

@@ -359,3 +368,30 @@ def __postprocess_incident_change(self, incident):
359368
extra={"incident_id": incident.id},
360369
)
361370
return new_incident_dto
371+
372+
@staticmethod
373+
def resolve_incident_if_require(incident: Incident, session: Session) -> Incident:
374+
375+
should_resolve = False
376+
377+
if incident.resolve_on == ResolveOn.ALL.value and is_all_alerts_resolved(
378+
incident=incident, session=session
379+
):
380+
should_resolve = True
381+
382+
elif (
383+
incident.resolve_on == ResolveOn.FIRST.value
384+
and is_first_incident_alert_resolved(incident, session=session)
385+
):
386+
should_resolve = True
387+
388+
elif (
389+
incident.resolve_on == ResolveOn.LAST.value
390+
and is_last_incident_alert_resolved(incident, session=session)
391+
):
392+
should_resolve = True
393+
394+
if should_resolve:
395+
incident.status = IncidentStatus.RESOLVED.value
396+
397+
return incident

keep/api/tasks/process_event_task.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
# internals
1818
from keep.api.alert_deduplicator.alert_deduplicator import AlertDeduplicator
1919
from keep.api.bl.enrichments_bl import EnrichmentsBl
20+
from keep.api.bl.incidents_bl import IncidentBl
2021
from keep.api.bl.maintenance_windows_bl import MaintenanceWindowsBl
2122
from keep.api.core.db import (
2223
bulk_upsert_alert_fields,
24+
enrich_alerts_with_incidents,
2325
get_alerts_by_fingerprint,
2426
get_all_presets_dtos,
2527
get_enrichment_with_session,
@@ -138,6 +140,7 @@ def __save_to_db(
138140
session.add(audit)
139141

140142
enriched_formatted_events = []
143+
saved_alerts = []
141144

142145
for formatted_event in formatted_events:
143146
formatted_event.pushed = True
@@ -211,6 +214,7 @@ def __save_to_db(
211214
alert = Alert(**alert_args)
212215
session.add(alert)
213216
session.flush()
217+
saved_alerts.append(alert)
214218
alert_id = alert.id
215219
formatted_event.event_id = str(alert_id)
216220

@@ -251,6 +255,29 @@ def __save_to_db(
251255
value = value.strip()
252256
setattr(formatted_event, enrichment, value)
253257
enriched_formatted_events.append(formatted_event)
258+
259+
logger.info("Checking for incidents to resolve", extra={"tenant_id": tenant_id})
260+
try:
261+
saved_alerts = enrich_alerts_with_incidents(
262+
tenant_id, saved_alerts, session
263+
) # note: this only enriches incidents that were not yet ended
264+
for alert in saved_alerts:
265+
if alert.event.get("status") == AlertStatus.RESOLVED.value:
266+
logger.debug(
267+
"Checking for alert with status resolved",
268+
extra={"alert_id": alert.id, "tenant_id": tenant_id},
269+
)
270+
for incident in alert._incidents:
271+
IncidentBl.resolve_incident_if_require(incident, session)
272+
logger.info(
273+
"Completed checking for incidents to resolve",
274+
extra={"tenant_id": tenant_id},
275+
)
276+
except Exception:
277+
logger.exception(
278+
"Failed to check for incidents to resolve",
279+
extra={"tenant_id": tenant_id},
280+
)
254281
session.commit()
255282

256283
logger.info(

keep/rulesengine/rulesengine.py

Lines changed: 10 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,11 @@
1919
get_incident_for_grouping_rule,
2020
)
2121
from keep.api.core.db import get_rules as get_rules_db
22-
from keep.api.core.db import (
23-
is_all_alerts_in_status,
24-
is_all_alerts_resolved,
25-
is_first_incident_alert_resolved,
26-
is_last_incident_alert_resolved,
27-
)
22+
from keep.api.core.db import is_all_alerts_in_status
2823
from keep.api.core.dependencies import get_pusher_client
29-
from keep.api.models.alert import (
30-
AlertDto,
31-
AlertSeverity,
32-
AlertStatus,
33-
IncidentDto,
34-
IncidentStatus,
35-
)
24+
from keep.api.models.alert import AlertDto, AlertSeverity, AlertStatus, IncidentDto
3625
from keep.api.models.db.alert import Incident
37-
from keep.api.models.db.rule import ResolveOn, Rule
26+
from keep.api.models.db.rule import Rule
3827
from keep.api.utils.cel_utils import preprocess_cel_expression
3928
from keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts
4029

@@ -155,13 +144,17 @@ def _run_cel_rules(
155144
)
156145
incident.is_confirmed = True
157146
elif rule.create_on == "all":
158-
incident = self._process_event_for_history_based_rule(
159-
incident, rule, session
147+
incident = (
148+
self._process_event_for_history_based_rule(
149+
incident, rule, session
150+
)
160151
)
161152

162153
send_created_event = incident.is_confirmed
163154

164-
incident = self._resolve_incident_if_require(incident, session)
155+
incident = IncidentBl.resolve_incident_if_require(
156+
incident, session
157+
)
165158
session.add(incident)
166159
session.commit()
167160

@@ -336,33 +329,6 @@ def _process_event_for_history_based_rule(
336329

337330
return incident
338331

339-
@staticmethod
340-
def _resolve_incident_if_require(incident: Incident, session: Session) -> Incident:
341-
342-
should_resolve = False
343-
344-
if incident.resolve_on == ResolveOn.ALL.value and is_all_alerts_resolved(
345-
incident=incident, session=session
346-
):
347-
should_resolve = True
348-
349-
elif (
350-
incident.resolve_on == ResolveOn.FIRST.value
351-
and is_first_incident_alert_resolved(incident, session=session)
352-
):
353-
should_resolve = True
354-
355-
elif (
356-
incident.resolve_on == ResolveOn.LAST.value
357-
and is_last_incident_alert_resolved(incident, session=session)
358-
):
359-
should_resolve = True
360-
361-
if should_resolve:
362-
incident.status = IncidentStatus.RESOLVED.value
363-
364-
return incident
365-
366332
@staticmethod
367333
def _extract_subrules(expression):
368334
# CEL rules look like '(source == "sentry") || (source == "grafana" && severity == "critical")'

0 commit comments

Comments
 (0)