Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions client/src/Hooks/useMonitorForm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const getBaseDefaults = (data?: Monitor | null) => ({
geoCheckEnabled: data?.geoCheckEnabled ?? false,
geoCheckLocations: data?.geoCheckLocations || [],
geoCheckInterval: data?.geoCheckInterval || 300000,
escalationRules: data?.escalationRules || [],
});

export const useMonitorForm = ({
Expand Down
87 changes: 85 additions & 2 deletions client/src/Pages/CreateMonitor/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { useMemo, useState } from "react";
import { useEffect } from "react";
import { logger } from "@/Utils/logger";
import { useParams, useLocation, useNavigate } from "react-router";
import { useForm, Controller } from "react-hook-form";
import { useForm, Controller, useFieldArray } from "react-hook-form";
import { zodResolver } from "@hookform/resolvers/zod";
import { useTheme } from "@mui/material";
import Stack from "@mui/material/Stack";
Expand All @@ -14,7 +14,7 @@ import Typography from "@mui/material/Typography";
import Link from "@mui/material/Link";
import Divider from "@mui/material/Divider";
import IconButton from "@mui/material/IconButton";
import { Trash2 } from "lucide-react";
import { Trash2, Plus } from "lucide-react";
import { HeaderDeleteControls } from "@/Components/monitors";
import { GeoContinents } from "@/Types/GeoCheck";

Expand Down Expand Up @@ -203,6 +203,8 @@ const CreateMonitorPage = () => {
defaultValues: defaults,
});
const { control, watch, handleSubmit, clearErrors } = form;
const { fields: escalationFields, append: appendEscalation, remove: removeEscalation } =
useFieldArray({ control, name: "escalationRules" });

useEffect(() => {
form.reset(defaults);
Expand Down Expand Up @@ -765,6 +767,87 @@ const CreateMonitorPage = () => {
}
/>

<ConfigBox
title={t("pages.createMonitor.form.escalation.title")}
subtitle={t("pages.createMonitor.form.escalation.description")}
rightContent={
<Stack spacing={theme.spacing(LAYOUT.MD)}>
{escalationFields.map((field, index) => {
// Give each notification a `name` (copied from `notificationName`) so it can serve as the option label below.
// NOTE(review): this mapping does not depend on `field` or `index`, yet it is rebuilt for every rule row.
const notificationOptions = (notifications ?? []).map((n) => ({
...n,
name: n.notificationName,
}));
return (
<Stack
key={field.id}
direction="row"
alignItems="flex-start"
spacing={theme.spacing(SPACING.MD)}
>
<Controller
name={`escalationRules.${index}.delayMinutes`}
control={control}
render={({ field: f, fieldState }) => (
<TextField
type="number"
value={f.value ?? ""}
onChange={(e) => f.onChange(Number(e.target.value))}
fieldLabel={t("pages.createMonitor.form.escalation.delayLabel")}
error={!!fieldState.error}
helperText={fieldState.error?.message ?? ""}
sx={{ width: 140 }}
/>
)}
/>
<Controller
name={`escalationRules.${index}.notificationId`}
control={control}
render={({ field: f }) => {
// Resolve the stored notification id back to its option object; null when no option matches.
// NOTE(review): only `field` is destructured here, so this render passes no validation error state to the Autocomplete.
const selected =
notificationOptions.find((n) => n.id === f.value) ?? null;
return (
<Autocomplete
options={notificationOptions}
value={selected}
getOptionLabel={(o) => o.name}
onChange={(_: unknown, v: (typeof notificationOptions)[0] | null) =>
f.onChange(v?.id ?? "")
}
isOptionEqualToValue={(o, v) => o.id === v.id}
fieldLabel={t(
"pages.createMonitor.form.escalation.channelLabel"
)}
sx={{ minWidth: 200 }}
/>
);
}}
/>
<IconButton
size="small"
onClick={() => removeEscalation(index)}
aria-label="Remove escalation rule"
sx={{ mt: 3 }}
>
<Trash2 size={16} />
</IconButton>
</Stack>
);
})}
<Button
variant="outlined"
onClick={() => appendEscalation({ delayMinutes: 30, notificationId: "" })}
sx={{ alignSelf: "flex-start" }}
>
<Plus
size={16}
style={{ marginRight: 4 }}
/>
{t("pages.createMonitor.form.escalation.addRule")}
</Button>
</Stack>
}
/>

{(watchedType === "http" ||
watchedType === "grpc" ||
watchedType === "websocket") && (
Expand Down
6 changes: 6 additions & 0 deletions client/src/Types/Monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ export type MonitorStatus = (typeof MonitorStatuses)[number];

export type MonitorMatchMethod = "equal" | "include" | "regex" | "";

/**
 * One escalation step configured on a monitor: which notification channel to
 * alert, and how long to wait before alerting it.
 */
export interface EscalationRule {
	/** Delay in minutes before this rule becomes due. */
	delayMinutes: number;
	/** ID of the notification channel this rule alerts. */
	notificationId: string;
}

export interface Monitor {
id: string;
userId: string;
Expand All @@ -60,6 +65,7 @@ export interface Monitor {
interval: number;
uptimePercentage?: number;
notifications: string[];
escalationRules?: EscalationRule[];
secret?: string;
cpuAlertThreshold: number;
cpuAlertCounter: number;
Expand Down
6 changes: 6 additions & 0 deletions client/src/Validation/monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ import { GeoContinents } from "@/Types/GeoCheck";
// URL schema with custom error message
const urlSchema = z.url({ message: "Please enter a valid URL" });

// Validates a single escalation rule: the channel to alert and the delay before alerting it.
// Every check carries an explicit message so a failed rule reports readable text
// instead of the validator's default wording.
const escalationRuleSchema = z.object({
	// An empty string means no channel has been chosen yet.
	notificationId: z.string().min(1, "Please select a notification channel"),
	delayMinutes: z
		.number()
		.int("Delay must be a whole number of minutes")
		.min(1, "Delay must be at least 1 minute"),
});

// Common base schema for all monitor types
const baseSchema = z.object({
name: z
Expand All @@ -27,6 +32,7 @@ const baseSchema = z.object({
.number()
.min(300000, "Interval must be at least 5 minutes")
.optional(),
escalationRules: z.array(escalationRuleSchema).optional(),
});

// HTTP monitor schema
Expand Down
7 changes: 7 additions & 0 deletions client/src/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,13 @@
"description": "Select the notification channels you want to use",
"title": "Notifications"
},
"escalation": {
"title": "Escalation rules",
"description": "Send an additional alert if an incident is still active after a set delay.",
"addRule": "Add escalation rule",
"delayLabel": "Delay (minutes)",
"channelLabel": "Notify channel"
},
"type": {
"description": "Select the type of check to perform",
"optionDockerDescription": "Use Docker to monitor if a container is running.",
Expand Down
4 changes: 4 additions & 0 deletions server/src/db/models/Incident.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ const IncidentSchema = new Schema<IncidentDocument>(
type: String,
default: null,
},
escalationsFired: {
type: [String],
default: [],
},
},
{ timestamps: true }
);
Expand Down
14 changes: 13 additions & 1 deletion server/src/db/models/Monitor.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { Schema, model, Types } from "mongoose";
import type { Monitor, MonitorMatchMethod, CheckSnapshot } from "@/types/monitor.js";
import type { Monitor, MonitorMatchMethod, CheckSnapshot, EscalationRule } from "@/types/monitor.js";
import { MonitorTypes, MonitorStatuses } from "@/types/monitor.js";
import type {
CheckAudits,
Expand Down Expand Up @@ -173,6 +173,14 @@ const snapshotAuditsSchema = new Schema<CheckAudits>(
{ _id: false }
);

// Sub-schema for one escalation rule embedded in a monitor document.
// `_id: false` stops Mongoose from adding an `_id` to each embedded rule.
const escalationRuleSchema = new Schema<EscalationRule>(
	{
		notificationId: {
			type: String,
			required: true,
		},
		delayMinutes: {
			type: Number,
			required: true,
			min: 1,
		},
	},
	{ _id: false }
);

const checkSnapshotSchema = new Schema<CheckSnapshotDocument>(
{
id: { type: String, required: true },
Expand Down Expand Up @@ -351,6 +359,10 @@ const MonitorSchema = new Schema<MonitorDocument>(
type: Number,
default: 300000,
},
escalationRules: {
type: [escalationRuleSchema],
default: [],
},
recentChecks: {
type: [checkSnapshotSchema],
default: [],
Expand Down
1 change: 1 addition & 0 deletions server/src/repositories/incidents/IIncidentsRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export interface IIncidentsRepository {

// update
updateById(incidentId: string, teamId: string, updateData: Partial<Incident>): Promise<Incident>;
addEscalationFired(incidentId: string, teamId: string, notificationId: string): Promise<void>;
// delete
deleteByMonitorId(monitorId: string, teamId: string): Promise<number>;
deleteByMonitorIdsNotIn(monitorIds: string[]): Promise<number>;
Expand Down
8 changes: 8 additions & 0 deletions server/src/repositories/incidents/MongoIncidentRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class MongoIncidentRepository implements IIncidentsRepository {
resolvedBy: doc.resolvedBy ? this.toStringId(doc.resolvedBy) : null,
resolvedByEmail: doc.resolvedByEmail ?? null,
comment: doc.comment ?? null,
escalationsFired: doc.escalationsFired ?? [],
createdAt: this.toDateString(doc.createdAt),
updatedAt: this.toDateString(doc.updatedAt),
};
Expand Down Expand Up @@ -274,6 +275,13 @@ class MongoIncidentRepository implements IIncidentsRepository {
};
};

/**
 * Records `notificationId` in the `escalationsFired` array of the incident
 * that matches both `incidentId` and `teamId`.
 *
 * The update uses `$addToSet`, so a value that is already present is not
 * added a second time. The update result is not inspected.
 */
addEscalationFired = async (incidentId: string, teamId: string, notificationId: string): Promise<void> => {
	const incidentFilter = {
		_id: new mongoose.Types.ObjectId(incidentId),
		teamId: new mongoose.Types.ObjectId(teamId),
	};
	const markFired = { $addToSet: { escalationsFired: notificationId } };

	await IncidentModel.updateOne(incidentFilter, markFired);
};

deleteByMonitorId = async (monitorId: string, teamId: string) => {
const result = await IncidentModel.deleteMany({
monitorId: new mongoose.Types.ObjectId(monitorId),
Expand Down
8 changes: 8 additions & 0 deletions server/src/repositories/monitors/MongoMonitorsRepository.ts
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,10 @@ class MongoMonitorsRepository implements IMonitorsRepository {
interval: doc.interval,
uptimePercentage: doc.uptimePercentage ?? undefined,
notifications: notificationIds,
escalationRules: (doc.escalationRules ?? []).map((rule) => ({
notificationId: rule.notificationId,
delayMinutes: rule.delayMinutes,
})),
secret: doc.secret ?? undefined,
cpuAlertThreshold: doc.cpuAlertThreshold,
cpuAlertCounter: doc.cpuAlertCounter,
Expand Down Expand Up @@ -433,6 +437,10 @@ class MongoMonitorsRepository implements IMonitorsRepository {
interval: doc.interval,
uptimePercentage: doc.uptimePercentage ?? undefined,
notifications: notificationIds,
escalationRules: (doc.escalationRules ?? []).map((rule) => ({
notificationId: rule.notificationId,
delayMinutes: rule.delayMinutes,
})),
secret: doc.secret ?? undefined,
cpuAlertThreshold: doc.cpuAlertThreshold,
cpuAlertCounter: doc.cpuAlertCounter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export interface MonitorActionDecision {
shouldResolveIncident: boolean;
shouldSendNotification: boolean;
incidentReason: "status_down" | "threshold_breach" | null;
notificationReason: "status_change" | "threshold_breach" | null;
notificationReason: "status_change" | "threshold_breach" | "escalation" | null;
thresholdBreaches?: {
cpu?: boolean;
memory?: boolean;
Expand Down Expand Up @@ -177,6 +177,15 @@ export class SuperSimpleQueueHelper implements ISuperSimpleQueueHelper {
stack: error instanceof Error ? error.stack : undefined,
});
});

// Step 8. Check escalations (best effort, don't wait)
this.checkEscalations(statusChangeResult.monitor).catch((error: unknown) => {
this.logger.warn({
message: `Error checking escalations for monitor ${statusChangeResult.monitor.id}: ${error instanceof Error ? error.message : "Unknown error"}`,
service: SERVICE_NAME,
method: "getMonitorJob",
});
});
} catch (error: unknown) {
this.logger.warn({
message: error instanceof Error ? error.message : "Unknown error",
Expand Down Expand Up @@ -418,6 +427,40 @@ export class SuperSimpleQueueHelper implements ISuperSimpleQueueHelper {
};
};

/**
 * Sends every escalation notification that has come due for the monitor's active incident.
 *
 * A rule is due once the incident has been active for at least `delayMinutes`.
 * Each fired rule is recorded on the incident under a `notificationId:delayMinutes` key,
 * so several rules may target the same channel at different delays and each fires once.
 * Failures are logged per rule and never thrown; a rule that was not recorded is
 * evaluated again on the next call.
 *
 * @param monitor Monitor whose `escalationRules` are evaluated.
 */
private checkEscalations = async (monitor: Monitor): Promise<void> => {
	const rules = monitor.escalationRules ?? [];
	if (rules.length === 0) return;

	const incident = await this.incidentsRepository.findActiveByMonitorId(monitor.id, monitor.teamId);
	if (!incident) return;

	const incidentStartMs = new Date(incident.startTime).getTime();
	const nowMs = Date.now();
	const fired = new Set<string>(incident.escalationsFired ?? []);

	for (const rule of rules) {
		// Key by channel AND delay: keying by channel alone would suppress a later rule
		// that targets a channel an earlier rule has already alerted.
		const ruleKey = `${rule.notificationId}:${rule.delayMinutes}`;

		// A bare notificationId entry also counts as fired, so incidents recorded
		// with channel-only entries do not alert that channel again.
		if (fired.has(ruleKey) || fired.has(rule.notificationId)) continue;

		// Written as a positive `>=` test so an unparseable start time (NaN) is treated as "not due".
		const isDue = nowMs >= incidentStartMs + rule.delayMinutes * 60 * 1000;
		if (!isDue) continue;

		try {
			await this.notificationsService.sendEscalationNotification(monitor, rule.notificationId);
			await this.incidentsRepository.addEscalationFired(incident.id, monitor.teamId, ruleKey);
			// Remember locally as well, so an exact duplicate rule does not send twice in this pass.
			fired.add(ruleKey);
			this.logger.info({
				message: `Escalation fired for monitor ${monitor.id}, notificationId ${rule.notificationId}`,
				service: SERVICE_NAME,
				method: "checkEscalations",
			});
		} catch (error: unknown) {
			this.logger.warn({
				message: `Escalation failed for notificationId ${rule.notificationId}: ${error instanceof Error ? error.message : "Unknown error"}`,
				service: SERVICE_NAME,
				method: "checkEscalations",
			});
		}
	}
};

private evaluateMonitorAction(statusChangeResult: StatusChangeResult): MonitorActionDecision {
const { monitor, statusChanged, prevStatus } = statusChangeResult;

Expand Down
26 changes: 26 additions & 0 deletions server/src/service/infrastructure/notificationMessageBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ export interface INotificationMessageBuilder {
decision: MonitorActionDecision,
clientHost: string
): NotificationMessage;
buildEscalationMessage(monitor: Monitor, clientHost: string): NotificationMessage;
extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse): ThresholdBreach[];
}

Expand Down Expand Up @@ -182,6 +183,31 @@ export class NotificationMessageBuilder implements INotificationMessageBuilder {
};
}

/**
 * Builds the notification payload for an escalation alert on `monitor`.
 *
 * The message is always typed `monitor_down` with `critical` severity, is
 * timestamped at the moment of the call, and carries
 * `notificationReason: "escalation"` in its metadata.
 *
 * @param monitor Monitor the escalation concerns; its id, name, url, type, status and teamId are copied into the message.
 * @param clientHost Passed through unchanged as the message's `clientHost`.
 * @returns The assembled notification message.
 */
public buildEscalationMessage(monitor: Monitor, clientHost: string): NotificationMessage {
	const { id, name, url, type, status, teamId } = monitor;

	const message: NotificationMessage = {
		type: "monitor_down",
		severity: "critical",
		monitor: { id, name, url, type, status },
		content: {
			title: `Escalation: ${name} is still down`,
			summary: `Monitor "${name}" remains down and has not recovered. An escalation alert has been triggered.`,
			details: [`URL: ${url}`, `Status: ${status}`, `Type: ${type}`],
			timestamp: new Date(),
		},
		clientHost,
		metadata: {
			teamId,
			notificationReason: "escalation",
		},
	};

	return message;
}

public extractThresholdBreaches(monitor: Monitor, monitorStatusResponse: MonitorStatusResponse<HardwareStatusPayload>): ThresholdBreach[] {
const breaches: ThresholdBreach[] = [];

Expand Down
Loading