Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ concurrency:
env:
NODE_VERSION: '22'
PNPM_VERSION: '10.33.4'
GO_VERSION: '1.25.10'
GO_VERSION: '1.25.11'

# SR-007: least-privilege default. CI runs on every PR (including forks) and
# only needs to read the repo. Any job needing more must opt in per-job.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
if: matrix.language == 'go'
uses: actions/setup-go@v6
with:
go-version: '1.25.10'
go-version: '1.25.11'
cache-dependency-path: agent/go.sum

- name: Initialize CodeQL
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
env:
NODE_VERSION: '22'
PNPM_VERSION: '10.33.4'
GO_VERSION: '1.25.10'
GO_VERSION: '1.25.11'
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ concurrency:
env:
NODE_VERSION: '22'
PNPM_VERSION: '10.33.4'
GO_VERSION: '1.25.10'
GO_VERSION: '1.25.11'

permissions:
actions: read
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/**
* Grace-period prune of stale patch tombstones (#1004).
*
* The scan ingest marks every existing device_patches row `status='missing'` at
* the start of a scan, then re-upserts the rows the scan reported. Rows left at
* 'missing' are tombstones; before this fix they accumulated unbounded (a Linux
* package upgraded to a new externalId orphans the old row forever).
*
* pruneStaleTombstones bounds that growth by deleting 'missing' rows whose
* `updatedAt` (the last scan that actually reported the patch — the bulk
* mark-missing leaves updatedAt untouched) is older than a grace window. These
* tests exercise the real SQL against a real DB (a Drizzle mock would never run
* the make_interval filter), covering Todd's required cases: recent rows survive
* (empty/zero-item payloads, same-bucket partial-provider failures self-heal),
* non-'missing' rows are never touched, idempotency, and device/org isolation.
*
* Prerequisites:
* docker compose -f docker-compose.test.yml up -d
* Run:
* pnpm test:integration -- src/routes/agents/patchTombstonePrune.integration.test.ts
*/
import './setup';

import { randomUUID } from 'node:crypto';
import { describe, it, expect, beforeEach } from 'vitest';
import { eq } from 'drizzle-orm';
import { getTestDb } from './setup';
import { setupTestEnvironment } from './db-utils';
import { devices, patches, devicePatches } from '../../db/schema';
import type { Database } from '../../db';
import { pruneStaleTombstones } from '../../routes/agents/patches';

const WINDOW_HOURS = 24;
const OLD = new Date(Date.now() - 200 * 60 * 60 * 1000); // 200h ago (past the window)
const RECENT = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1h ago (inside the window)

let agentSeq = 0;
async function seedDevice(orgId: string, siteId: string, hostname: string): Promise<string> {
agentSeq++;
const [row] = await getTestDb()
.insert(devices)
.values({
orgId,
siteId,
agentId: `agent-tombstone-${agentSeq}-${Date.now()}`,
hostname,
displayName: hostname,
osType: 'linux',
osVersion: '22.04',
osBuild: 'jammy',
architecture: 'x86_64',
agentVersion: '0.0.0-test',
status: 'online',
enrolledAt: new Date(),
})
.returning({ id: devices.id });
if (!row) throw new Error('seedDevice: no row');
return row.id;
}

let patchSeq = 0;
async function seedDevicePatch(opts: {
orgId: string;
deviceId: string;
status: 'pending' | 'installed' | 'missing';
source?: 'linux' | 'third_party';
updatedAt: Date;
}): Promise<string> {
patchSeq++;
const source = opts.source ?? 'linux';
// patches is a global catalog (not truncated per-test) → externalId must be unique.
const [patch] = await getTestDb()
.insert(patches)
.values({ source, externalId: `${source}:${randomUUID()}`, title: `tp-${patchSeq}`, severity: 'unknown' })
.returning({ id: patches.id });
if (!patch) throw new Error('seedDevicePatch: no patch');
const [dp] = await getTestDb()
.insert(devicePatches)
.values({
deviceId: opts.deviceId,
orgId: opts.orgId,
patchId: patch.id,
status: opts.status,
lastCheckedAt: new Date(),
updatedAt: opts.updatedAt,
})
.returning({ id: devicePatches.id });
if (!dp) throw new Error('seedDevicePatch: no device_patch');
return dp.id;
}

async function exists(devicePatchId: string): Promise<boolean> {
const rows = await getTestDb()
.select({ id: devicePatches.id })
.from(devicePatches)
.where(eq(devicePatches.id, devicePatchId));
return rows.length > 0;
}

function prune(deviceId: string, orgId: string): Promise<void> {
return pruneStaleTombstones(getTestDb() as unknown as Database, deviceId, orgId, WINDOW_HOURS);
}

describe('pruneStaleTombstones (#1004 grace-period prune)', () => {
let orgId: string;
let siteId: string;

beforeEach(async () => {
const env = await setupTestEnvironment({ scope: 'organization' });
orgId = env.organization.id;
siteId = env.site.id;
});

it('deletes a stale "missing" tombstone older than the window', async () => {
const deviceId = await seedDevice(orgId, siteId, 'stale-host');
const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });

await prune(deviceId, orgId);

expect(await exists(stale)).toBe(false);
});

it('keeps a "missing" row still inside the window (empty/zero-item scan + same-bucket self-heal)', async () => {
const deviceId = await seedDevice(orgId, siteId, 'recent-host');
// A row that went missing this cycle but was reported recently — e.g. winget
// failed while chocolatey succeeded under the shared third_party bucket.
const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', source: 'third_party', updatedAt: RECENT });

await prune(deviceId, orgId);

expect(await exists(recent)).toBe(true);
});

it('never touches pending or installed rows, however old', async () => {
const deviceId = await seedDevice(orgId, siteId, 'live-host');
const pending = await seedDevicePatch({ orgId, deviceId, status: 'pending', updatedAt: OLD });
const installed = await seedDevicePatch({ orgId, deviceId, status: 'installed', updatedAt: OLD });

await prune(deviceId, orgId);

expect(await exists(pending)).toBe(true);
expect(await exists(installed)).toBe(true);
});

it('is idempotent — a second prune is a clean no-op', async () => {
const deviceId = await seedDevice(orgId, siteId, 'idempotent-host');
const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });
const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: RECENT });

await prune(deviceId, orgId);
await prune(deviceId, orgId); // must not throw, must not affect the survivor

expect(await exists(stale)).toBe(false);
expect(await exists(recent)).toBe(true);
});

it('only prunes the target device — a stale tombstone on another device is untouched', async () => {
const target = await seedDevice(orgId, siteId, 'target-host');
const other = await seedDevice(orgId, siteId, 'other-host');
const targetStale = await seedDevicePatch({ orgId, deviceId: target, status: 'missing', updatedAt: OLD });
const otherStale = await seedDevicePatch({ orgId, deviceId: other, status: 'missing', updatedAt: OLD });

await prune(target, orgId);

expect(await exists(targetStale)).toBe(false); // target pruned
expect(await exists(otherStale)).toBe(true); // other device untouched
});

it('does not prune when the orgId does not match the rows (cross-tenant guard)', async () => {
const deviceId = await seedDevice(orgId, siteId, 'org-guard-host');
const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });

// Same device id but a different org id → the org-scoped WHERE must spare it.
await prune(deviceId, randomUUID());

expect(await exists(stale)).toBe(true);
});
});
3 changes: 3 additions & 0 deletions apps/api/src/routes/agents/patches.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const tables = vi.hoisted(() => ({

vi.mock('drizzle-orm', () => ({
eq: (left: unknown, right: unknown) => ({ op: 'eq', left, right }),
and: (...conds: unknown[]) => ({ op: 'and', conds }),
sql: (strings: TemplateStringsArray, ...values: unknown[]) => ({
op: 'sql',
strings: Array.from(strings),
Expand All @@ -59,6 +60,8 @@ vi.mock('../../db', () => ({
db: {
select: vi.fn(),
transaction: vi.fn(),
// tombstone prune (#1004) runs after the scan txn via db.delete(...).where(...)
delete: vi.fn(() => ({ where: vi.fn().mockResolvedValue(undefined) })),
},
runOutsideDbContext: vi.fn((fn: () => unknown) => fn()),
withDbAccessContext: vi.fn(async (_ctx: unknown, fn: () => Promise<unknown>) => fn()),
Expand Down
41 changes: 39 additions & 2 deletions apps/api/src/routes/agents/patches.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Hono } from 'hono';
import { zValidator } from '@hono/zod-validator';
import { eq, sql } from 'drizzle-orm';
import { db } from '../../db';
import { and, eq, sql } from 'drizzle-orm';
import { db, type Database } from '../../db';
import { devices, patches, devicePatches } from '../../db/schema';
import { enqueueWingetReleaseTest } from '../../jobs/wingetReleaseTestWorker';
import { writeAuditEvent } from '../../services/auditEvents';
Expand All @@ -17,6 +17,38 @@ function deriveVendor(packageId: string | null | undefined, fallback: string | n
return fallback ?? null;
}

/**
* Bound tombstone growth (#1004): delete device_patches rows that have stayed
* 'missing' (absent from every scan) longer than the grace window.
*
* `updatedAt` is bumped only when a scan actually reports the patch — the bulk
* mark-missing in the scan ingest sets `status='missing'` + `lastCheckedAt` but
* leaves `updatedAt` untouched — so `updatedAt` dates the patch's last real
* sighting. A transient partial-provider failure (e.g. winget fails while
* chocolatey succeeds under the shared 'third_party' source bucket) self-heals:
* the row is re-upserted on the next clean scan inside the window, so only
* genuinely-removed packages age out. The window is generous (default 7 days,
* `PATCH_TOMBSTONE_PRUNE_AFTER_HOURS`) so a missed scan never prunes prematurely.
* Scoped to a single device + org (cross-tenant safe).
*/
export async function pruneStaleTombstones(
executor: Database,
deviceId: string,
orgId: string,
pruneAfterHours = Number(process.env.PATCH_TOMBSTONE_PRUNE_AFTER_HOURS) || 168,
): Promise<void> {
await executor
.delete(devicePatches)
.where(
and(
eq(devicePatches.deviceId, deviceId),
eq(devicePatches.orgId, orgId),
eq(devicePatches.status, 'missing'),
sql`${devicePatches.updatedAt} < now() - make_interval(hours => ${pruneAfterHours})`,
),
);
}

export const patchesRoutes = new Hono();

patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async (c) => {
Expand Down Expand Up @@ -204,6 +236,11 @@ patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async
}
});

// Prune stale tombstones after the scan commits. Outside the txn on purpose:
// it reads the just-committed state, and a crash before it runs is harmless
// (the next scan prunes). Runs in the same request DB context as the ingest.
await pruneStaleTombstones(db, device.id, device.orgId);

writeAuditEvent(c, {
orgId: agent?.orgId ?? device.orgId,
actorType: 'agent',
Expand Down
Loading