From e9df76a60a62e2c1b31c073283f2f506402a03dd Mon Sep 17 00:00:00 2001 From: Billy Dunn Date: Tue, 2 Jun 2026 19:17:01 -0500 Subject: [PATCH 1/3] fix(ci): bump Go toolchain 1.25.10 -> 1.25.11 (patch 2 stdlib govulncheck findings) govulncheck (Security Scanning workflow) is failing on main and every open PR with 2 Go standard-library vulnerabilities, both fixed in go1.25.11: - GO-2026-5039 net/textproto (unescaped inputs in errors) - GO-2026-5037 crypto/x509 (inefficient candidate hostname parsing) These are stdlib-only (no third-party module bumps needed); a freshly-published advisory started failing the scan on unchanged code because CI pinned 1.25.10. Bump the GO_VERSION / go-version pins in ci.yml, codeql.yml and release.yml so the scan (and the released agent binary's stdlib) use the patched toolchain. go.mod's go directive (1.25.10 floor) is intentionally left as-is. --- .github/workflows/ci.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/release.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58354ada7..9c4202052 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ concurrency: env: NODE_VERSION: '22' PNPM_VERSION: '10.33.4' - GO_VERSION: '1.25.10' + GO_VERSION: '1.25.11' # SR-007: least-privilege default. CI runs on every PR (including forks) and # only needs to read the repo. Any job needing more must opt in per-job. diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index ecf7bdb89..fe904d14c 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -32,7 +32,7 @@ jobs: if: matrix.language == 'go' uses: actions/setup-go@v6 with: - go-version: '1.25.10' + go-version: '1.25.11' cache-dependency-path: agent/go.sum - name: Initialize CodeQL diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fbf24220f..f68e8bcfa 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ on: env: NODE_VERSION: '22' PNPM_VERSION: '10.33.4' - GO_VERSION: '1.25.10' + GO_VERSION: '1.25.11' REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} From 6d5b325afc91db2f51297a07bb346813c8125370 Mon Sep 17 00:00:00 2001 From: Billy Dunn Date: Tue, 2 Jun 2026 19:19:16 -0500 Subject: [PATCH 2/3] fix(ci): bump security.yml Go pin too (the actual govulncheck workflow) security.yml runs the Go Vulnerability Check job and had its own GO_VERSION: '1.25.10' (separate from ci.yml). Bumping it to 1.25.11 is what actually makes govulncheck use the patched stdlib. --- .github/workflows/security.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 8d9b0487e..d4aa31482 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -15,7 +15,7 @@ concurrency: env: NODE_VERSION: '22' PNPM_VERSION: '10.33.4' - GO_VERSION: '1.25.10' + GO_VERSION: '1.25.11' permissions: actions: read From 24a2b6ed8b3747f1d9cef3eb63a8034a3528324f Mon Sep 17 00:00:00 2001 From: Billy Dunn Date: Tue, 2 Jun 2026 20:15:14 -0500 Subject: [PATCH 3/3] fix(patches): grace-period prune of stale 'missing' tombstones (#1004) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scan ingest marks every device_patches row status='missing' at the start of a scan, then re-upserts the rows the scan reported. Rows left 'missing' are stale tombstones (e.g. a Linux package upgraded to a new externalId orphans the old row) and accumulated unbounded — US prod had devices with 300-960 'missing' rows. Add pruneStaleTombstones(): after each scan commits, delete this device's 'missing' rows whose updatedAt is older than a grace window (default 7d, PATCH_TOMBSTONE_PRUNE_AFTER_HOURS). updatedAt is the right signal — it's bumped only when a scan actually reports the patch; the bulk mark-missing leaves it untouched (unlike lastCheckedAt, which is refreshed every scan). This realizes the API-only 'grace-period' option from the issue thread safely: the destructive edge Todd flagged (source buckets are coarser than providers; a winget failure under the shared 'third_party' bucket while chocolatey succeeds) self-heals, because the row is re-upserted on the next clean scan inside the window. Only genuinely-removed packages age out. Scoped to one device + org. Tests: 6 integration cases against a real DB (a Drizzle mock would never run the make_interval filter) — stale row pruned; recent row kept (covers empty/zero-item payload + same-bucket self-heal); pending/installed never touched; idempotency; target-device-only; org-scope guard. Existing patches.test.ts unit suite updated (db.delete + and() mocks) and green. --- .../patchTombstonePrune.integration.test.ts | 178 ++++++++++++++++++ apps/api/src/routes/agents/patches.test.ts | 3 + apps/api/src/routes/agents/patches.ts | 41 +++- 3 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts diff --git a/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts b/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts new file mode 100644 index 000000000..8ad5879f9 --- /dev/null +++ b/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts @@ -0,0 +1,178 @@ +/** + * Grace-period prune of stale patch tombstones (#1004). + * + * The scan ingest marks every existing device_patches row `status='missing'` at + * the start of a scan, then re-upserts the rows the scan reported. Rows left at + * 'missing' are tombstones; before this fix they accumulated unbounded (a Linux + * package upgraded to a new externalId orphans the old row forever). + * + * pruneStaleTombstones bounds that growth by deleting 'missing' rows whose + * `updatedAt` (the last scan that actually reported the patch — the bulk + * mark-missing leaves updatedAt untouched) is older than a grace window. These + * tests exercise the real SQL against a real DB (a Drizzle mock would never run + * the make_interval filter), covering Todd's required cases: recent rows survive + * (empty/zero-item payloads, same-bucket partial-provider failures self-heal), + * non-'missing' rows are never touched, idempotency, and device/org isolation. + * + * Prerequisites: + * docker compose -f docker-compose.test.yml up -d + * Run: + * pnpm test:integration -- src/routes/agents/patchTombstonePrune.integration.test.ts + */ +import './setup'; + +import { randomUUID } from 'node:crypto'; +import { describe, it, expect, beforeEach } from 'vitest'; +import { eq } from 'drizzle-orm'; +import { getTestDb } from './setup'; +import { setupTestEnvironment } from './db-utils'; +import { devices, patches, devicePatches } from '../../db/schema'; +import type { Database } from '../../db'; +import { pruneStaleTombstones } from '../../routes/agents/patches'; + +const WINDOW_HOURS = 24; +const OLD = new Date(Date.now() - 200 * 60 * 60 * 1000); // 200h ago (past the window) +const RECENT = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1h ago (inside the window) + +let agentSeq = 0; +async function seedDevice(orgId: string, siteId: string, hostname: string): Promise { + agentSeq++; + const [row] = await getTestDb() + .insert(devices) + .values({ + orgId, + siteId, + agentId: `agent-tombstone-${agentSeq}-${Date.now()}`, + hostname, + displayName: hostname, + osType: 'linux', + osVersion: '22.04', + osBuild: 'jammy', + architecture: 'x86_64', + agentVersion: '0.0.0-test', + status: 'online', + enrolledAt: new Date(), + }) + .returning({ id: devices.id }); + if (!row) throw new Error('seedDevice: no row'); + return row.id; +} + +let patchSeq = 0; +async function seedDevicePatch(opts: { + orgId: string; + deviceId: string; + status: 'pending' | 'installed' | 'missing'; + source?: 'linux' | 'third_party'; + updatedAt: Date; +}): Promise { + patchSeq++; + const source = opts.source ?? 'linux'; + // patches is a global catalog (not truncated per-test) → externalId must be unique. + const [patch] = await getTestDb() + .insert(patches) + .values({ source, externalId: `${source}:${randomUUID()}`, title: `tp-${patchSeq}`, severity: 'unknown' }) + .returning({ id: patches.id }); + if (!patch) throw new Error('seedDevicePatch: no patch'); + const [dp] = await getTestDb() + .insert(devicePatches) + .values({ + deviceId: opts.deviceId, + orgId: opts.orgId, + patchId: patch.id, + status: opts.status, + lastCheckedAt: new Date(), + updatedAt: opts.updatedAt, + }) + .returning({ id: devicePatches.id }); + if (!dp) throw new Error('seedDevicePatch: no device_patch'); + return dp.id; +} + +async function exists(devicePatchId: string): Promise { + const rows = await getTestDb() + .select({ id: devicePatches.id }) + .from(devicePatches) + .where(eq(devicePatches.id, devicePatchId)); + return rows.length > 0; +} + +function prune(deviceId: string, orgId: string): Promise { + return pruneStaleTombstones(getTestDb() as unknown as Database, deviceId, orgId, WINDOW_HOURS); +} + +describe('pruneStaleTombstones (#1004 grace-period prune)', () => { + let orgId: string; + let siteId: string; + + beforeEach(async () => { + const env = await setupTestEnvironment({ scope: 'organization' }); + orgId = env.organization.id; + siteId = env.site.id; + }); + + it('deletes a stale "missing" tombstone older than the window', async () => { + const deviceId = await seedDevice(orgId, siteId, 'stale-host'); + const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD }); + + await prune(deviceId, orgId); + + expect(await exists(stale)).toBe(false); + }); + + it('keeps a "missing" row still inside the window (empty/zero-item scan + same-bucket self-heal)', async () => { + const deviceId = await seedDevice(orgId, siteId, 'recent-host'); + // A row that went missing this cycle but was reported recently — e.g. winget + // failed while chocolatey succeeded under the shared third_party bucket. + const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', source: 'third_party', updatedAt: RECENT }); + + await prune(deviceId, orgId); + + expect(await exists(recent)).toBe(true); + }); + + it('never touches pending or installed rows, however old', async () => { + const deviceId = await seedDevice(orgId, siteId, 'live-host'); + const pending = await seedDevicePatch({ orgId, deviceId, status: 'pending', updatedAt: OLD }); + const installed = await seedDevicePatch({ orgId, deviceId, status: 'installed', updatedAt: OLD }); + + await prune(deviceId, orgId); + + expect(await exists(pending)).toBe(true); + expect(await exists(installed)).toBe(true); + }); + + it('is idempotent — a second prune is a clean no-op', async () => { + const deviceId = await seedDevice(orgId, siteId, 'idempotent-host'); + const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD }); + const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: RECENT }); + + await prune(deviceId, orgId); + await prune(deviceId, orgId); // must not throw, must not affect the survivor + + expect(await exists(stale)).toBe(false); + expect(await exists(recent)).toBe(true); + }); + + it('only prunes the target device — a stale tombstone on another device is untouched', async () => { + const target = await seedDevice(orgId, siteId, 'target-host'); + const other = await seedDevice(orgId, siteId, 'other-host'); + const targetStale = await seedDevicePatch({ orgId, deviceId: target, status: 'missing', updatedAt: OLD }); + const otherStale = await seedDevicePatch({ orgId, deviceId: other, status: 'missing', updatedAt: OLD }); + + await prune(target, orgId); + + expect(await exists(targetStale)).toBe(false); // target pruned + expect(await exists(otherStale)).toBe(true); // other device untouched + }); + + it('does not prune when the orgId does not match the rows (cross-tenant guard)', async () => { + const deviceId = await seedDevice(orgId, siteId, 'org-guard-host'); + const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD }); + + // Same device id but a different org id → the org-scoped WHERE must spare it. + await prune(deviceId, randomUUID()); + + expect(await exists(stale)).toBe(true); + }); +}); diff --git a/apps/api/src/routes/agents/patches.test.ts b/apps/api/src/routes/agents/patches.test.ts index 490d3568b..597b30e9a 100644 --- a/apps/api/src/routes/agents/patches.test.ts +++ b/apps/api/src/routes/agents/patches.test.ts @@ -48,6 +48,7 @@ const tables = vi.hoisted(() => ({ vi.mock('drizzle-orm', () => ({ eq: (left: unknown, right: unknown) => ({ op: 'eq', left, right }), + and: (...conds: unknown[]) => ({ op: 'and', conds }), sql: (strings: TemplateStringsArray, ...values: unknown[]) => ({ op: 'sql', strings: Array.from(strings), @@ -59,6 +60,8 @@ vi.mock('../../db', () => ({ db: { select: vi.fn(), transaction: vi.fn(), + // tombstone prune (#1004) runs after the scan txn via db.delete(...).where(...) + delete: vi.fn(() => ({ where: vi.fn().mockResolvedValue(undefined) })), }, runOutsideDbContext: vi.fn((fn: () => unknown) => fn()), withDbAccessContext: vi.fn(async (_ctx: unknown, fn: () => Promise) => fn()), diff --git a/apps/api/src/routes/agents/patches.ts b/apps/api/src/routes/agents/patches.ts index 72deb8ada..93f7b58c7 100644 --- a/apps/api/src/routes/agents/patches.ts +++ b/apps/api/src/routes/agents/patches.ts @@ -1,7 +1,7 @@ import { Hono } from 'hono'; import { zValidator } from '@hono/zod-validator'; -import { eq, sql } from 'drizzle-orm'; -import { db } from '../../db'; +import { and, eq, sql } from 'drizzle-orm'; +import { db, type Database } from '../../db'; import { devices, patches, devicePatches } from '../../db/schema'; import { enqueueWingetReleaseTest } from '../../jobs/wingetReleaseTestWorker'; import { writeAuditEvent } from '../../services/auditEvents'; @@ -17,6 +17,38 @@ function deriveVendor(packageId: string | null | undefined, fallback: string | n return fallback ?? null; } +/** + * Bound tombstone growth (#1004): delete device_patches rows that have stayed + * 'missing' (absent from every scan) longer than the grace window. + * + * `updatedAt` is bumped only when a scan actually reports the patch — the bulk + * mark-missing in the scan ingest sets `status='missing'` + `lastCheckedAt` but + * leaves `updatedAt` untouched — so `updatedAt` dates the patch's last real + * sighting. A transient partial-provider failure (e.g. winget fails while + * chocolatey succeeds under the shared 'third_party' source bucket) self-heals: + * the row is re-upserted on the next clean scan inside the window, so only + * genuinely-removed packages age out. The window is generous (default 7 days, + * `PATCH_TOMBSTONE_PRUNE_AFTER_HOURS`) so a missed scan never prunes prematurely. + * Scoped to a single device + org (cross-tenant safe). + */ +export async function pruneStaleTombstones( + executor: Database, + deviceId: string, + orgId: string, + pruneAfterHours = Number(process.env.PATCH_TOMBSTONE_PRUNE_AFTER_HOURS) || 168, +): Promise { + await executor + .delete(devicePatches) + .where( + and( + eq(devicePatches.deviceId, deviceId), + eq(devicePatches.orgId, orgId), + eq(devicePatches.status, 'missing'), + sql`${devicePatches.updatedAt} < now() - make_interval(hours => ${pruneAfterHours})`, + ), + ); +} + export const patchesRoutes = new Hono(); patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async (c) => { @@ -204,6 +236,11 @@ patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async } }); + // Prune stale tombstones after the scan commits. Outside the txn on purpose: + // it reads the just-committed state, and a crash before it runs is harmless + // (the next scan prunes). Runs in the same request DB context as the ingest. + await pruneStaleTombstones(db, device.id, device.orgId); + writeAuditEvent(c, { orgId: agent?.orgId ?? device.orgId, actorType: 'agent',