From e9df76a60a62e2c1b31c073283f2f506402a03dd Mon Sep 17 00:00:00 2001
From: Billy Dunn <bdunn@bdunn.com>
Date: Tue, 2 Jun 2026 19:17:01 -0500
Subject: [PATCH 1/3] fix(ci): bump Go toolchain 1.25.10 -> 1.25.11 (patch 2
 stdlib govulncheck findings)

govulncheck (Security Scanning workflow) is failing on main and every open PR with
2 Go standard-library vulnerabilities, both fixed in go1.25.11:
  - GO-2026-5039  net/textproto  (unescaped inputs in errors)
  - GO-2026-5037  crypto/x509    (inefficient candidate hostname parsing)

These are stdlib-only (no third-party module bumps needed); a freshly-published
advisory started failing the scan on unchanged code because CI pinned 1.25.10.
Bump the GO_VERSION / go-version pins in ci.yml, codeql.yml and release.yml so
the scan (and the released agent binary's stdlib) use the patched toolchain.
go.mod's go directive (1.25.10 floor) is intentionally left as-is.
---
 .github/workflows/ci.yml      | 2 +-
 .github/workflows/codeql.yml  | 2 +-
 .github/workflows/release.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 58354ada7..9c4202052 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,7 +24,7 @@ concurrency:
 env:
   NODE_VERSION: '22'
   PNPM_VERSION: '10.33.4'
-  GO_VERSION: '1.25.10'
+  GO_VERSION: '1.25.11'
 
 # SR-007: least-privilege default. CI runs on every PR (including forks) and
 # only needs to read the repo. Any job needing more must opt in per-job.
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index ecf7bdb89..fe904d14c 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -32,7 +32,7 @@ jobs:
         if: matrix.language == 'go'
         uses: actions/setup-go@v6
         with:
-          go-version: '1.25.10'
+          go-version: '1.25.11'
           cache-dependency-path: agent/go.sum
 
       - name: Initialize CodeQL
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index fbf24220f..f68e8bcfa 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -14,7 +14,7 @@ on:
 env:
   NODE_VERSION: '22'
   PNPM_VERSION: '10.33.4'
-  GO_VERSION: '1.25.10'
+  GO_VERSION: '1.25.11'
   REGISTRY: ghcr.io
   IMAGE_NAME: ${{ github.repository }}
 

From 6d5b325afc91db2f51297a07bb346813c8125370 Mon Sep 17 00:00:00 2001
From: Billy Dunn <bdunn@bdunn.com>
Date: Tue, 2 Jun 2026 19:19:16 -0500
Subject: [PATCH 2/3] fix(ci): bump security.yml Go pin too (the actual
 govulncheck workflow)

security.yml runs the Go Vulnerability Check job and had its own GO_VERSION:
'1.25.10' (separate from ci.yml). Bumping it to 1.25.11 is what actually makes
govulncheck use the patched stdlib.
---
 .github/workflows/security.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 8d9b0487e..d4aa31482 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -15,7 +15,7 @@ concurrency:
 env:
   NODE_VERSION: '22'
   PNPM_VERSION: '10.33.4'
-  GO_VERSION: '1.25.10'
+  GO_VERSION: '1.25.11'
 
 permissions:
   actions: read

From 24a2b6ed8b3747f1d9cef3eb63a8034a3528324f Mon Sep 17 00:00:00 2001
From: Billy Dunn <bdunn@bdunn.com>
Date: Tue, 2 Jun 2026 20:15:14 -0500
Subject: [PATCH 3/3] fix(patches): grace-period prune of stale 'missing'
 tombstones (#1004)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The scan ingest marks every device_patches row status='missing' at the start of
a scan, then re-upserts the rows the scan reported. Rows left 'missing' are stale
tombstones (e.g. a Linux package upgraded to a new externalId orphans the old
row) and accumulated unbounded — US prod had devices with 300-960 'missing' rows.

Add pruneStaleTombstones(): after each scan commits, delete this device's
'missing' rows whose updatedAt is older than a grace window (default 7d,
PATCH_TOMBSTONE_PRUNE_AFTER_HOURS). updatedAt is the right signal — it's bumped
only when a scan actually reports the patch; the bulk mark-missing leaves it
untouched (unlike lastCheckedAt, which is refreshed every scan). This realizes
the API-only 'grace-period' option from the issue thread safely: the destructive
edge Todd flagged (source buckets are coarser than providers; a winget failure
under the shared 'third_party' bucket while chocolatey succeeds) self-heals,
because the row is re-upserted on the next clean scan inside the window. Only
genuinely-removed packages age out. Scoped to one device + org.

Tests: 6 integration cases against a real DB (a Drizzle mock would never run the
make_interval filter) — stale row pruned; recent row kept (covers empty/zero-item
payload + same-bucket self-heal); pending/installed never touched; idempotency;
target-device-only; org-scope guard. Existing patches.test.ts unit suite updated
(db.delete + and() mocks) and green.
---
 .../patchTombstonePrune.integration.test.ts   | 178 ++++++++++++++++++
 apps/api/src/routes/agents/patches.test.ts    |   3 +
 apps/api/src/routes/agents/patches.ts         |  41 +++-
 3 files changed, 220 insertions(+), 2 deletions(-)
 create mode 100644 apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts

diff --git a/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts b/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts
new file mode 100644
index 000000000..8ad5879f9
--- /dev/null
+++ b/apps/api/src/__tests__/integration/patchTombstonePrune.integration.test.ts
@@ -0,0 +1,178 @@
+/**
+ * Grace-period prune of stale patch tombstones (#1004).
+ *
+ * The scan ingest marks every existing device_patches row `status='missing'` at
+ * the start of a scan, then re-upserts the rows the scan reported. Rows left at
+ * 'missing' are tombstones; before this fix they accumulated unbounded (a Linux
+ * package upgraded to a new externalId orphans the old row forever).
+ *
+ * pruneStaleTombstones bounds that growth by deleting 'missing' rows whose
+ * `updatedAt` (the last scan that actually reported the patch — the bulk
+ * mark-missing leaves updatedAt untouched) is older than a grace window. These
+ * tests exercise the real SQL against a real DB (a Drizzle mock would never run
+ * the make_interval filter), covering Todd's required cases: recent rows survive
+ * (empty/zero-item payloads, same-bucket partial-provider failures self-heal),
+ * non-'missing' rows are never touched, idempotency, and device/org isolation.
+ *
+ * Prerequisites:
+ *   docker compose -f docker-compose.test.yml up -d
+ * Run:
+ *   pnpm test:integration -- src/routes/agents/patchTombstonePrune.integration.test.ts
+ */
+import './setup';
+
+import { randomUUID } from 'node:crypto';
+import { describe, it, expect, beforeEach } from 'vitest';
+import { eq } from 'drizzle-orm';
+import { getTestDb } from './setup';
+import { setupTestEnvironment } from './db-utils';
+import { devices, patches, devicePatches } from '../../db/schema';
+import type { Database } from '../../db';
+import { pruneStaleTombstones } from '../../routes/agents/patches';
+
+const WINDOW_HOURS = 24;
+const OLD = new Date(Date.now() - 200 * 60 * 60 * 1000); // 200h ago (past the window)
+const RECENT = new Date(Date.now() - 1 * 60 * 60 * 1000); // 1h ago (inside the window)
+
+let agentSeq = 0;
+async function seedDevice(orgId: string, siteId: string, hostname: string): Promise<string> {
+  agentSeq++;
+  const [row] = await getTestDb()
+    .insert(devices)
+    .values({
+      orgId,
+      siteId,
+      agentId: `agent-tombstone-${agentSeq}-${Date.now()}`,
+      hostname,
+      displayName: hostname,
+      osType: 'linux',
+      osVersion: '22.04',
+      osBuild: 'jammy',
+      architecture: 'x86_64',
+      agentVersion: '0.0.0-test',
+      status: 'online',
+      enrolledAt: new Date(),
+    })
+    .returning({ id: devices.id });
+  if (!row) throw new Error('seedDevice: no row');
+  return row.id;
+}
+
+let patchSeq = 0;
+async function seedDevicePatch(opts: {
+  orgId: string;
+  deviceId: string;
+  status: 'pending' | 'installed' | 'missing';
+  source?: 'linux' | 'third_party';
+  updatedAt: Date;
+}): Promise<string> {
+  patchSeq++;
+  const source = opts.source ?? 'linux';
+  // patches is a global catalog (not truncated per-test) → externalId must be unique.
+  const [patch] = await getTestDb()
+    .insert(patches)
+    .values({ source, externalId: `${source}:${randomUUID()}`, title: `tp-${patchSeq}`, severity: 'unknown' })
+    .returning({ id: patches.id });
+  if (!patch) throw new Error('seedDevicePatch: no patch');
+  const [dp] = await getTestDb()
+    .insert(devicePatches)
+    .values({
+      deviceId: opts.deviceId,
+      orgId: opts.orgId,
+      patchId: patch.id,
+      status: opts.status,
+      lastCheckedAt: new Date(),
+      updatedAt: opts.updatedAt,
+    })
+    .returning({ id: devicePatches.id });
+  if (!dp) throw new Error('seedDevicePatch: no device_patch');
+  return dp.id;
+}
+
+async function exists(devicePatchId: string): Promise<boolean> {
+  const rows = await getTestDb()
+    .select({ id: devicePatches.id })
+    .from(devicePatches)
+    .where(eq(devicePatches.id, devicePatchId));
+  return rows.length > 0;
+}
+
+function prune(deviceId: string, orgId: string): Promise<void> {
+  return pruneStaleTombstones(getTestDb() as unknown as Database, deviceId, orgId, WINDOW_HOURS);
+}
+
+describe('pruneStaleTombstones (#1004 grace-period prune)', () => {
+  let orgId: string;
+  let siteId: string;
+
+  beforeEach(async () => {
+    const env = await setupTestEnvironment({ scope: 'organization' });
+    orgId = env.organization.id;
+    siteId = env.site.id;
+  });
+
+  it('deletes a stale "missing" tombstone older than the window', async () => {
+    const deviceId = await seedDevice(orgId, siteId, 'stale-host');
+    const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });
+
+    await prune(deviceId, orgId);
+
+    expect(await exists(stale)).toBe(false);
+  });
+
+  it('keeps a "missing" row still inside the window (empty/zero-item scan + same-bucket self-heal)', async () => {
+    const deviceId = await seedDevice(orgId, siteId, 'recent-host');
+    // A row that went missing this cycle but was reported recently — e.g. winget
+    // failed while chocolatey succeeded under the shared third_party bucket.
+    const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', source: 'third_party', updatedAt: RECENT });
+
+    await prune(deviceId, orgId);
+
+    expect(await exists(recent)).toBe(true);
+  });
+
+  it('never touches pending or installed rows, however old', async () => {
+    const deviceId = await seedDevice(orgId, siteId, 'live-host');
+    const pending = await seedDevicePatch({ orgId, deviceId, status: 'pending', updatedAt: OLD });
+    const installed = await seedDevicePatch({ orgId, deviceId, status: 'installed', updatedAt: OLD });
+
+    await prune(deviceId, orgId);
+
+    expect(await exists(pending)).toBe(true);
+    expect(await exists(installed)).toBe(true);
+  });
+
+  it('is idempotent — a second prune is a clean no-op', async () => {
+    const deviceId = await seedDevice(orgId, siteId, 'idempotent-host');
+    const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });
+    const recent = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: RECENT });
+
+    await prune(deviceId, orgId);
+    await prune(deviceId, orgId); // must not throw, must not affect the survivor
+
+    expect(await exists(stale)).toBe(false);
+    expect(await exists(recent)).toBe(true);
+  });
+
+  it('only prunes the target device — a stale tombstone on another device is untouched', async () => {
+    const target = await seedDevice(orgId, siteId, 'target-host');
+    const other = await seedDevice(orgId, siteId, 'other-host');
+    const targetStale = await seedDevicePatch({ orgId, deviceId: target, status: 'missing', updatedAt: OLD });
+    const otherStale = await seedDevicePatch({ orgId, deviceId: other, status: 'missing', updatedAt: OLD });
+
+    await prune(target, orgId);
+
+    expect(await exists(targetStale)).toBe(false); // target pruned
+    expect(await exists(otherStale)).toBe(true); // other device untouched
+  });
+
+  it('does not prune when the orgId does not match the rows (cross-tenant guard)', async () => {
+    const deviceId = await seedDevice(orgId, siteId, 'org-guard-host');
+    const stale = await seedDevicePatch({ orgId, deviceId, status: 'missing', updatedAt: OLD });
+
+    // Same device id but a different org id → the org-scoped WHERE must spare it.
+    await prune(deviceId, randomUUID());
+
+    expect(await exists(stale)).toBe(true);
+  });
+});
diff --git a/apps/api/src/routes/agents/patches.test.ts b/apps/api/src/routes/agents/patches.test.ts
index 490d3568b..597b30e9a 100644
--- a/apps/api/src/routes/agents/patches.test.ts
+++ b/apps/api/src/routes/agents/patches.test.ts
@@ -48,6 +48,7 @@ const tables = vi.hoisted(() => ({
 
 vi.mock('drizzle-orm', () => ({
   eq: (left: unknown, right: unknown) => ({ op: 'eq', left, right }),
+  and: (...conds: unknown[]) => ({ op: 'and', conds }),
   sql: (strings: TemplateStringsArray, ...values: unknown[]) => ({
     op: 'sql',
     strings: Array.from(strings),
@@ -59,6 +60,8 @@ vi.mock('../../db', () => ({
   db: {
     select: vi.fn(),
     transaction: vi.fn(),
+    // tombstone prune (#1004) runs after the scan txn via db.delete(...).where(...)
+    delete: vi.fn(() => ({ where: vi.fn().mockResolvedValue(undefined) })),
   },
   runOutsideDbContext: vi.fn((fn: () => unknown) => fn()),
   withDbAccessContext: vi.fn(async (_ctx: unknown, fn: () => Promise<unknown>) => fn()),
diff --git a/apps/api/src/routes/agents/patches.ts b/apps/api/src/routes/agents/patches.ts
index 72deb8ada..93f7b58c7 100644
--- a/apps/api/src/routes/agents/patches.ts
+++ b/apps/api/src/routes/agents/patches.ts
@@ -1,7 +1,7 @@
 import { Hono } from 'hono';
 import { zValidator } from '@hono/zod-validator';
-import { eq, sql } from 'drizzle-orm';
-import { db } from '../../db';
+import { and, eq, sql } from 'drizzle-orm';
+import { db, type Database } from '../../db';
 import { devices, patches, devicePatches } from '../../db/schema';
 import { enqueueWingetReleaseTest } from '../../jobs/wingetReleaseTestWorker';
 import { writeAuditEvent } from '../../services/auditEvents';
@@ -17,6 +17,38 @@ function deriveVendor(packageId: string | null | undefined, fallback: string | n
   return fallback ?? null;
 }
 
+/**
+ * Bound tombstone growth (#1004): delete device_patches rows that have stayed
+ * 'missing' (absent from every scan) longer than the grace window.
+ *
+ * `updatedAt` is bumped only when a scan actually reports the patch — the bulk
+ * mark-missing in the scan ingest sets `status='missing'` + `lastCheckedAt` but
+ * leaves `updatedAt` untouched — so `updatedAt` dates the patch's last real
+ * sighting. A transient partial-provider failure (e.g. winget fails while
+ * chocolatey succeeds under the shared 'third_party' source bucket) self-heals:
+ * the row is re-upserted on the next clean scan inside the window, so only
+ * genuinely-removed packages age out. The window is generous (default 7 days,
+ * `PATCH_TOMBSTONE_PRUNE_AFTER_HOURS`) so a missed scan never prunes prematurely.
+ * Scoped to a single device + org (cross-tenant safe).
+ */
+export async function pruneStaleTombstones(
+  executor: Database,
+  deviceId: string,
+  orgId: string,
+  pruneAfterHours = Number(process.env.PATCH_TOMBSTONE_PRUNE_AFTER_HOURS) || 168,
+): Promise<void> {
+  await executor
+    .delete(devicePatches)
+    .where(
+      and(
+        eq(devicePatches.deviceId, deviceId),
+        eq(devicePatches.orgId, orgId),
+        eq(devicePatches.status, 'missing'),
+        sql`${devicePatches.updatedAt} < now() - make_interval(hours => ${pruneAfterHours})`,
+      ),
+    );
+}
+
 export const patchesRoutes = new Hono();
 
 patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async (c) => {
@@ -204,6 +236,11 @@ patchesRoutes.put('/:id/patches', zValidator('json', submitPatchesSchema), async
     }
   });
 
+  // Prune stale tombstones after the scan commits. Outside the txn on purpose:
+  // it reads the just-committed state, and a crash before it runs is harmless
+  // (the next scan prunes). Runs in the same request DB context as the ingest.
+  await pruneStaleTombstones(db, device.id, device.orgId);
+
   writeAuditEvent(c, {
     orgId: agent?.orgId ?? device.orgId,
     actorType: 'agent',