diff --git a/graphile/graphile-search/src/__tests__/search-config-integration.test.ts b/graphile/graphile-search/src/__tests__/search-config-integration.test.ts new file mode 100644 index 000000000..84b8afa42 --- /dev/null +++ b/graphile/graphile-search/src/__tests__/search-config-integration.test.ts @@ -0,0 +1,632 @@ +/** + * Integration tests for @searchConfig smart tag (Phase D) and @hasChunks + * chunk-aware querying (Phase E), plus Phase I schema-time validation. + * + * These tests run against a real PostgreSQL database with tables tagged via + * a custom Graphile plugin that injects smart tags programmatically (since + * @searchConfig and @hasChunks are JSON objects, not simple strings). + */ + +import { join } from 'path'; +import { getConnections, seed } from 'graphile-test'; +import type { GraphQLResponse } from 'graphile-test'; +import type { PgTestClient } from 'pgsql-test'; +import { ConnectionFilterPreset } from 'graphile-connection-filter'; +import { Bm25CodecPlugin } from '../codecs/bm25-codec'; +import { VectorCodecPlugin } from '../codecs/vector-codec'; +import { TsvectorCodecPlugin } from '../codecs/tsvector-codec'; +import { createUnifiedSearchPlugin } from '../plugin'; +import { createTsvectorAdapter } from '../adapters/tsvector'; +import { createBm25Adapter } from '../adapters/bm25'; +import { createTrgmAdapter } from '../adapters/trgm'; +import { createPgvectorAdapter } from '../adapters/pgvector'; +import type { GraphileConfig } from 'graphile-config'; + +// ─── Smart Tags Plugin ─────────────────────────────────────────────────────── +// Injects @searchConfig and @hasChunks smart tags on test tables during the +// schema build phase. This is necessary because these tags are JSON objects +// that can't be set via SQL COMMENT ON statements. 
+ +function makeTestSmartTagsPlugin( + tagsByTable: Record> +): GraphileConfig.Plugin { + return { + name: 'TestSmartTagsPlugin', + version: '1.0.0', + + schema: { + hooks: { + // Run early in the build phase to inject tags before other plugins read them + init: { + before: ['UnifiedSearchPlugin'], + callback(_, build) { + // Iterate over all codecs and inject tags on matching tables + for (const codec of Object.values(build.input.pgRegistry.pgCodecs)) { + const c = codec as any; + if (!c.attributes || !c.name) continue; + + const tags = tagsByTable[c.name]; + if (!tags) continue; + + // Ensure extensions.tags exists + if (!c.extensions) c.extensions = {}; + if (!c.extensions.tags) c.extensions.tags = {}; + + Object.assign(c.extensions.tags, tags); + } + return _; + }, + }, + }, + }, + }; +} + +// ─── Result types ──────────────────────────────────────────────────────────── + +interface ArticleNode { + rowId: number; + title: string; + tsvRank: number | null; + bodyBm25Score: number | null; + embeddingVectorDistance: number | null; + searchScore: number | null; +} + +interface AllArticlesResult { + allArticles: { + nodes: ArticleNode[]; + }; +} + +interface PostNode { + rowId: number; + title: string; + embeddingVectorDistance: number | null; + searchScore: number | null; +} + +interface AllPostsResult { + allPosts: { + nodes: PostNode[]; + }; +} + +type QueryFn = ( + query: string, + variables?: Record +) => Promise>; + +// ─── Test Suite: @searchConfig with custom weights ─────────────────────────── + +describe('@searchConfig integration tests', () => { + let db: PgTestClient; + let teardown: () => Promise; + let query: QueryFn; + + beforeAll(async () => { + const unifiedPlugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createBm25Adapter(), + createTrgmAdapter({ defaultThreshold: 0.1 }), + createPgvectorAdapter(), + ], + enableSearchScore: true, + enableFullTextSearch: true, + }); + + // Inject @searchConfig on the articles table 
with custom weights + const smartTagsPlugin = makeTestSmartTagsPlugin({ + articles: { + searchConfig: { + weights: { tsv: 0.7, bm25: 0.2, vector: 0.1 }, + normalization: 'linear', + boost_recent: true, + boost_recency_field: 'updated_at', + boost_recency_decay: 0.9, + }, + }, + }); + + const testPreset = { + extends: [ConnectionFilterPreset()], + plugins: [ + TsvectorCodecPlugin, + Bm25CodecPlugin, + VectorCodecPlugin, + smartTagsPlugin, + unifiedPlugin, + ], + }; + + const connections = await getConnections( + { + schemas: ['unified_search_test'], + preset: testPreset, + useRoot: true, + authRole: 'postgres', + }, + [seed.sqlfile([join(__dirname, './setup.sql')])] + ); + + db = connections.db; + teardown = connections.teardown; + query = connections.query; + + await db.client.query('BEGIN'); + }); + + afterAll(async () => { + if (db) { + try { + await db.client.query('ROLLBACK'); + } catch { + // Ignore rollback errors + } + } + if (teardown) { + await teardown(); + } + }); + + beforeEach(async () => { + await db.beforeEach(); + }); + + afterEach(async () => { + await db.afterEach(); + }); + + it('returns searchScore for articles with @searchConfig weights applied', async () => { + const result = await query(` + query { + allArticles(where: { + tsvTsv: "database" + }) { + nodes { + rowId + title + tsvRank + searchScore + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allArticles?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + for (const node of nodes!) { + expect(typeof node.searchScore).toBe('number'); + expect(node.searchScore).toBeGreaterThanOrEqual(0); + expect(node.searchScore).toBeLessThanOrEqual(1); + } + }); + + it('returns searchScore with recency boost (newer articles score higher)', async () => { + // Query articles matching "database" — article 1 (PostgreSQL, 2025-01) and + // article 3 (Indexing, 2026-01) should both match. 
Article 3 is newer, so + // with recency boost its searchScore should be at least as high. + const result = await query(` + query { + allArticles(where: { + tsvTsv: "database" + }, orderBy: SEARCH_SCORE_DESC) { + nodes { + rowId + title + tsvRank + searchScore + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allArticles?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + // All returned nodes should have valid search scores + for (const node of nodes!) { + expect(typeof node.searchScore).toBe('number'); + expect(node.searchScore).toBeGreaterThanOrEqual(0); + expect(node.searchScore).toBeLessThanOrEqual(1); + } + }); + + it('applies per-table weights (tsv weighted 0.7) and produces valid scores', async () => { + // Use combined search to verify per-table weights produce valid composite scores + const result = await query(` + query { + allArticles(where: { + tsvTsv: "database" + vectorEmbedding: { vector: [1, 0, 0], metric: COSINE } + }) { + nodes { + rowId + title + tsvRank + embeddingVectorDistance + searchScore + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allArticles?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + for (const node of nodes!) 
{ + // Both individual scores should be populated + expect(typeof node.tsvRank).toBe('number'); + expect(typeof node.embeddingVectorDistance).toBe('number'); + // Composite should be in valid range + expect(typeof node.searchScore).toBe('number'); + expect(node.searchScore).toBeGreaterThanOrEqual(0); + expect(node.searchScore).toBeLessThanOrEqual(1); + } + }); +}); + +// ─── Test Suite: @searchConfig with sigmoid normalization ──────────────────── + +describe('@searchConfig with sigmoid normalization', () => { + let teardown: () => Promise; + let query: QueryFn; + + beforeAll(async () => { + const unifiedPlugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createBm25Adapter(), + createTrgmAdapter({ defaultThreshold: 0.1 }), + createPgvectorAdapter(), + ], + enableSearchScore: true, + enableFullTextSearch: true, + }); + + // Inject @searchConfig with sigmoid normalization + const smartTagsPlugin = makeTestSmartTagsPlugin({ + articles: { + searchConfig: { + normalization: 'sigmoid', + }, + }, + }); + + const testPreset = { + extends: [ConnectionFilterPreset()], + plugins: [ + TsvectorCodecPlugin, + Bm25CodecPlugin, + VectorCodecPlugin, + smartTagsPlugin, + unifiedPlugin, + ], + }; + + const connections = await getConnections( + { + schemas: ['unified_search_test'], + preset: testPreset, + useRoot: true, + authRole: 'postgres', + }, + [seed.sqlfile([join(__dirname, './setup.sql')])] + ); + + teardown = connections.teardown; + query = connections.query; + }); + + afterAll(async () => { + if (teardown) { + await teardown(); + } + }); + + it('produces valid scores with sigmoid normalization forced', async () => { + const result = await query(` + query { + allArticles(where: { + tsvTsv: "database" + }) { + nodes { + rowId + title + tsvRank + searchScore + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allArticles?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + for 
(const node of nodes!) { + expect(typeof node.searchScore).toBe('number'); + // Sigmoid normalization always produces values in (0, 1) + expect(node.searchScore).toBeGreaterThan(0); + expect(node.searchScore).toBeLessThan(1); + } + }); +}); + +// ─── Test Suite: @hasChunks chunk-aware querying ───────────────────────────── + +describe('@hasChunks chunk-aware querying integration', () => { + let db: PgTestClient; + let teardown: () => Promise; + let query: QueryFn; + + beforeAll(async () => { + const unifiedPlugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createPgvectorAdapter(), + ], + enableSearchScore: true, + }); + + // Inject @hasChunks on the posts table pointing to posts_chunks + const smartTagsPlugin = makeTestSmartTagsPlugin({ + posts: { + hasChunks: { + chunksTable: 'posts_chunks', + parentFk: 'post_id', + parentPk: 'id', + embeddingField: 'embedding', + }, + }, + }); + + const testPreset = { + extends: [ConnectionFilterPreset()], + plugins: [ + TsvectorCodecPlugin, + VectorCodecPlugin, + smartTagsPlugin, + unifiedPlugin, + ], + }; + + const connections = await getConnections( + { + schemas: ['unified_search_test'], + preset: testPreset, + useRoot: true, + authRole: 'postgres', + }, + [seed.sqlfile([join(__dirname, './setup.sql')])] + ); + + db = connections.db; + teardown = connections.teardown; + query = connections.query; + + await db.client.query('BEGIN'); + }); + + afterAll(async () => { + if (db) { + try { + await db.client.query('ROLLBACK'); + } catch { + // Ignore rollback errors + } + } + if (teardown) { + await teardown(); + } + }); + + beforeEach(async () => { + await db.beforeEach(); + }); + + afterEach(async () => { + await db.afterEach(); + }); + + it('returns chunk-aware distance (LEAST of parent + chunks)', async () => { + // Query with vector close to [1,0,0]. Post 1 parent is [0.5,0.5,0] but + // chunk 1 is [0.95,0.05,0] — much closer. 
The chunk-aware query should + // return the smaller distance from the chunk. + const result = await query(` + query { + allPosts(where: { + vectorEmbedding: { vector: [1, 0, 0], metric: COSINE } + }) { + nodes { + rowId + title + embeddingVectorDistance + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allPosts?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + // Find post 1 + const post1 = nodes!.find((n) => n.rowId === 1); + expect(post1).toBeDefined(); + expect(typeof post1!.embeddingVectorDistance).toBe('number'); + + // Post 1's chunk [0.95, 0.05, 0] is much closer to [1,0,0] than + // the parent [0.5, 0.5, 0]. Cosine distance of chunk ≈ 0.05 + // while parent ≈ 0.29. Chunk-aware should use the smaller value. + expect(post1!.embeddingVectorDistance).toBeLessThan(0.15); + }); + + it('returns standard distance when includeChunks is false', async () => { + const result = await query(` + query { + allPosts(where: { + vectorEmbedding: { vector: [1, 0, 0], metric: COSINE, includeChunks: false } + }) { + nodes { + rowId + title + embeddingVectorDistance + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allPosts?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + // Find post 1 — without chunks, distance is from parent [0.5, 0.5, 0] + const post1 = nodes!.find((n) => n.rowId === 1); + expect(post1).toBeDefined(); + expect(typeof post1!.embeddingVectorDistance).toBe('number'); + + // Parent-only distance should be larger than the chunk-aware distance + // Parent [0.5, 0.5, 0] to [1, 0, 0] ≈ 0.29 cosine distance + expect(post1!.embeddingVectorDistance).toBeGreaterThan(0.15); + }); + + it('applies distance threshold with chunk-aware query', async () => { + // Use a tight distance threshold that only the closest chunk satisfies + const result = await query(` + query { + allPosts(where: { + vectorEmbedding: { vector: [1, 
0, 0], metric: COSINE, distance: 0.1 } + }) { + nodes { + rowId + title + embeddingVectorDistance + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allPosts?.nodes; + expect(nodes).toBeDefined(); + + // Only post 1 should match (chunk [0.95, 0.05, 0] is within 0.1 of [1,0,0]) + // Post 2's closest chunk [0.2, 0.8, 0.1] is much farther + for (const node of nodes!) { + expect(node.embeddingVectorDistance).toBeLessThanOrEqual(0.1); + } + }); +}); + +// ─── Test Suite: Phase I validation — nonexistent recency field ────────────── + +describe('Phase I: schema-time validation of boost_recency_field', () => { + let teardown: () => Promise; + let query: QueryFn; + let consoleWarnSpy: jest.SpyInstance; + + beforeAll(async () => { + consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => {}); + + const unifiedPlugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createPgvectorAdapter(), + ], + enableSearchScore: true, + }); + + // Inject @searchConfig with a nonexistent recency field — should trigger + // Phase I validation and gracefully disable recency boost + const smartTagsPlugin = makeTestSmartTagsPlugin({ + // Use the "documents" table which does NOT have an "updated_at" column + documents: { + searchConfig: { + boost_recent: true, + boost_recency_field: 'nonexistent_field', + }, + }, + }); + + const testPreset = { + extends: [ConnectionFilterPreset()], + plugins: [ + TsvectorCodecPlugin, + VectorCodecPlugin, + smartTagsPlugin, + unifiedPlugin, + ], + }; + + const connections = await getConnections( + { + schemas: ['unified_search_test'], + preset: testPreset, + useRoot: true, + authRole: 'postgres', + }, + [seed.sqlfile([join(__dirname, './setup.sql')])] + ); + + teardown = connections.teardown; + query = connections.query; + }); + + afterAll(async () => { + consoleWarnSpy.mockRestore(); + if (teardown) { + await teardown(); + } + }); + + it('warns about nonexistent recency field and does 
not crash', async () => { + // The schema should have built without crashing + // Verify by running a query that exercises searchScore + const result = await query(` + query { + allDocuments(where: { + tsvTsv: "machine learning" + }) { + nodes { + title + searchScore + } + } + } + `); + + expect(result.errors).toBeUndefined(); + const nodes = result.data?.allDocuments?.nodes; + expect(nodes).toBeDefined(); + expect(nodes!.length).toBeGreaterThan(0); + + // searchScore should still work (just without recency boost) + for (const node of nodes!) { + expect(typeof node.searchScore).toBe('number'); + expect(node.searchScore).toBeGreaterThanOrEqual(0); + expect(node.searchScore).toBeLessThanOrEqual(1); + } + }); + + it('emitted a console.warn about the missing field', () => { + // Phase I validation should have warned during schema build + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('nonexistent_field') + ); + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining('not found on table') + ); + }); +}); diff --git a/graphile/graphile-search/src/__tests__/search-config.test.ts b/graphile/graphile-search/src/__tests__/search-config.test.ts new file mode 100644 index 000000000..bd64e45f6 --- /dev/null +++ b/graphile/graphile-search/src/__tests__/search-config.test.ts @@ -0,0 +1,413 @@ +/** + * Tests for Phase D (per-table @searchConfig smart tag) and Phase E (chunk querying). + * + * These tests verify the pure helper functions extracted from plugin.ts and pgvector.ts + * without requiring a database connection. + */ + +// We need to test the internal helpers. Since they're not exported, +// we test them indirectly through the adapter/plugin behavior using mocked codecs. 
+ +import { createPgvectorAdapter } from '../adapters/pgvector'; +import { createUnifiedSearchPlugin } from '../plugin'; +import { createTsvectorAdapter } from '../adapters/tsvector'; +import { createBm25Adapter } from '../adapters/bm25'; + +// ─── pgvector adapter: chunk detection ──────────────────────────────────────── + +describe('pgvector adapter — chunk querying (Phase E)', () => { + describe('detectColumns with @hasChunks smart tag', () => { + const adapter = createPgvectorAdapter(); + + it('detects vector columns without chunks info when no @hasChunks tag', () => { + const codec = { + name: 'documents', + attributes: { + id: { codec: { name: 'uuid' } }, + embedding: { codec: { name: 'vector' } }, + }, + extensions: { tags: {} }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].attributeName).toBe('embedding'); + expect(columns[0].adapterData).toBeUndefined(); + }); + + it('includes chunksInfo when @hasChunks smart tag has metadata', () => { + const codec = { + name: 'documents', + attributes: { + id: { codec: { name: 'uuid' } }, + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { + hasChunks: { + chunksTable: 'documents_chunks', + parentFk: 'document_id', + embeddingField: 'embedding', + }, + }, + pg: { schemaName: 'app_public' }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].attributeName).toBe('embedding'); + expect(columns[0].adapterData).toEqual({ + chunksInfo: { + chunksSchema: 'app_public', + chunksTableName: 'documents_chunks', + parentFkField: 'document_id', + parentPkField: 'id', + embeddingField: 'embedding', + }, + }); + }); + + it('parses JSON-encoded @hasChunks smart tag', () => { + const codec = { + name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { + hasChunks: JSON.stringify({ + chunksTable: 'doc_chunks', + chunksSchema: 
'private_schema', + parentFk: 'doc_id', + parentPk: 'doc_uuid', + embeddingField: 'vec', + }), + }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].adapterData).toEqual({ + chunksInfo: { + chunksSchema: 'private_schema', + chunksTableName: 'doc_chunks', + parentFkField: 'doc_id', + parentPkField: 'doc_uuid', + embeddingField: 'vec', + }, + }); + }); + + it('uses default parentFk, parentPk, and embeddingField when not specified', () => { + const codec = { + name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { + hasChunks: { chunksTable: 'my_chunks' }, + }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns[0].adapterData).toEqual({ + chunksInfo: { + chunksSchema: null, + chunksTableName: 'my_chunks', + parentFkField: 'parent_id', + parentPkField: 'id', + embeddingField: 'embedding', + }, + }); + }); + + it('inherits schema from parent codec when not explicitly set', () => { + const codec = { + name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { + hasChunks: { chunksTable: 'my_chunks' }, + }, + pg: { schemaName: 'my_schema' }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns[0].adapterData).toEqual({ + chunksInfo: { + chunksSchema: 'my_schema', + chunksTableName: 'my_chunks', + parentFkField: 'parent_id', + parentPkField: 'id', + embeddingField: 'embedding', + }, + }); + }); + + it('ignores boolean true @hasChunks (no metadata to resolve)', () => { + const codec = { + name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { hasChunks: true }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].adapterData).toBeUndefined(); + }); + + it('ignores invalid JSON in @hasChunks string', () => { + const codec = { 
+ name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { hasChunks: 'not-valid-json' }, + }, + }; + + const columns = adapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].adapterData).toBeUndefined(); + }); + + it('does not detect chunks when enableChunkQuerying is false', () => { + const noChunksAdapter = createPgvectorAdapter({ enableChunkQuerying: false }); + const codec = { + name: 'documents', + attributes: { + embedding: { codec: { name: 'vector' } }, + }, + extensions: { + tags: { + hasChunks: { chunksTable: 'doc_chunks' }, + }, + }, + }; + + const columns = noChunksAdapter.detectColumns(codec, {}); + expect(columns).toHaveLength(1); + expect(columns[0].adapterData).toBeUndefined(); + }); + }); + + describe('buildFilterApply with chunks', () => { + const adapter = createPgvectorAdapter(); + + // Mock sql object that mimics pg-sql2 behavior + const mockSql = { + identifier: (name: string) => `"${name}"`, + value: (val: any) => `'${val}'`, + raw: (s: string) => s, + fragment: (strings: TemplateStringsArray, ...values: any[]) => { + let result = ''; + strings.forEach((str, i) => { + result += str; + if (i < values.length) result += String(values[i]); + }); + return result; + }, + join: (parts: any[], sep: string) => parts.join(sep), + parens: (expr: any) => `(${expr})`, + }; + // Make sql a tagged template function too + const sql = Object.assign( + (strings: TemplateStringsArray, ...values: any[]) => { + let result = ''; + strings.forEach((str: string, i: number) => { + result += str; + if (i < values.length) result += String(values[i]); + }); + return result; + }, + mockSql, + ); + + it('generates standard query when no chunks info', () => { + const result = adapter.buildFilterApply( + sql, + 'tbl' as any, + { attributeName: 'embedding' }, + { vector: [1, 0, 0], metric: 'COSINE' }, + {}, + ); + + expect(result).not.toBeNull(); + 
expect(result!.scoreExpression).toBeDefined(); + // Standard query should NOT contain chunks table reference + expect(String(result!.scoreExpression)).not.toContain('chunks'); + }); + + it('generates chunk-aware query when chunks info is present', () => { + const result = adapter.buildFilterApply( + sql, + 'tbl' as any, + { + attributeName: 'embedding', + adapterData: { + chunksInfo: { + chunksSchema: null, + chunksTableName: 'documents_chunks', + parentFkField: 'document_id', + parentPkField: 'id', + embeddingField: 'embedding', + }, + }, + }, + { vector: [1, 0, 0], metric: 'COSINE' }, + {}, + ); + + expect(result).not.toBeNull(); + // Chunk-aware query should contain LEAST and chunks table reference + const scoreStr = String(result!.scoreExpression); + expect(scoreStr).toContain('LEAST'); + expect(scoreStr).toContain('documents_chunks'); + }); + + it('generates standard query when includeChunks is false', () => { + const result = adapter.buildFilterApply( + sql, + 'tbl' as any, + { + attributeName: 'embedding', + adapterData: { + chunksInfo: { + chunksSchema: null, + chunksTableName: 'documents_chunks', + parentFkField: 'document_id', + parentPkField: 'id', + embeddingField: 'embedding', + }, + }, + }, + { vector: [1, 0, 0], metric: 'COSINE', includeChunks: false }, + {}, + ); + + expect(result).not.toBeNull(); + // With includeChunks: false, should NOT use chunk query + const scoreStr = String(result!.scoreExpression); + expect(scoreStr).not.toContain('LEAST'); + expect(scoreStr).not.toContain('documents_chunks'); + }); + + it('generates schema-qualified chunk query when chunksSchema is set', () => { + const result = adapter.buildFilterApply( + sql, + 'tbl' as any, + { + attributeName: 'embedding', + adapterData: { + chunksInfo: { + chunksSchema: 'app_private', + chunksTableName: 'doc_chunks', + parentFkField: 'document_id', + parentPkField: 'row_id', + embeddingField: 'vec', + }, + }, + }, + { vector: [1, 0, 0], metric: 'COSINE' }, + {}, + ); + + 
expect(result).not.toBeNull(); + const scoreStr = String(result!.scoreExpression); + // Should contain both schema and table name + expect(scoreStr).toContain('app_private'); + expect(scoreStr).toContain('doc_chunks'); + expect(scoreStr).toContain('LEAST'); + }); + }); +}); + +// ─── Plugin: per-table @searchConfig smart tag (Phase D) ────────────────────── + +describe('per-table @searchConfig smart tag (Phase D)', () => { + describe('createUnifiedSearchPlugin respects @searchConfig', () => { + it('creates a plugin with searchScoreWeights that can be overridden per-table', () => { + // This test verifies the plugin can be created with global weights + // The actual per-table override happens at schema-build time when codec.extensions.tags.searchConfig is read + const plugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createBm25Adapter(), + ], + enableSearchScore: true, + searchScoreWeights: { tsv: 0.8, bm25: 0.2 }, + }); + + expect(plugin).toBeDefined(); + expect(plugin.name).toBe('UnifiedSearchPlugin'); + }); + + it('creates a plugin without global weights (per-table only)', () => { + const plugin = createUnifiedSearchPlugin({ + adapters: [ + createTsvectorAdapter(), + createBm25Adapter(), + ], + enableSearchScore: true, + // No searchScoreWeights — per-table @searchConfig will be the only source + }); + + expect(plugin).toBeDefined(); + expect(plugin.name).toBe('UnifiedSearchPlugin'); + }); + }); +}); + +// ─── VectorNearbyInput: includeChunks field ────────────────────────────────── + +describe('VectorNearbyInput includeChunks field (Phase E)', () => { + it('adapter registers includeChunks field on VectorNearbyInput', () => { + const adapter = createPgvectorAdapter(); + + // Mock build object with minimal graphql types + const registeredTypes: Record = {}; + const mockBuild = { + graphql: { + GraphQLList: function GraphQLList(type: any) { return { type, kind: 'list' }; }, + GraphQLNonNull: function GraphQLNonNull(type: any) { return 
{ type, kind: 'nonnull' }; }, + GraphQLFloat: 'Float', + GraphQLBoolean: 'Boolean', + }, + getTypeByName: (name: string) => registeredTypes[name] ?? name, + registerEnumType: (name: string, _scope: any, specFn: any, _origin: string) => { + registeredTypes[name] = { name, ...specFn() }; + }, + registerInputObjectType: (name: string, _scope: any, specFn: any, _origin: string) => { + registeredTypes[name] = { name, ...specFn() }; + }, + }; + + adapter.registerTypes(mockBuild); + + // VectorNearbyInput should be registered + expect(registeredTypes['VectorNearbyInput']).toBeDefined(); + + // Get the fields + const fieldsResult = registeredTypes['VectorNearbyInput'].fields; + const fields = typeof fieldsResult === 'function' ? fieldsResult() : fieldsResult; + + expect(fields.vector).toBeDefined(); + expect(fields.metric).toBeDefined(); + expect(fields.distance).toBeDefined(); + expect(fields.includeChunks).toBeDefined(); + expect(fields.includeChunks.type).toBe('Boolean'); + expect(fields.includeChunks.description).toContain('chunks'); + }); +}); diff --git a/graphile/graphile-search/src/__tests__/setup.sql b/graphile/graphile-search/src/__tests__/setup.sql index bae3ea5d0..c38dfcdb3 100644 --- a/graphile/graphile-search/src/__tests__/setup.sql +++ b/graphile/graphile-search/src/__tests__/setup.sql @@ -64,3 +64,95 @@ INSERT INTO unified_search_test.documents (id, title, body, tsv, embedding) VALU -- Reset sequence SELECT setval('unified_search_test.documents_id_seq', 5); + +-- ============================================================================ +-- ARTICLES table — for @searchConfig integration tests +-- Has tsvector + vector + updated_at for recency boost testing +-- ============================================================================ +CREATE TABLE unified_search_test.articles ( + id serial PRIMARY KEY, + title text NOT NULL, + body text NOT NULL, + tsv tsvector NOT NULL, + embedding vector(3) NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now() +); 
+ +CREATE INDEX idx_articles_tsv ON unified_search_test.articles USING gin(tsv); +CREATE INDEX idx_articles_embedding ON unified_search_test.articles + USING ivfflat(embedding vector_cosine_ops) WITH (lists = 1); + +-- BM25 index on body column +CREATE INDEX idx_articles_body_bm25 ON unified_search_test.articles + USING bm25(body) WITH (text_config='english'); + +-- pg_trgm GIN index on title column +CREATE INDEX idx_articles_title_trgm ON unified_search_test.articles + USING gin(title gin_trgm_ops); + +INSERT INTO unified_search_test.articles (id, title, body, tsv, embedding, updated_at) VALUES + (1, 'Guide to PostgreSQL', + 'PostgreSQL is a powerful open source relational database system with strong extensibility.', + to_tsvector('english', 'postgresql powerful open source relational database system extensibility'), + '[1, 0, 0]', + '2025-01-01T00:00:00Z'), + (2, 'Advanced SQL Patterns', + 'Advanced SQL patterns include window functions, CTEs, and recursive queries for complex data analysis.', + to_tsvector('english', 'advanced sql patterns window functions CTEs recursive queries complex data analysis'), + '[0, 1, 0]', + '2025-06-01T00:00:00Z'), + (3, 'Database Indexing Strategies', + 'Proper indexing is crucial for database performance. 
B-tree, GIN, and GiST indexes serve different purposes.', + to_tsvector('english', 'indexing crucial database performance b-tree GIN GiST indexes different purposes'), + '[0, 0, 1]', + '2026-01-01T00:00:00Z'); + +SELECT setval('unified_search_test.articles_id_seq', 3); + +-- ============================================================================ +-- POSTS table + POSTS_CHUNKS table — for @hasChunks integration tests +-- Parent table has an embedding; chunks table has fragment embeddings +-- ============================================================================ +CREATE TABLE unified_search_test.posts ( + id serial PRIMARY KEY, + title text NOT NULL, + body text NOT NULL, + tsv tsvector NOT NULL, + embedding vector(3) NOT NULL +); + +CREATE INDEX idx_posts_tsv ON unified_search_test.posts USING gin(tsv); +CREATE INDEX idx_posts_embedding ON unified_search_test.posts + USING ivfflat(embedding vector_cosine_ops) WITH (lists = 1); + +CREATE TABLE unified_search_test.posts_chunks ( + id serial PRIMARY KEY, + post_id integer NOT NULL REFERENCES unified_search_test.posts(id), + chunk_text text NOT NULL, + embedding vector(3) NOT NULL +); + +CREATE INDEX idx_posts_chunks_embedding ON unified_search_test.posts_chunks + USING ivfflat(embedding vector_cosine_ops) WITH (lists = 1); + +-- Seed posts: parent embeddings are approximate; chunks have more specific embeddings +INSERT INTO unified_search_test.posts (id, title, body, tsv, embedding) VALUES + (1, 'Intro to Vector Search', + 'Vector search uses embeddings to find semantically similar content.', + to_tsvector('english', 'vector search embeddings semantically similar content'), + '[0.5, 0.5, 0]'), + (2, 'Chunk-Based Retrieval', + 'Breaking documents into chunks improves retrieval accuracy for long texts.', + to_tsvector('english', 'chunk based retrieval documents chunks accuracy long texts'), + '[0, 0.5, 0.5]'); + +SELECT setval('unified_search_test.posts_id_seq', 2); + +-- Chunks for post 1: one chunk very 
close to [1,0,0], one moderately close +INSERT INTO unified_search_test.posts_chunks (id, post_id, chunk_text, embedding) VALUES + (1, 1, 'First chunk about vectors and similarity', '[0.95, 0.05, 0]'), + (2, 1, 'Second chunk about search algorithms', '[0.3, 0.7, 0]'), + (3, 2, 'First chunk about document chunking strategies', '[0.1, 0.1, 0.9]'), + (4, 2, 'Second chunk about retrieval methods', '[0.2, 0.8, 0.1]'); + +SELECT setval('unified_search_test.posts_chunks_id_seq', 4); diff --git a/graphile/graphile-search/src/adapters/pgvector.ts b/graphile/graphile-search/src/adapters/pgvector.ts index ea40e5e21..7608a4e91 100644 --- a/graphile/graphile-search/src/adapters/pgvector.ts +++ b/graphile/graphile-search/src/adapters/pgvector.ts @@ -22,6 +22,66 @@ function isVectorCodec(codec: any): boolean { return codec?.name === 'vector'; } +/** + * Chunks table info detected from @hasChunks smart tag. + */ +interface ChunksInfo { + chunksSchema: string | null; + chunksTableName: string; + parentFkField: string; + parentPkField: string; + embeddingField: string; +} + +/** + * Read @hasChunks smart tag from codec extensions. 
+ * The tag value is a JSON object like:
+ * {
+ *   "chunksTable": "documents_chunks",
+ *   "chunksSchema": "app_private",  // optional, defaults to parent table's schema
+ *   "parentFk": "document_id",      // optional, defaults to "parent_id"
+ *   "parentPk": "id",               // optional, defaults to "id"
+ *   "embeddingField": "embedding"   // optional, defaults to "embedding"
+ * }
+ */
+function getChunksInfo(codec: any): ChunksInfo | undefined {
+  const tags = codec?.extensions?.tags;
+  if (!tags) return undefined;
+  const raw = tags.hasChunks;
+  if (!raw) return undefined;
+
+  let parsed: any;
+  if (typeof raw === 'string') {
+    try {
+      parsed = JSON.parse(raw);
+    } catch {
+      // Not valid JSON (e.g. "true" or a plain string) — chunk metadata can't be resolved
+      return undefined;
+    }
+  } else if (typeof raw === 'object') {
+    parsed = raw;
+  } else if (raw === true) {
+    return undefined; // boolean true = no metadata, can't resolve
+  } else {
+    return undefined;
+  }
+
+  if (!parsed.chunksTable) return undefined;
+
+  // Resolve schema: explicit chunksSchema > parent codec schema > null
+  const chunksSchema = parsed.chunksSchema
+    || codec?.extensions?.pg?.schemaName
+    || null;
+
+  return {
+    chunksSchema,
+    chunksTableName: parsed.chunksTable,
+    parentFkField: parsed.parentFk || 'parent_id',
+    parentPkField: parsed.parentPk || 'id',
+    embeddingField: parsed.embeddingField || 'embedding',
+  };
+}
+
 export interface PgvectorAdapterOptions {
   /**
    * Filter prefix for vector filter fields.
@@ -34,12 +94,21 @@ export interface PgvectorAdapterOptions {
    * @default 'COSINE'
    */
   defaultMetric?: 'COSINE' | 'L2' | 'IP';
+
+  /**
+   * When true, tables with @hasChunks smart tag will transparently
+   * query through the chunks table to find the closest chunk.
+   * The parent row's vector distance is the minimum distance across
+   * the parent embedding and all its chunks.
+ * @default true + */ + enableChunkQuerying?: boolean; } export function createPgvectorAdapter( options: PgvectorAdapterOptions = {} ): SearchAdapter { - const { filterPrefix = 'vector', defaultMetric = 'COSINE' } = options; + const { filterPrefix = 'vector', defaultMetric = 'COSINE', enableChunkQuerying = true } = options; return { name: 'vector', @@ -63,11 +132,16 @@ export function createPgvectorAdapter( if (!codec?.attributes) return []; const columns: SearchableColumn[] = []; + const chunksInfo = enableChunkQuerying ? getChunksInfo(codec) : undefined; + for (const [attributeName, attribute] of Object.entries( codec.attributes as Record )) { if (isVectorCodec(attribute.codec)) { - columns.push({ attributeName }); + columns.push({ + attributeName, + adapterData: chunksInfo ? { chunksInfo } : undefined, + }); } } return columns; @@ -134,6 +208,13 @@ export function createPgvectorAdapter( type: GraphQLFloat, description: 'Maximum distance threshold. Only rows within this distance are returned.', }, + includeChunks: { + type: build.graphql.GraphQLBoolean, + description: + 'When true (default for tables with @hasChunks), transparently queries ' + + 'the chunks table and returns the minimum distance across parent + all chunks. 
' + + 'Set to false to only search the parent embedding.', + }, }; }, }), @@ -157,15 +238,62 @@ export function createPgvectorAdapter( ): FilterApplyResult | null { if (filterValue == null) return null; - const { vector, metric, distance } = filterValue; + const { vector, metric, distance, includeChunks } = filterValue; if (!vector || !Array.isArray(vector) || vector.length === 0) return null; const resolvedMetric = metric || defaultMetric; const operator = METRIC_OPERATORS[resolvedMetric] || METRIC_OPERATORS.COSINE; const vectorString = `[${vector.join(',')}]`; + const vectorExpr = sql`${sql.value(vectorString)}::vector`; + + // Check if this column has chunks info and chunk querying is requested + const adapterData = column.adapterData as { chunksInfo?: ChunksInfo } | undefined; + const chunksInfo = adapterData?.chunksInfo; + + if (chunksInfo && (includeChunks !== false)) { + // Chunk-aware query: find the closest chunk for each parent row + // Uses a lateral subquery to get the minimum distance across all chunks + const chunksTableRef = chunksInfo.chunksSchema + ? 
sql`${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}` + : sql`${sql.identifier(chunksInfo.chunksTableName)}`; + const parentFk = sql.identifier(chunksInfo.parentFkField); + const chunkEmbedding = sql.identifier(chunksInfo.embeddingField); + // Use the configured PK field (defaults to 'id', but can be overridden via @hasChunks tag) + const parentId = sql`${alias}.${sql.identifier(chunksInfo.parentPkField)}`; + // Alias to avoid ambiguity when the chunks table name might collide + const chunksAlias = sql.identifier('__chunks'); + + // Subquery: SELECT MIN(distance) FROM chunks WHERE chunks.parent_fk = parent.pk + const chunkDistanceSubquery = sql`( + SELECT MIN(${chunksAlias}.${chunkEmbedding} ${sql.raw(operator)} ${vectorExpr}) + FROM ${chunksTableRef} AS ${chunksAlias} + WHERE ${chunksAlias}.${parentFk} = ${parentId} + )`; + // Also compute direct parent distance if the parent has an embedding + const parentColumnExpr = sql`${alias}.${sql.identifier(column.attributeName)}`; + const parentDistanceExpr = sql`(${parentColumnExpr} ${sql.raw(operator)} ${vectorExpr})`; + + // Use LEAST of parent distance and closest chunk distance + // COALESCE handles cases where parent or chunks may not have embeddings + const combinedDistanceExpr = sql`LEAST( + COALESCE(${parentDistanceExpr}, 'Infinity'::float), + COALESCE(${chunkDistanceSubquery}, 'Infinity'::float) + )`; + + let whereClause: SQL | null = null; + if (distance !== undefined && distance !== null) { + whereClause = sql`${combinedDistanceExpr} <= ${sql.value(distance)}`; + } + + return { + whereClause, + scoreExpression: combinedDistanceExpr, + }; + } + + // Standard (non-chunk) query const columnExpr = sql`${alias}.${sql.identifier(column.attributeName)}`; - const vectorExpr = sql`${sql.value(vectorString)}::vector`; const distanceExpr = sql`(${columnExpr} ${sql.raw(operator)} ${vectorExpr})`; let whereClause: SQL | null = null; diff --git a/graphile/graphile-search/src/plugin.ts 
b/graphile/graphile-search/src/plugin.ts index db66ddb6c..d8b6de2f6 100644 --- a/graphile/graphile-search/src/plugin.ts +++ b/graphile/graphile-search/src/plugin.ts @@ -66,6 +66,102 @@ interface SearchScoreDetails { selectIndex: number; } +/** + * Per-table search configuration read from the @searchConfig smart tag. + * Written by DataFullTextSearch, DataBm25, and DataSearch in constructive-db. + */ +interface SearchConfig { + weights?: Record; + normalization?: 'linear' | 'sigmoid'; + boost_recent?: boolean; + boost_recency_field?: string; + boost_recency_decay?: number; +} + +/** + * Read the @searchConfig smart tag from a codec's extensions. + * Returns undefined if no searchConfig tag is present. + */ +function getSearchConfig(codec: PgCodecWithAttributes): SearchConfig | undefined { + const tags = (codec.extensions as any)?.tags; + if (!tags) return undefined; + const raw = tags.searchConfig; + if (!raw) return undefined; + // Smart tags can be strings (JSON-encoded) or already-parsed objects + if (typeof raw === 'string') { + try { + return JSON.parse(raw) as SearchConfig; + } catch { + return undefined; + } + } + if (typeof raw === 'object') return raw as SearchConfig; + return undefined; +} + +/** + * Normalize a raw score to 0..1 using the specified strategy. + * + * When strategy is 'sigmoid', sigmoid normalization is used for ALL adapters + * (both bounded and unbounded). When strategy is 'linear' (default), + * known-range adapters use linear normalization and unbounded adapters + * use sigmoid normalization as fallback. + */ +function normalizeScore( + score: number, + lowerIsBetter: boolean, + range: [number, number] | null, + strategy: 'linear' | 'sigmoid' = 'linear', +): number { + let normalized: number; + + if (range && strategy === 'linear') { + // Known range + linear strategy: linear normalization + const [min, max] = range; + normalized = lowerIsBetter + ? 
1 - (score - min) / (max - min) + : (score - min) / (max - min); + } else { + // Unbounded range, or explicit sigmoid strategy: sigmoid normalization + if (lowerIsBetter) { + // BM25: negative scores, more negative = better + normalized = 1 / (1 + Math.abs(score)); + } else { + // Higher-is-better: map via sigmoid + normalized = score / (1 + score); + } + } + + return Math.max(0, Math.min(1, normalized)); +} + +/** + * Apply recency boost to a normalized score. + * Uses exponential decay based on age in days. + * + * @param normalizedScore - The already-normalized score (0..1) + * @param recencyValue - The raw recency field value (timestamp string from SQL row) + * @param decay - Decay factor per day (e.g. 0.95 means 5% penalty per day) + */ +function applyRecencyBoost( + normalizedScore: number, + recencyValue: any, + decay: number, +): number { + if (recencyValue == null) return normalizedScore; + + const fieldDate = new Date(recencyValue); + if (isNaN(fieldDate.getTime())) return normalizedScore; + + const now = new Date(); + const ageInDays = (now.getTime() - fieldDate.getTime()) / (1000 * 60 * 60 * 24); + if (ageInDays < 0) return normalizedScore; // future dates get no penalty + + // Exponential decay: boost = decay^ageInDays + const boost = Math.pow(decay, ageInDays); + return normalizedScore * boost; +} + /** * Cache key for discovered columns per adapter per codec. * Built during the first hook invocation and reused across hooks. 
@@ -277,6 +373,7 @@ export function createUnifiedSearchPlugin( GraphQLObjectType_fields(fields, build, context) { const { inflection, + sql, graphql: { GraphQLFloat }, grafast: { lambda }, } = build; @@ -383,6 +480,29 @@ export function createUnifiedSearchPlugin( } } + // Read per-table @searchConfig smart tag (written by DataSearch/DataFullTextSearch/DataBm25) + // Per-table config overrides global searchScoreWeights + const tableSearchConfig = getSearchConfig(codec); + + // Resolve effective weights: per-table > global > equal (undefined) + const effectiveWeights = tableSearchConfig?.weights ?? options.searchScoreWeights; + // Resolve normalization strategy: per-table > default 'linear' + const normalizationStrategy = tableSearchConfig?.normalization ?? 'linear'; + // Recency boost config from per-table smart tag + let boostRecent = tableSearchConfig?.boost_recent ?? false; + const boostRecencyField = tableSearchConfig?.boost_recency_field ?? 'updated_at'; + const boostRecencyDecay = tableSearchConfig?.boost_recency_decay ?? 0.95; + + // Phase I: Validate that the recency field actually exists on the table. + // If it doesn't, disable recency boost gracefully instead of crashing at query time. + if (boostRecent && boostRecencyField && !codec.attributes[boostRecencyField]) { + console.warn( + `[graphile-search] @searchConfig.boost_recency_field "${boostRecencyField}" ` + + `not found on table "${codec.name}". Recency boost disabled for this table.` + ); + boostRecent = false; + } + newFields = build.extend( newFields, { @@ -395,6 +515,7 @@ export function createUnifiedSearchPlugin( description: 'Composite search relevance score (0..1, higher = more relevant). ' + 'Computed by normalizing and averaging all active search signals. ' + + 'Supports per-table weight customization via @searchConfig smart tag. 
' + 'Returns null when no search filters are active.', type: GraphQLFloat, plan($step: any) { @@ -411,14 +532,30 @@ export function createUnifiedSearchPlugin( // Collect all meta steps for all adapters const $metaSteps = allMetaKeys.map((mk) => $select.getMeta(mk.metaKey)); + // If recency boost is configured, inject the recency field into + // the SQL SELECT so we can read it by numeric index at runtime. + let recencySelectIndex: number | null = null; + if (boostRecent && boostRecencyField) { + const recencyColumnSql = sql`${$select.alias}.${sql.identifier(boostRecencyField)}::text`; + recencySelectIndex = $select.selectAndReturnIndex(recencyColumnSql); + } + + // Capture the index in a local const for the lambda closure + const capturedRecencyIndex = recencySelectIndex; + return lambda( [...$metaSteps, $row], (args: readonly any[]) => { const row = args[args.length - 1]; if (row == null) return null; - let sum = 0; - let count = 0; + let weightedSum = 0; + let totalWeight = 0; + + // Read recency value from the injected SELECT column + const recencyValue = (boostRecent && capturedRecencyIndex != null) + ? row[capturedRecencyIndex] + : null; for (let i = 0; i < allMetaKeys.length; i++) { const details = args[i] as SearchScoreDetails | null; @@ -431,78 +568,31 @@ export function createUnifiedSearchPlugin( if (typeof score !== 'number' || isNaN(score)) continue; const mk = allMetaKeys[i]; + const weight = effectiveWeights?.[mk.adapterName] ?? 1; + + // Normalize using the resolved strategy + let normalized = normalizeScore( + score, + mk.lowerIsBetter, + mk.range, + normalizationStrategy, + ); - // Normalize to 0..1 (higher = better) - let normalized: number; - if (mk.range) { - // Known range: linear normalization - const [min, max] = mk.range; - normalized = mk.lowerIsBetter - ? 
1 - (score - min) / (max - min) - : (score - min) / (max - min); - } else { - // Unbounded: sigmoid normalization - if (mk.lowerIsBetter) { - // BM25: negative scores, more negative = better - // Map via 1 / (1 + abs(score)) - normalized = 1 / (1 + Math.abs(score)); - } else { - // Hypothetical unbounded higher-is-better - normalized = score / (1 + score); - } - } - - // Clamp to [0, 1] - normalized = Math.max(0, Math.min(1, normalized)); - sum += normalized; - count++; - } - - if (count === 0) return null; - - // Apply optional weights - if (options.searchScoreWeights) { - let weightedSum = 0; - let totalWeight = 0; - let weightIdx = 0; - - for (let i = 0; i < allMetaKeys.length; i++) { - const details = args[i] as SearchScoreDetails | null; - if (details == null || details.selectIndex == null) continue; - - const rawValue = row[details.selectIndex]; - if (rawValue == null) continue; - - const mk = allMetaKeys[i]; - const weight = options.searchScoreWeights[mk.adapterName] ?? 1; - - const score = TYPES.float.fromPg(rawValue as string); - if (typeof score !== 'number' || isNaN(score)) continue; - - let normalized: number; - if (mk.range) { - const [min, max] = mk.range; - normalized = mk.lowerIsBetter - ? 1 - (score - min) / (max - min) - : (score - min) / (max - min); - } else { - if (mk.lowerIsBetter) { - normalized = 1 / (1 + Math.abs(score)); - } else { - normalized = score / (1 + score); - } - } - - normalized = Math.max(0, Math.min(1, normalized)); - weightedSum += normalized * weight; - totalWeight += weight; - weightIdx++; + // Apply recency boost if configured + if (boostRecent && recencyValue != null) { + normalized = applyRecencyBoost( + normalized, + recencyValue, + boostRecencyDecay, + ); } - return totalWeight > 0 ? 
weightedSum / totalWeight : null; + weightedSum += normalized * weight; + totalWeight += weight; } - return sum / count; + if (totalWeight === 0) return null; + return weightedSum / totalWeight; } ); }, diff --git a/graphql/codegen/src/core/codegen/docs-utils.ts b/graphql/codegen/src/core/codegen/docs-utils.ts index a56db2d6b..786314fde 100644 --- a/graphql/codegen/src/core/codegen/docs-utils.ts +++ b/graphql/codegen/src/core/codegen/docs-utils.ts @@ -180,6 +180,18 @@ function isSearchComputedField(f: CleanField): boolean { return false; } +/** + * Check whether the schema's VectorNearbyInput type includes an + * `includeChunks` field. When present, tables with embedding columns + * support transparent chunk-aware vector search. + */ +function hasIncludeChunksCapability(registry?: TypeRegistry): boolean { + if (!registry) return false; + const vectorInput = registry.get('VectorNearbyInput'); + if (!vectorInput || vectorInput.kind !== 'INPUT_OBJECT') return false; + return !!vectorInput.inputFields?.some((f) => f.name === 'includeChunks'); +} + /** * Categorize "special" fields on a table into PostGIS, pgvector, and * Unified Search groups. Returns only non-empty groups. @@ -225,11 +237,17 @@ export function categorizeSpecialFields( } if (embedding.length > 0) { + const chunkAware = hasIncludeChunksCapability(typeRegistry); + const baseDesc = + 'High-dimensional vector columns for semantic similarity search. Query via the Unified Search API pgvector adapter using cosine, L2, or inner-product distance.'; + const chunkDesc = chunkAware + ? baseDesc + + ' Supports chunk-aware search: set `includeChunks: true` in VectorNearbyInput to transparently query across parent and chunk embeddings, returning the minimum distance.' + : baseDesc; groups.push({ category: 'embedding', label: 'pgvector embedding fields', - description: - 'High-dimensional vector columns for semantic similarity search. 
Query via the Unified Search API pgvector adapter using cosine, L2, or inner-product distance.', + description: chunkDesc, fields: embedding, }); }