Skip to content

Commit b95a049

Browse files
fix(kb): chunking config persistence (#3877)
* fix(kb): persist chunking config correctly * fix: treat the KB config as the source of truth (SoT) * remove dead code * fix document request bodies * add defaults for async for legacy docs
1 parent a79c8a7 commit b95a049

File tree

14 files changed

+93
-112
lines changed

14 files changed

+93
-112
lines changed

apps/sim/app/api/knowledge/[id]/documents/route.test.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -457,11 +457,8 @@ describe('Knowledge Base Documents API Route', () => {
457457
},
458458
],
459459
processingOptions: {
460-
chunkSize: 1024,
461-
minCharactersPerChunk: 100,
462460
recipe: 'default',
463461
lang: 'en',
464-
chunkOverlap: 200,
465462
},
466463
}
467464

@@ -533,11 +530,8 @@ describe('Knowledge Base Documents API Route', () => {
533530
},
534531
],
535532
processingOptions: {
536-
chunkSize: 50, // Invalid: too small
537-
minCharactersPerChunk: 0, // Invalid: too small
538533
recipe: 'default',
539534
lang: 'en',
540-
chunkOverlap: 1000, // Invalid: too large
541535
},
542536
}
543537

apps/sim/app/api/knowledge/[id]/documents/route.ts

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,14 @@ const CreateDocumentSchema = z.object({
3838
documentTagsData: z.string().optional(),
3939
})
4040

41-
/**
42-
* Schema for bulk document creation with processing options
43-
*
44-
* Processing options units:
45-
* - chunkSize: tokens (1 token ≈ 4 characters)
46-
* - minCharactersPerChunk: characters
47-
* - chunkOverlap: characters
48-
*/
4941
const BulkCreateDocumentsSchema = z.object({
5042
documents: z.array(CreateDocumentSchema),
51-
processingOptions: z.object({
52-
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
53-
chunkSize: z.number().min(100).max(4000),
54-
/** Minimum chunk size in characters */
55-
minCharactersPerChunk: z.number().min(1).max(2000),
56-
recipe: z.string(),
57-
lang: z.string(),
58-
/** Overlap between chunks in characters */
59-
chunkOverlap: z.number().min(0).max(500),
60-
}),
43+
processingOptions: z
44+
.object({
45+
recipe: z.string().optional(),
46+
lang: z.string().optional(),
47+
})
48+
.optional(),
6149
bulk: z.literal(true),
6250
})
6351

@@ -246,8 +234,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
246234
knowledgeBaseId,
247235
documentsCount: createdDocuments.length,
248236
uploadType: 'bulk',
249-
chunkSize: validatedData.processingOptions.chunkSize,
250-
recipe: validatedData.processingOptions.recipe,
237+
recipe: validatedData.processingOptions?.recipe,
251238
})
252239
} catch (_e) {
253240
// Silently fail
@@ -256,7 +243,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
256243
processDocumentsWithQueue(
257244
createdDocuments,
258245
knowledgeBaseId,
259-
validatedData.processingOptions,
246+
validatedData.processingOptions ?? {},
260247
requestId
261248
).catch((error: unknown) => {
262249
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)

apps/sim/app/api/knowledge/[id]/documents/upsert/route.ts

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,12 @@ const UpsertDocumentSchema = z.object({
2525
fileSize: z.number().min(1, 'File size must be greater than 0'),
2626
mimeType: z.string().min(1, 'MIME type is required'),
2727
documentTagsData: z.string().optional(),
28-
processingOptions: z.object({
29-
chunkSize: z.number().min(100).max(4000),
30-
minCharactersPerChunk: z.number().min(1).max(2000),
31-
recipe: z.string(),
32-
lang: z.string(),
33-
chunkOverlap: z.number().min(0).max(500),
34-
}),
28+
processingOptions: z
29+
.object({
30+
recipe: z.string().optional(),
31+
lang: z.string().optional(),
32+
})
33+
.optional(),
3534
workflowId: z.string().optional(),
3635
})
3736

@@ -166,7 +165,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
166165
processDocumentsWithQueue(
167166
createdDocuments,
168167
knowledgeBaseId,
169-
validatedData.processingOptions,
168+
validatedData.processingOptions ?? {},
170169
requestId
171170
).catch((error: unknown) => {
172171
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
@@ -178,8 +177,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
178177
knowledgeBaseId,
179178
documentsCount: 1,
180179
uploadType: 'single',
181-
chunkSize: validatedData.processingOptions.chunkSize,
182-
recipe: validatedData.processingOptions.recipe,
180+
recipe: validatedData.processingOptions?.recipe,
183181
})
184182
} catch (_e) {
185183
// Silently fail

apps/sim/app/api/v1/knowledge/[id]/documents/route.ts

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,6 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
187187
requestId
188188
)
189189

190-
const chunkingConfig = result.kb.chunkingConfig ?? { maxSize: 1024, minSize: 100, overlap: 200 }
191-
192190
const documentData: DocumentData = {
193191
documentId: newDocument.id,
194192
filename: file.name,
@@ -197,18 +195,7 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
197195
mimeType: contentType,
198196
}
199197

200-
processDocumentsWithQueue(
201-
[documentData],
202-
knowledgeBaseId,
203-
{
204-
chunkSize: chunkingConfig.maxSize,
205-
minCharactersPerChunk: chunkingConfig.minSize,
206-
chunkOverlap: chunkingConfig.overlap,
207-
recipe: 'default',
208-
lang: 'en',
209-
},
210-
requestId
211-
).catch(() => {
198+
processDocumentsWithQueue([documentData], knowledgeBaseId, {}, requestId).catch(() => {
212199
// Processing errors are logged internally
213200
})
214201

apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,6 @@ export function AddDocumentsModal({
195195

196196
try {
197197
await uploadFiles([fileToRetry], knowledgeBaseId, {
198-
chunkSize: chunkingConfig?.maxSize || 1024,
199-
minCharactersPerChunk: chunkingConfig?.minSize || 1,
200-
chunkOverlap: chunkingConfig?.overlap || 200,
201198
recipe: 'default',
202199
})
203200
removeFile(index)
@@ -217,9 +214,6 @@ export function AddDocumentsModal({
217214

218215
try {
219216
await uploadFiles(files, knowledgeBaseId, {
220-
chunkSize: chunkingConfig?.maxSize || 1024,
221-
minCharactersPerChunk: chunkingConfig?.minSize || 1,
222-
chunkOverlap: chunkingConfig?.overlap || 200,
223217
recipe: 'default',
224218
})
225219
logger.info(`Successfully uploaded ${files.length} files`)

apps/sim/app/workspace/[workspaceId]/knowledge/components/base-card/base-card.tsx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ interface BaseCardProps {
2020
createdAt?: string
2121
updatedAt?: string
2222
connectorTypes?: string[]
23+
chunkingConfig?: { maxSize: number; minSize: number; overlap: number }
2324
onUpdate?: (id: string, name: string, description: string) => Promise<void>
2425
onDelete?: (id: string) => Promise<void>
2526
}
@@ -78,6 +79,7 @@ export function BaseCard({
7879
description,
7980
updatedAt,
8081
connectorTypes = [],
82+
chunkingConfig,
8183
onUpdate,
8284
onDelete,
8385
}: BaseCardProps) {
@@ -256,6 +258,7 @@ export function BaseCard({
256258
knowledgeBaseId={id}
257259
initialName={title}
258260
initialDescription={description === 'No description provided' ? '' : description}
261+
chunkingConfig={chunkingConfig}
259262
onSave={handleSave}
260263
/>
261264
)}

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -269,9 +269,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
269269
if (files.length > 0) {
270270
try {
271271
const uploadedFiles = await uploadFiles(files, newKnowledgeBase.id, {
272-
chunkSize: data.maxChunkSize,
273-
minCharactersPerChunk: data.minChunkSize,
274-
chunkOverlap: data.overlapSize,
275272
recipe: 'default',
276273
})
277274

@@ -358,25 +355,31 @@ export const CreateBaseModal = memo(function CreateBaseModal({
358355
<Label htmlFor='minChunkSize'>Min Chunk Size (characters)</Label>
359356
<Input
360357
id='minChunkSize'
358+
type='number'
359+
min={1}
360+
max={2000}
361+
step={1}
361362
placeholder='100'
362363
{...register('minChunkSize', { valueAsNumber: true })}
363364
className={cn(errors.minChunkSize && 'border-[var(--text-error)]')}
364365
autoComplete='off'
365366
data-form-type='other'
366-
name='min-chunk-size'
367367
/>
368368
</div>
369369

370370
<div className='flex flex-col gap-2'>
371371
<Label htmlFor='maxChunkSize'>Max Chunk Size (tokens)</Label>
372372
<Input
373373
id='maxChunkSize'
374+
type='number'
375+
min={100}
376+
max={4000}
377+
step={1}
374378
placeholder='1024'
375379
{...register('maxChunkSize', { valueAsNumber: true })}
376380
className={cn(errors.maxChunkSize && 'border-[var(--text-error)]')}
377381
autoComplete='off'
378382
data-form-type='other'
379-
name='max-chunk-size'
380383
/>
381384
</div>
382385
</div>
@@ -385,12 +388,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
385388
<Label htmlFor='overlapSize'>Overlap (tokens)</Label>
386389
<Input
387390
id='overlapSize'
391+
type='number'
392+
min={0}
393+
max={500}
394+
step={1}
388395
placeholder='200'
389396
{...register('overlapSize', { valueAsNumber: true })}
390397
className={cn(errors.overlapSize && 'border-[var(--text-error)]')}
391398
autoComplete='off'
392399
data-form-type='other'
393-
name='overlap-size'
394400
/>
395401
<p className='text-[var(--text-muted)] text-xs'>
396402
1 token ≈ 4 characters. Max chunk size and overlap are in tokens.

apps/sim/app/workspace/[workspaceId]/knowledge/components/edit-knowledge-base-modal/edit-knowledge-base-modal.tsx

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import {
1717
Textarea,
1818
} from '@/components/emcn'
1919
import { cn } from '@/lib/core/utils/cn'
20+
import type { ChunkingConfig } from '@/lib/knowledge/types'
2021

2122
const logger = createLogger('EditKnowledgeBaseModal')
2223

@@ -26,6 +27,7 @@ interface EditKnowledgeBaseModalProps {
2627
knowledgeBaseId: string
2728
initialName: string
2829
initialDescription: string
30+
chunkingConfig?: ChunkingConfig
2931
onSave: (id: string, name: string, description: string) => Promise<void>
3032
}
3133

@@ -49,6 +51,7 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
4951
knowledgeBaseId,
5052
initialName,
5153
initialDescription,
54+
chunkingConfig,
5255
onSave,
5356
}: EditKnowledgeBaseModalProps) {
5457
const [isSubmitting, setIsSubmitting] = useState(false)
@@ -137,6 +140,47 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
137140
</p>
138141
)}
139142
</div>
143+
144+
{chunkingConfig && (
145+
<div className='flex flex-col gap-2'>
146+
<Label>Chunking Configuration</Label>
147+
<div className='grid grid-cols-3 gap-2'>
148+
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
149+
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
150+
Max Size
151+
</p>
152+
<p className='font-medium text-[var(--text-primary)] text-sm'>
153+
{chunkingConfig.maxSize.toLocaleString()}
154+
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
155+
tokens
156+
</span>
157+
</p>
158+
</div>
159+
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
160+
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
161+
Min Size
162+
</p>
163+
<p className='font-medium text-[var(--text-primary)] text-sm'>
164+
{chunkingConfig.minSize.toLocaleString()}
165+
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
166+
chars
167+
</span>
168+
</p>
169+
</div>
170+
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
171+
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
172+
Overlap
173+
</p>
174+
<p className='font-medium text-[var(--text-primary)] text-sm'>
175+
{chunkingConfig.overlap.toLocaleString()}
176+
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
177+
tokens
178+
</span>
179+
</p>
180+
</div>
181+
</div>
182+
</div>
183+
)}
140184
</div>
141185
</ModalBody>
142186

apps/sim/app/workspace/[workspaceId]/knowledge/hooks/use-knowledge-upload.ts

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@ export interface UploadError {
4646
}
4747

4848
export interface ProcessingOptions {
49-
chunkSize?: number
50-
minCharactersPerChunk?: number
51-
chunkOverlap?: number
5249
recipe?: string
5350
}
5451

@@ -1011,10 +1008,7 @@ export function useKnowledgeUpload(options: UseKnowledgeUploadOptions = {}) {
10111008
...file,
10121009
})),
10131010
processingOptions: {
1014-
chunkSize: processingOptions.chunkSize || 1024,
1015-
minCharactersPerChunk: processingOptions.minCharactersPerChunk || 1,
1016-
chunkOverlap: processingOptions.chunkOverlap || 200,
1017-
recipe: processingOptions.recipe || 'default',
1011+
recipe: processingOptions.recipe ?? 'default',
10181012
lang: 'en',
10191013
},
10201014
bulk: true,

apps/sim/app/workspace/[workspaceId]/knowledge/knowledge.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,7 @@ export function Knowledge() {
602602
knowledgeBaseId={activeKnowledgeBase.id}
603603
initialName={activeKnowledgeBase.name}
604604
initialDescription={activeKnowledgeBase.description || ''}
605+
chunkingConfig={activeKnowledgeBase.chunkingConfig}
605606
onSave={handleUpdateKnowledgeBase}
606607
/>
607608
)}

0 commit comments

Comments
 (0)