Skip to content

Basic chunking fixes #6150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 0 additions & 87 deletions core/indexing/chunk/basic.test.ts

This file was deleted.

94 changes: 72 additions & 22 deletions core/indexing/chunk/basic.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,97 @@
import { ChunkWithoutID } from "../../index.js";
import { countTokensAsync } from "../../llm/countTokens.js";

/**
 * Basic chunker that splits text content into line-aligned chunks based on
 * token size.
 *
 * Behaviour:
 * - Empty or whitespace-only content yields no chunks.
 * - Lines are accumulated greedily; the current chunk is emitted as soon as
 *   adding the next line (plus one token for its newline) would push it past
 *   `maxChunkSize`.
 * - Every line is emitted with a trailing "\n", so line breaks are preserved.
 * - A single line whose own token count exceeds `maxChunkSize` is neither
 *   dropped nor split: it is emitted as a chunk of its own, which therefore
 *   exceeds the budget.
 * - `startLine` / `endLine` are 0-based, inclusive indices into the lines of
 *   `contents`; a trailing newline does not count as an extra empty line.
 *
 * @param contents The text content to chunk
 * @param maxChunkSize Maximum number of tokens per chunk
 * @returns AsyncGenerator yielding chunks without IDs
 */
export async function* basicChunker(
  contents: string,
  maxChunkSize: number,
): AsyncGenerator<ChunkWithoutID> {
  // Don't generate chunks for empty content
  if (contents.trim().length === 0) {
    return;
  }

  const lines = contents.split("\n");
  // A trailing newline makes split() produce a final empty element that is
  // not a real line; drop it so line numbers and chunk content stay accurate.
  if (contents.endsWith("\n")) {
    lines.pop();
  }

  // Token counts are independent per line, so compute them concurrently.
  // Promise.all preserves input order, keeping counts aligned with `lines`.
  const lineTokens = await Promise.all(
    lines.map(async (line) => ({
      line,
      tokenCount: await countTokensAsync(line),
    })),
  );

  // Each line is re-joined with "\n", which is budgeted as one token.
  const newlineTokenCost = 1;

  let chunkContent = "";
  let chunkTokens = 0;
  let startLine = 0;

  for (const [i, { line, tokenCount }] of lineTokens.entries()) {
    const lineCost = tokenCount + newlineTokenCost;

    // If this line does not fit and the chunk already has content, emit the
    // chunk first. It covers every line up to, but not including, this one.
    if (chunkTokens > 0 && chunkTokens + lineCost > maxChunkSize) {
      yield {
        content: chunkContent,
        startLine,
        endLine: i - 1,
      };
      chunkContent = "";
      chunkTokens = 0;
      startLine = i;
    }

    chunkContent += `${line}\n`;
    chunkTokens += lineCost;

    // An oversized line can never share a chunk: the check above has already
    // flushed anything before it, so it is alone here. Emit it immediately so
    // following lines start a fresh chunk.
    if (tokenCount > maxChunkSize) {
      yield {
        content: chunkContent,
        startLine: i,
        endLine: i,
      };
      chunkContent = "";
      chunkTokens = 0;
      startLine = i + 1;
    }
  }

  // Yield any remaining content
  if (chunkContent.length > 0) {
    yield {
      content: chunkContent,
      startLine,
      endLine: lineTokens.length - 1,
    };
  }
}
Loading
Loading