@@ -6,7 +6,7 @@ import { EmbeddingChunk } from '../types/embedding-chunk'
66import { embed } from 'ai'
77import { verifyModel } from '../model/model-manager'
88import { detokenize , effectiveTokenCount , tokenize } from '../utils'
9-
9+ import { randomInt } from 'crypto'
1010/**
1111 * Generates embeddings for a given text using a specified model.
1212 *
@@ -17,6 +17,22 @@ import { detokenize, effectiveTokenCount, tokenize } from '../utils'
1717 * @returns A promise that resolves to an array of embedding chunks.
1818 * @throws An error if the model verification fails. A chunk whose embedding still fails after all retries is logged and left out of the result rather than causing a throw.
1919 */
20+
/**
 * Normalizes a text chunk before it is handed to the embedding model.
 *
 * Steps, applied in order:
 * 1. A run of three or more identical `.` characters collapses to a single one.
 * 2. A run of three or more identical `-`, `_` or `*` characters collapses to a single one.
 * 3. Control characters that are not whitespace, plus the zero-width space
 *    (U+200B), are removed.
 * 4. Every run of whitespace, including a single tab or line break, becomes
 *    one space.
 * 5. Leading and trailing whitespace is trimmed.
 *
 * @param text - The raw chunk text.
 * @returns The cleaned text. It is an empty string when the input holds
 *   nothing but whitespace and removable characters.
 */
function sanitizeChunk(text: string): string {
  return (
    text
      // Collapse long runs of periods (..... -> .)
      .replace(/([.])\1{2,}/g, '$1')
      // Collapse long runs of dashes, underscores and asterisks
      .replace(/([-_*])\1{2,}/g, '$1')
      // Remove zero-width and control characters. Tab, LF, VT, FF and CR
      // (U+0009-U+000D) are deliberately left out of this class: deleting
      // them outright would glue the words on either side together.
      .replace(/[\u0000-\u0008\u000E-\u001F\u007F-\u009F\u200B]/g, '')
      // Turn every whitespace run (tabs and line breaks included) into one space
      .replace(/\s+/g, ' ')
      .trim()
  )
}
35+
2036export async function generateEmbeddings (
2137 text : string ,
2238 chunkSizeToken : number ,
@@ -73,46 +89,90 @@ export async function generateEmbeddings(
7389 let completedCount = 0
7490 const totalChunks = chunks . length
7591 console . log ( 'DEBUG: generateEmbeddings totalChunks:' , totalChunks )
76- const embeddingPromises = chunks . map ( async ( chunk , index ) => {
77- try {
78- const { embedding } = await embed ( {
79- model : ollama . embedding ( modelName ) ,
80- value : chunk ,
81- } )
82- // console.log(
83- // `Embedding generated for chunk ${index + 1}/${chunks.length}`
84- // );
85- completedCount ++
86- const completionPercentage = ( ( completedCount / totalChunks ) * 100 ) . toFixed ( 2 )
87- // console.log(
88- // `Embedding generation: ${completionPercentage}% (${completedCount}/${totalChunks})`
89- // );
90- const tokens = tokenize ( chunk )
91- console . log (
92- `DEBUG: generateEmbeddings: ${ completionPercentage } % (${ completedCount } /${ totalChunks } ) | ` +
93- `[${ index } ]: ${ chunk . length } chars | ` +
94- `Adjusted token (${ chunkSizeToken } ): ${ tokens . length } ` ,
95- )
/**
 * Embeds a single chunk, retrying with exponential backoff on failure.
 *
 * The chunk is sanitized first and the sanitized text is what gets embedded
 * and returned. Progress is tracked through `completedCount` / `totalChunks`
 * from the enclosing scope.
 *
 * @param chunk - The raw chunk text.
 * @param index - Zero-based position of the chunk; reported 1-based in logs
 *   and in the returned `order`.
 * @returns The embedding chunk, or `null` when every attempt failed. This
 *   function does not reject on an embedding failure.
 */
async function embedChunk(chunk: string, index: number): Promise<EmbeddingChunk | null> {
  const sanitized = sanitizeChunk(chunk)
  // Log full chunk if sanitization changed it
  if (sanitized !== chunk) {
    const sanitizeLog = [
      '',
      `Sanitized chunk ${index + 1}:`,
      `Before: ${chunk}`,
      `After : ${sanitized}`,
      `Length: ${chunk.length} -> ${sanitized.length}`,
      '-------',
    ].join('\n')
    console.log(sanitizeLog)
  }

  const maxRetries = 5
  // Diagnostics below describe the raw chunk (length, token count, preview).
  const tokens = tokenize(chunk)
  const preview = chunk.slice(0, 500)

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const { embedding } = await embed({
        model: ollama.embedding(modelName),
        value: sanitized,
      })
      completedCount++
      const completionPercentage = ((completedCount / totalChunks) * 100).toFixed(2)

      const successLog = [
        '',
        `Successful embedding for chunk ${index + 1}/${totalChunks}`,
        `Length: ${chunk.length}, Tokens: ${tokens.length}`,
        `Preview: ${preview}`,
        `Completion: ${completionPercentage}% (${completedCount}/${totalChunks})`,
        '-------',
      ].join('\n')
      console.log(successLog)

      return {
        order: index + 1, // 1-based order
        chunk: sanitized,
        embedding,
        sourceType: 'user' as const,
      }
    } catch (err: unknown) {
      let message: string
      if (err instanceof Error) {
        message = err.message
      } else if (typeof err === 'string') {
        message = err
      } else {
        // JSON.stringify throws on circular structures and BigInt values and
        // yields undefined for values such as `undefined`. Neither case may
        // escape this handler, or the retry loop would be bypassed.
        try {
          message = JSON.stringify(err) ?? String(err)
        } catch {
          message = String(err)
        }
      }

      const errorLog = [
        '',
        `Attempt ${attempt}/${maxRetries} failed for chunk ${index + 1}/${totalChunks}`,
        `Length: ${chunk.length}, Tokens: ${tokens.length}`,
        `Preview: ${preview}`,
        `Error: ${message}`,
        '-------',
      ].join('\n')
      console.error(errorLog)

      if (attempt < maxRetries) {
        // Exponential backoff (500ms, 1s, 2s, 4s) plus 0-99ms of jitter.
        const jitter = randomInt(0, 100)
        const delay = 500 * 2 ** (attempt - 1) + jitter
        await new Promise<void>((resolve) => {
          setTimeout(() => resolve(), delay)
        })
      }
    }
  }

  const finalErrorLog = [
    '',
    `Failed permanently for chunk ${index + 1}/${totalChunks}`,
    `Length: ${chunk.length}, Tokens: ${tokens.length}`,
    `Preview: ${preview}`,
    '-------',
  ].join('\n')
  console.error(finalErrorLog)
  return null
}
165+
166+ const embeddingPromises = chunks . map ( ( chunk , index ) => embedChunk ( chunk , index ) )
167+ const settled = await Promise . all ( embeddingPromises )
168+
169+ const results = settled . filter ( ( r ) : r is EmbeddingChunk => r !== null )
170+
111171 const endTime = Date . now ( )
112172 const totalTimeTakenMs = endTime - startTime
113173 const totalTimeTakenSec = ( totalTimeTakenMs / 1000 ) . toFixed ( 2 )
114174 console . log (
115- `Generated ${ chunks . length } embeddings in ${ totalTimeTakenMs } ms (${ totalTimeTakenSec } s)` ,
175+ `Generated ${ results . length } / ${ chunks . length } embeddings in ${ totalTimeTakenMs } ms (${ totalTimeTakenSec } s)` ,
116176 )
117177
118178 return results
0 commit comments