@@ -227,6 +227,61 @@ def _split_into_sentences(text: str) -> List[str]:
227227 return sentences
228228
229229
230+ def _split_long_segment (segment : str , max_length : int ) -> List [str ]:
231+ """Fallback splitter for oversized segments."""
232+ if len (segment ) <= max_length :
233+ return [segment ]
234+
235+ parts : List [str ] = []
236+ words = segment .split ()
237+
238+ if not words :
239+ for i in range (0 , len (segment ), max_length ):
240+ chunk = segment [i :i + max_length ]
241+ if chunk .strip ():
242+ parts .append (chunk )
243+ return parts
244+
245+ current_words : List [str ] = []
246+ current_len = 0
247+
248+ for word in words :
249+ word_len = len (word )
250+
251+ if word_len > max_length :
252+ if current_words :
253+ parts .append (' ' .join (current_words ))
254+ current_words = []
255+ current_len = 0
256+
257+ for i in range (0 , word_len , max_length ):
258+ chunk = word [i :i + max_length ]
259+ if chunk .strip ():
260+ parts .append (chunk )
261+ continue
262+
263+ separator = 1 if current_words else 0
264+ proposed = current_len + word_len + separator
265+
266+ if proposed <= max_length :
267+ if separator :
268+ current_len += 1
269+ current_words .append (word )
270+ current_len += word_len
271+ continue
272+
273+ if current_words :
274+ parts .append (' ' .join (current_words ))
275+
276+ current_words = [word ]
277+ current_len = word_len
278+
279+ if current_words :
280+ parts .append (' ' .join (current_words ))
281+
282+ return parts
283+
284+
230285def split_text_by_length (text : str , max_length : int = 4096 , preserve_words : bool = True ) -> List [str ]:
231286 """Split text into chunks no longer than ``max_length`` characters."""
232287 if not text :
@@ -236,6 +291,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
236291 return [text ]
237292
238293 chunks : List [str ] = []
294+ effective_max = max (1 , max_length )
295+ tolerance = min (32 , max (8 , effective_max // 10 ))
239296
240297 if preserve_words :
241298 sentences = _split_into_sentences (text )
@@ -257,8 +314,8 @@ def split_text_by_length(text: str, max_length: int = 4096, preserve_words: bool
257314 if current_segment :
258315 chunks .append (' ' .join (current_segment ))
259316
260- if len (sentence ) > max_length :
261- chunks .append ( sentence )
317+ if len (sentence ) > effective_max + tolerance :
318+ chunks .extend ( _split_long_segment ( sentence , max_length ) )
262319 current_segment = []
263320 current_length = 0
264321 continue
0 commit comments