3434# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
3535_SURROGATE_RE = re .compile (r"[\ud800-\udfff]" )
3636
37+
3738def to_valid_utf8_text (s : Optional [str ]) -> str :
3839 """Return a UTF-8 safe string for protobuf.
3940
@@ -42,9 +43,10 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
4243 """
4344 if not s :
4445 return ""
45- s = _SURROGATE_RE .sub ("\uFFFD " , s )
46+ s = _SURROGATE_RE .sub ("\ufffd " , s )
4647 return s .encode ("utf-8" , errors = "replace" ).decode ("utf-8" )
4748
49+
4850def read_text_with_fallback (file_path : str ) -> str :
4951 """Read text from file supporting multiple encodings with graceful fallback.
5052
@@ -67,6 +69,7 @@ def read_text_with_fallback(file_path: str) -> str:
6769 continue
6870 return raw .decode ("utf-8" , errors = "replace" )
6971
72+
7073# Ensure no existing handlers
7174for handler in logging .root .handlers [:]:
7275 logging .root .removeHandler (handler )
@@ -88,6 +91,7 @@ def read_text_with_fallback(file_path: str) -> str:
8891
8992parser = Parser ()
9093
94+
9195class DocReaderServicer (docreader_pb2_grpc .DocReaderServicer ):
9296 def __init__ (self ):
9397 super ().__init__ ()
@@ -127,29 +131,34 @@ def ReadFromFile(self, request, context):
127131 # Get Storage and VLM config from request
128132 storage_config = None
129133 vlm_config = None
130-
134+
131135 sc = request .read_config .storage_config
132136 # Keep parser-side key name as cos_config for backward compatibility
133137 storage_config = {
134- ' provider' : ' minio' if sc .provider == 2 else ' cos' ,
135- ' region' : sc .region ,
136- ' bucket_name' : sc .bucket_name ,
137- ' access_key_id' : sc .access_key_id ,
138- ' secret_access_key' : sc .secret_access_key ,
139- ' app_id' : sc .app_id ,
140- ' path_prefix' : sc .path_prefix ,
138+ " provider" : " minio" if sc .provider == 2 else " cos" ,
139+ " region" : sc .region ,
140+ " bucket_name" : sc .bucket_name ,
141+ " access_key_id" : sc .access_key_id ,
142+ " secret_access_key" : sc .secret_access_key ,
143+ " app_id" : sc .app_id ,
144+ " path_prefix" : sc .path_prefix ,
141145 }
142- logger .info (f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} " )
143-
146+ logger .info (
147+ f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} "
148+ )
149+
144150 vlm_config = {
145- 'model_name' : request .read_config .vlm_config .model_name ,
146- 'base_url' : request .read_config .vlm_config .base_url ,
147- 'api_key' : request .read_config .vlm_config .api_key or '' ,
148- 'interface_type' : request .read_config .vlm_config .interface_type or 'openai' ,
151+ "model_name" : request .read_config .vlm_config .model_name ,
152+ "base_url" : request .read_config .vlm_config .base_url ,
153+ "api_key" : request .read_config .vlm_config .api_key or "" ,
154+ "interface_type" : request .read_config .vlm_config .interface_type
155+ or "openai" ,
149156 }
150- logger .info (f"Using VLM config: model={ vlm_config ['model_name' ]} , "
151- f"base_url={ vlm_config ['base_url' ]} , "
152- f"interface_type={ vlm_config ['interface_type' ]} " )
157+ logger .info (
158+ f"Using VLM config: model={ vlm_config ['model_name' ]} , "
159+ f"base_url={ vlm_config ['base_url' ]} , "
160+ f"interface_type={ vlm_config ['interface_type' ]} "
161+ )
153162
154163 chunking_config = ChunkingConfig (
155164 chunk_size = chunk_size ,
@@ -177,10 +186,12 @@ def ReadFromFile(self, request, context):
177186 logger .info (
178187 f"Successfully parsed file { request .file_name } , returning { len (result .chunks )} chunks"
179188 )
180-
189+
181190 # Build response, including image info
182191 response = ReadResponse (
183- chunks = [self ._convert_chunk_to_proto (chunk ) for chunk in result .chunks ]
192+ chunks = [
193+ self ._convert_chunk_to_proto (chunk ) for chunk in result .chunks
194+ ]
184195 )
185196 logger .info (f"Response size: { response .ByteSize ()} bytes" )
186197 return response
@@ -220,29 +231,34 @@ def ReadFromURL(self, request, context):
220231 # Get Storage and VLM config from request
221232 storage_config = None
222233 vlm_config = None
223-
234+
224235 sc = request .read_config .storage_config
225236 storage_config = {
226- ' provider' : ' minio' if sc .provider == 2 else ' cos' ,
227- ' region' : sc .region ,
228- ' bucket_name' : sc .bucket_name ,
229- ' access_key_id' : sc .access_key_id ,
230- ' secret_access_key' : sc .secret_access_key ,
231- ' app_id' : sc .app_id ,
232- ' path_prefix' : sc .path_prefix ,
237+ " provider" : " minio" if sc .provider == 2 else " cos" ,
238+ " region" : sc .region ,
239+ " bucket_name" : sc .bucket_name ,
240+ " access_key_id" : sc .access_key_id ,
241+ " secret_access_key" : sc .secret_access_key ,
242+ " app_id" : sc .app_id ,
243+ " path_prefix" : sc .path_prefix ,
233244 }
234- logger .info (f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} " )
245+ logger .info (
246+ f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} "
247+ )
235248
236249 vlm_config = {
237- 'model_name' : request .read_config .vlm_config .model_name ,
238- 'base_url' : request .read_config .vlm_config .base_url ,
239- 'api_key' : request .read_config .vlm_config .api_key or '' ,
240- 'interface_type' : request .read_config .vlm_config .interface_type or 'openai' ,
250+ "model_name" : request .read_config .vlm_config .model_name ,
251+ "base_url" : request .read_config .vlm_config .base_url ,
252+ "api_key" : request .read_config .vlm_config .api_key or "" ,
253+ "interface_type" : request .read_config .vlm_config .interface_type
254+ or "openai" ,
241255 }
242- logger .info (f"Using VLM config: model={ vlm_config ['model_name' ]} , "
243- f"base_url={ vlm_config ['base_url' ]} , "
244- f"interface_type={ vlm_config ['interface_type' ]} " )
245-
256+ logger .info (
257+ f"Using VLM config: model={ vlm_config ['model_name' ]} , "
258+ f"base_url={ vlm_config ['base_url' ]} , "
259+ f"interface_type={ vlm_config ['interface_type' ]} "
260+ )
261+
246262 chunking_config = ChunkingConfig (
247263 chunk_size = chunk_size ,
248264 chunk_overlap = chunk_overlap ,
@@ -254,7 +270,9 @@ def ReadFromURL(self, request, context):
254270
255271 # Parse URL
256272 logger .info (f"Starting URL parsing process" )
257- result = self .parser .parse_url (request .url , request .title , chunking_config )
273+ result = self .parser .parse_url (
274+ request .url , request .title , chunking_config
275+ )
258276 if not result :
259277 error_msg = "Failed to parse URL"
260278 logger .error (error_msg )
@@ -266,9 +284,11 @@ def ReadFromURL(self, request, context):
266284 logger .info (
267285 f"Successfully parsed URL { request .url } , returning { len (result .chunks )} chunks"
268286 )
269-
287+
270288 response = ReadResponse (
271- chunks = [self ._convert_chunk_to_proto (chunk ) for chunk in result .chunks ]
289+ chunks = [
290+ self ._convert_chunk_to_proto (chunk ) for chunk in result .chunks
291+ ]
272292 )
273293 logger .info (f"Response size: { response .ByteSize ()} bytes" )
274294 return response
@@ -280,7 +300,7 @@ def ReadFromURL(self, request, context):
280300 context .set_code (grpc .StatusCode .INTERNAL )
281301 context .set_details (str (e ))
282302 return ReadResponse (error = str (e ))
283-
303+
284304 def _convert_chunk_to_proto (self , chunk ):
285305 """Convert internal Chunk object to protobuf Chunk message
286306 Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
@@ -294,10 +314,12 @@ def _convert_chunk_to_proto(self, chunk):
294314 start = getattr (chunk , "start" , 0 ),
295315 end = getattr (chunk , "end" , 0 ),
296316 )
297-
317+
298318 # If chunk has images attribute and is not empty, add image info
299319 if hasattr (chunk , "images" ) and chunk .images :
300- logger .info (f"Adding { len (chunk .images )} images to chunk { getattr (chunk , 'seq' , 0 )} " )
320+ logger .info (
321+ f"Adding { len (chunk .images )} images to chunk { getattr (chunk , 'seq' , 0 )} "
322+ )
301323 for img_info in chunk .images :
302324 # img_info expected as dict
303325 proto_image = Image (
@@ -309,9 +331,10 @@ def _convert_chunk_to_proto(self, chunk):
309331 end = int (img_info .get ("end" , 0 ) or 0 ),
310332 )
311333 proto_chunk .images .append (proto_image )
312-
334+
313335 return proto_chunk
314336
337+
315338def init_ocr_engine (ocr_backend , ocr_config ):
316339 """Initialize OCR engine"""
317340 try :
@@ -328,50 +351,53 @@ def init_ocr_engine(ocr_backend, ocr_config):
328351 return False
329352
330353
331- def serve ():
332-
333- init_ocr_engine (os .getenv ("OCR_BACKEND" , "paddle" ), {
334- "OCR_API_BASE_URL" : os .getenv ("OCR_API_BASE_URL" , "" ),
335- })
336-
354+ def main ():
355+ init_ocr_engine (
356+ os .getenv ("OCR_BACKEND" , "paddle" ),
357+ {
358+ "OCR_API_BASE_URL" : os .getenv ("OCR_API_BASE_URL" , "" ),
359+ },
360+ )
361+
337362 # Set max number of worker threads
338363 max_workers = int (os .environ .get ("GRPC_MAX_WORKERS" , "4" ))
339364 logger .info (f"Starting DocReader service with { max_workers } worker threads" )
340-
365+
341366 # Get port number
342367 port = os .environ .get ("GRPC_PORT" , "50051" )
343-
368+
344369 # Create server
345370 server = grpc .server (
346371 futures .ThreadPoolExecutor (max_workers = max_workers ),
347372 options = [
348- (' grpc.max_send_message_length' , MAX_MESSAGE_LENGTH ),
349- (' grpc.max_receive_message_length' , MAX_MESSAGE_LENGTH ),
373+ (" grpc.max_send_message_length" , MAX_MESSAGE_LENGTH ),
374+ (" grpc.max_receive_message_length" , MAX_MESSAGE_LENGTH ),
350375 ],
351376 )
352-
377+
353378 # Register services
354379 docreader_pb2_grpc .add_DocReaderServicer_to_server (DocReaderServicer (), server )
355-
380+
356381 # Register health check service
357382 health_servicer = HealthServicer ()
358383 health_pb2_grpc .add_HealthServicer_to_server (health_servicer , server )
359-
384+
360385 # Set listen address
361386 server .add_insecure_port (f"[::]:{ port } " )
362-
387+
363388 # Start service
364389 server .start ()
365-
390+
366391 logger .info (f"Server started on port { port } " )
367392 logger .info ("Server is ready to accept connections" )
368-
393+
369394 try :
370395 # Wait for service termination
371396 server .wait_for_termination ()
372397 except KeyboardInterrupt :
373398 logger .info ("Received termination signal, shutting down server" )
374399 server .stop (0 )
375400
401+
376402if __name__ == "__main__" :
377- serve ()
403+ main ()
0 commit comments