Skip to content

Commit c1f731e

Browse files
committed
chore(docreader): 重新组织模块文件
1 parent 0d790ff commit c1f731e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3435
-2398
lines changed

.dockerignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
**/.venv/
2+
**/.python-version

docker-compose.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ services:
122122

123123
docreader:
124124
image: wechatopenai/weknora-docreader:latest
125+
build:
126+
context: .
127+
dockerfile: docker/Dockerfile.docreader
125128
container_name: WeKnora-docreader
126129
ports:
127130
- "${DOCREADER_PORT:-50051}:50051"

docker/Dockerfile.docreader

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,6 @@ RUN echo "检查本地protoc安装包..." && \
5353
rm -f ${PROTOC_PACKAGE}; \
5454
fi
5555

56-
# 复制依赖文件
57-
COPY services/docreader/requirements.txt .
58-
59-
# 安装依赖
60-
RUN pip cache purge && pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
61-
6256
# 预下载 PP-OCRv4 模型
6357
RUN mkdir -p /root/.paddleocr/whl/det/ch && \
6458
mkdir -p /root/.paddleocr/whl/rec/ch && \
@@ -80,16 +74,19 @@ RUN mkdir -p /root/.paddleocr/whl/det/ch && \
8074
rm -f /root/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer.tar && \
8175
rm -f /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer.tar
8276

83-
# 复制源代码和生成脚本
84-
COPY services/docreader/src/ /app/src/
85-
COPY services/docreader/scripts/ /app/scripts/
77+
# 复制依赖文件
78+
COPY docreader/pyproject.toml docreader/uv.lock ./
79+
RUN pip install uv --break-system-packages && \
80+
python -m uv sync --locked --no-dev
8681

87-
# 确保模型目录存在
88-
RUN ls -la /root/.paddleocr/whl/
82+
# 复制源代码和生成脚本
83+
COPY docreader .
8984

9085
# 生成 protobuf 代码
91-
RUN chmod +x /app/scripts/generate_proto.sh && bash /app/scripts/generate_proto.sh
86+
RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
9287

88+
# 确保模型目录存在
89+
RUN ls -la /root/.paddleocr/whl/
9390

9491
# =========================
9592
# 运行阶段
@@ -139,22 +136,24 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
139136
chmod +x /bin/grpc_health_probe
140137

141138
# 从构建阶段复制已安装的依赖和生成的代码
142-
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
139+
ENV VIRTUAL_ENV=/app/.venv
140+
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
141+
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
142+
143143
COPY --from=builder /usr/local/bin /usr/local/bin
144144
COPY --from=builder /root/.paddleocr /root/.paddleocr
145145

146146
# 安装 Playwright 浏览器
147147
RUN python -m playwright install webkit
148148
RUN python -m playwright install-deps webkit
149149

150-
COPY --from=builder /app/src /app/src
150+
# COPY docreader/scripts/download_deps.py download_deps.py
151+
# RUN python -m download_deps
151152

152-
# 设置 Python 路径
153-
ENV PYTHONPATH=/app/src
154-
RUN cd /app/src && python -m download_deps
153+
COPY --from=builder /app/ ./
155154

156155
# 暴露 gRPC 端口
157156
EXPOSE 50051
158157

159158
# 直接运行 Python 服务(日志输出到 stdout/stderr)
160-
CMD ["python", "/app/src/server/server.py"]
159+
CMD ["uv", "run", "main.py"]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

services/docreader/src/server/server.py renamed to docreader/main.py

Lines changed: 87 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
3535
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
3636

37+
3738
def to_valid_utf8_text(s: Optional[str]) -> str:
3839
"""Return a UTF-8 safe string for protobuf.
3940
@@ -42,9 +43,10 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
4243
"""
4344
if not s:
4445
return ""
45-
s = _SURROGATE_RE.sub("\uFFFD", s)
46+
s = _SURROGATE_RE.sub("\ufffd", s)
4647
return s.encode("utf-8", errors="replace").decode("utf-8")
4748

49+
4850
def read_text_with_fallback(file_path: str) -> str:
4951
"""Read text from file supporting multiple encodings with graceful fallback.
5052
@@ -67,6 +69,7 @@ def read_text_with_fallback(file_path: str) -> str:
6769
continue
6870
return raw.decode("utf-8", errors="replace")
6971

72+
7073
# Ensure no existing handlers
7174
for handler in logging.root.handlers[:]:
7275
logging.root.removeHandler(handler)
@@ -88,6 +91,7 @@ def read_text_with_fallback(file_path: str) -> str:
8891

8992
parser = Parser()
9093

94+
9195
class DocReaderServicer(docreader_pb2_grpc.DocReaderServicer):
9296
def __init__(self):
9397
super().__init__()
@@ -127,29 +131,34 @@ def ReadFromFile(self, request, context):
127131
# Get Storage and VLM config from request
128132
storage_config = None
129133
vlm_config = None
130-
134+
131135
sc = request.read_config.storage_config
132136
# Keep parser-side key name as cos_config for backward compatibility
133137
storage_config = {
134-
'provider': 'minio' if sc.provider == 2 else 'cos',
135-
'region': sc.region,
136-
'bucket_name': sc.bucket_name,
137-
'access_key_id': sc.access_key_id,
138-
'secret_access_key': sc.secret_access_key,
139-
'app_id': sc.app_id,
140-
'path_prefix': sc.path_prefix,
138+
"provider": "minio" if sc.provider == 2 else "cos",
139+
"region": sc.region,
140+
"bucket_name": sc.bucket_name,
141+
"access_key_id": sc.access_key_id,
142+
"secret_access_key": sc.secret_access_key,
143+
"app_id": sc.app_id,
144+
"path_prefix": sc.path_prefix,
141145
}
142-
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
143-
146+
logger.info(
147+
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
148+
)
149+
144150
vlm_config = {
145-
'model_name': request.read_config.vlm_config.model_name,
146-
'base_url': request.read_config.vlm_config.base_url,
147-
'api_key': request.read_config.vlm_config.api_key or '',
148-
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
151+
"model_name": request.read_config.vlm_config.model_name,
152+
"base_url": request.read_config.vlm_config.base_url,
153+
"api_key": request.read_config.vlm_config.api_key or "",
154+
"interface_type": request.read_config.vlm_config.interface_type
155+
or "openai",
149156
}
150-
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
151-
f"base_url={vlm_config['base_url']}, "
152-
f"interface_type={vlm_config['interface_type']}")
157+
logger.info(
158+
f"Using VLM config: model={vlm_config['model_name']}, "
159+
f"base_url={vlm_config['base_url']}, "
160+
f"interface_type={vlm_config['interface_type']}"
161+
)
153162

154163
chunking_config = ChunkingConfig(
155164
chunk_size=chunk_size,
@@ -177,10 +186,12 @@ def ReadFromFile(self, request, context):
177186
logger.info(
178187
f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
179188
)
180-
189+
181190
# Build response, including image info
182191
response = ReadResponse(
183-
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
192+
chunks=[
193+
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
194+
]
184195
)
185196
logger.info(f"Response size: {response.ByteSize()} bytes")
186197
return response
@@ -220,29 +231,34 @@ def ReadFromURL(self, request, context):
220231
# Get Storage and VLM config from request
221232
storage_config = None
222233
vlm_config = None
223-
234+
224235
sc = request.read_config.storage_config
225236
storage_config = {
226-
'provider': 'minio' if sc.provider == 2 else 'cos',
227-
'region': sc.region,
228-
'bucket_name': sc.bucket_name,
229-
'access_key_id': sc.access_key_id,
230-
'secret_access_key': sc.secret_access_key,
231-
'app_id': sc.app_id,
232-
'path_prefix': sc.path_prefix,
237+
"provider": "minio" if sc.provider == 2 else "cos",
238+
"region": sc.region,
239+
"bucket_name": sc.bucket_name,
240+
"access_key_id": sc.access_key_id,
241+
"secret_access_key": sc.secret_access_key,
242+
"app_id": sc.app_id,
243+
"path_prefix": sc.path_prefix,
233244
}
234-
logger.info(f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}")
245+
logger.info(
246+
f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
247+
)
235248

236249
vlm_config = {
237-
'model_name': request.read_config.vlm_config.model_name,
238-
'base_url': request.read_config.vlm_config.base_url,
239-
'api_key': request.read_config.vlm_config.api_key or '',
240-
'interface_type': request.read_config.vlm_config.interface_type or 'openai',
250+
"model_name": request.read_config.vlm_config.model_name,
251+
"base_url": request.read_config.vlm_config.base_url,
252+
"api_key": request.read_config.vlm_config.api_key or "",
253+
"interface_type": request.read_config.vlm_config.interface_type
254+
or "openai",
241255
}
242-
logger.info(f"Using VLM config: model={vlm_config['model_name']}, "
243-
f"base_url={vlm_config['base_url']}, "
244-
f"interface_type={vlm_config['interface_type']}")
245-
256+
logger.info(
257+
f"Using VLM config: model={vlm_config['model_name']}, "
258+
f"base_url={vlm_config['base_url']}, "
259+
f"interface_type={vlm_config['interface_type']}"
260+
)
261+
246262
chunking_config = ChunkingConfig(
247263
chunk_size=chunk_size,
248264
chunk_overlap=chunk_overlap,
@@ -254,7 +270,9 @@ def ReadFromURL(self, request, context):
254270

255271
# Parse URL
256272
logger.info(f"Starting URL parsing process")
257-
result = self.parser.parse_url(request.url, request.title, chunking_config)
273+
result = self.parser.parse_url(
274+
request.url, request.title, chunking_config
275+
)
258276
if not result:
259277
error_msg = "Failed to parse URL"
260278
logger.error(error_msg)
@@ -266,9 +284,11 @@ def ReadFromURL(self, request, context):
266284
logger.info(
267285
f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
268286
)
269-
287+
270288
response = ReadResponse(
271-
chunks=[self._convert_chunk_to_proto(chunk) for chunk in result.chunks]
289+
chunks=[
290+
self._convert_chunk_to_proto(chunk) for chunk in result.chunks
291+
]
272292
)
273293
logger.info(f"Response size: {response.ByteSize()} bytes")
274294
return response
@@ -280,7 +300,7 @@ def ReadFromURL(self, request, context):
280300
context.set_code(grpc.StatusCode.INTERNAL)
281301
context.set_details(str(e))
282302
return ReadResponse(error=str(e))
283-
303+
284304
def _convert_chunk_to_proto(self, chunk):
285305
"""Convert internal Chunk object to protobuf Chunk message
286306
Ensures all string fields are valid UTF-8 for protobuf (no lone surrogates).
@@ -294,10 +314,12 @@ def _convert_chunk_to_proto(self, chunk):
294314
start=getattr(chunk, "start", 0),
295315
end=getattr(chunk, "end", 0),
296316
)
297-
317+
298318
# If chunk has images attribute and is not empty, add image info
299319
if hasattr(chunk, "images") and chunk.images:
300-
logger.info(f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}")
320+
logger.info(
321+
f"Adding {len(chunk.images)} images to chunk {getattr(chunk, 'seq', 0)}"
322+
)
301323
for img_info in chunk.images:
302324
# img_info expected as dict
303325
proto_image = Image(
@@ -309,9 +331,10 @@ def _convert_chunk_to_proto(self, chunk):
309331
end=int(img_info.get("end", 0) or 0),
310332
)
311333
proto_chunk.images.append(proto_image)
312-
334+
313335
return proto_chunk
314336

337+
315338
def init_ocr_engine(ocr_backend, ocr_config):
316339
"""Initialize OCR engine"""
317340
try:
@@ -328,50 +351,53 @@ def init_ocr_engine(ocr_backend, ocr_config):
328351
return False
329352

330353

331-
def serve():
332-
333-
init_ocr_engine(os.getenv("OCR_BACKEND", "paddle"), {
334-
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
335-
})
336-
354+
def main():
355+
init_ocr_engine(
356+
os.getenv("OCR_BACKEND", "paddle"),
357+
{
358+
"OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
359+
},
360+
)
361+
337362
# Set max number of worker threads
338363
max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
339364
logger.info(f"Starting DocReader service with {max_workers} worker threads")
340-
365+
341366
# Get port number
342367
port = os.environ.get("GRPC_PORT", "50051")
343-
368+
344369
# Create server
345370
server = grpc.server(
346371
futures.ThreadPoolExecutor(max_workers=max_workers),
347372
options=[
348-
('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
349-
('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
373+
("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
374+
("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
350375
],
351376
)
352-
377+
353378
# Register services
354379
docreader_pb2_grpc.add_DocReaderServicer_to_server(DocReaderServicer(), server)
355-
380+
356381
# Register health check service
357382
health_servicer = HealthServicer()
358383
health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
359-
384+
360385
# Set listen address
361386
server.add_insecure_port(f"[::]:{port}")
362-
387+
363388
# Start service
364389
server.start()
365-
390+
366391
logger.info(f"Server started on port {port}")
367392
logger.info("Server is ready to accept connections")
368-
393+
369394
try:
370395
# Wait for service termination
371396
server.wait_for_termination()
372397
except KeyboardInterrupt:
373398
logger.info("Received termination signal, shutting down server")
374399
server.stop(0)
375400

401+
376402
if __name__ == "__main__":
377-
serve()
403+
main()
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)