From c57c2c4a9e9796dc212df3fee58d20746bc3313d Mon Sep 17 00:00:00 2001
From: cnYui
Date: Mon, 8 Jun 2026 20:30:51 +0900
Subject: [PATCH] fix: handle utf-8 chunk boundary detection

_decodes() returned False whenever decoding a chunk raised
UnicodeDecodeError. A fixed-size chunk can end in the middle of a
multibyte character, in which case the decode error says nothing about
the rest of the data.

Treat the chunk as decodable when the error reason is "unexpected end of
data" and the failing sequence starts within the last
_MAX_PARTIAL_CHARACTER_BYTES bytes of the chunk. Any other decode error
still returns False.

Add a regression test that writes 1023 ASCII bytes followed by a
three-byte UTF-8 character and checks that the node content round-trips.

---
 src/gitingest/utils/file_utils.py |  5 +++--
 tests/test_filesystem.py          | 33 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_filesystem.py

diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py
index 2c6ef74d..366c9248 100644
--- a/src/gitingest/utils/file_utils.py
+++ b/src/gitingest/utils/file_utils.py
@@ -15,6 +15,7 @@
     locale.setlocale(locale.LC_ALL, "C")
 
 _CHUNK_SIZE = 1024  # bytes
+_MAX_PARTIAL_CHARACTER_BYTES = 4
 
 
 def _get_preferred_encodings() -> list[str]:
@@ -72,6 +73,6 @@ def _decodes(chunk: bytes, encoding: str) -> bool:
     """
     try:
         chunk.decode(encoding)
-    except UnicodeDecodeError:
-        return False
+    except UnicodeDecodeError as exc:
+        return exc.reason == "unexpected end of data" and len(chunk) - exc.start <= _MAX_PARTIAL_CHARACTER_BYTES
     return True
diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py
new file mode 100644
index 00000000..066ae194
--- /dev/null
+++ b/tests/test_filesystem.py
@@ -0,0 +1,33 @@
+"""Tests for filesystem node content handling."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from gitingest.schemas import FileSystemNode, FileSystemNodeType
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pytest
+
+
+def test_content_keeps_utf8_text_when_multibyte_character_crosses_chunk_boundary(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Preserve UTF-8 text when a chunk ends inside a multibyte character."""
+    file_path = tmp_path / "boundary.py"
+    content = f"{'a' * 1023}漢\n"
+    file_path.write_text(content, encoding="utf-8")
+    monkeypatch.setattr("gitingest.schemas.filesystem._get_preferred_encodings", lambda: ["utf-8"])
+
+    node = FileSystemNode(
+        name=file_path.name,
+        type=FileSystemNodeType.FILE,
+        path_str=file_path.name,
+        path=file_path,
+        size=file_path.stat().st_size,
+    )
+
+    assert node.content == content