diff --git a/.gitignore b/.gitignore index feb4b40..4c01451 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ build/ .coverage .pytest_cache/ .ruff_cache/ +.env +config.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index e6591cd..2d98cf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.1.2] - 2026-02-27 + +### Added +- `config-example.yaml` with example project configuration (`config.yaml` is gitignored) +- `src/docproc/config.py` — typed configuration loader with Pydantic models +- Environment variable substitution (`${VAR}`) with dotenv support +- Path resolution against project root +- Singleton caching for configuration +- Configuration validation (watch dir exists, recipients non-empty, API key set) + ## [0.1.1] - 2026-02-27 ### Added diff --git a/config-example.yaml b/config-example.yaml new file mode 100644 index 0000000..5ab6327 --- /dev/null +++ b/config-example.yaml @@ -0,0 +1,15 @@ +directories: + watch: "./inbox" + output: "./output" + +deepfellow: + base_url: "http://localhost:8000" + responses_endpoint: "/v1/responses" + api_key: "${DEEPFELLOW_API_KEY}" + vision_model: "gpt-4-vision" + llm_model: "deepseek" + rag_collection: "documents" + +recipients: + - name: "Piotr Zalewa" + tags: ["aquarium", "fish", "reef"] diff --git a/pyproject.toml b/pyproject.toml index a176fca..1ef47b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docproc" -version = "0.1.1" +version = "0.1.2" requires-python = ">=3.14" dependencies = [ "watchdog>=4.0.0", diff --git a/src/docproc/__init__.py b/src/docproc/__init__.py index 485f44a..b3f4756 100644 --- a/src/docproc/__init__.py +++ b/src/docproc/__init__.py @@ -1 +1 @@ -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/src/docproc/config.py b/src/docproc/config.py new file mode 100644 index 0000000..b690a0e --- /dev/null +++ b/src/docproc/config.py @@ -0,0 +1,199 @@ +"""Configuration loader for PaperRoute. + +Reads config.yaml, substitutes environment variables, resolves paths, +validates constraints, and caches the result as a singleton. +""" + +import os +import re +from pathlib import Path + +import yaml +from dotenv import load_dotenv +from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator + +_ENV_VAR_PATTERN = re.compile(r"\$\{([^}]+)\}") + + +class DirectoriesConfig(BaseModel): + model_config = ConfigDict(frozen=True) + + watch: Path + output: Path + + +class DeepfellowConfig(BaseModel): + model_config = ConfigDict(frozen=True) + + base_url: str = Field(min_length=1) + responses_endpoint: str = Field(min_length=1) + api_key: str + vision_model: str = Field(min_length=1) + llm_model: str = Field(min_length=1) + rag_collection: str = Field(min_length=1) + + @field_validator("api_key") + @classmethod + def api_key_must_not_be_blank(cls, v: str) -> str: + if not v.strip(): + msg = "API key must not be blank" + raise ValueError(msg) + return v + + +class Recipient(BaseModel): + model_config = ConfigDict(frozen=True) + + name: str = Field(min_length=1) + tags: tuple[str, ...] = Field(min_length=1) + + @field_validator("tags") + @classmethod + def tags_must_not_contain_blanks(cls, v: tuple[str, ...]) -> tuple[str, ...]: + for tag in v: + if not tag.strip(): + msg = "Tags must not contain blank strings" + raise ValueError(msg) + return v + + +class Config(BaseModel): + model_config = ConfigDict(frozen=True) + + directories: DirectoriesConfig + deepfellow: DeepfellowConfig + recipients: tuple[Recipient, ...] = Field(min_length=1) + + +_config: Config | None = None + + +def _substitute_env_vars(value: str) -> str: + """Replace ${VAR} patterns with environment variable values.""" + + def replacer(match: re.Match[str]) -> str: + var_name = match.group(1) + try: + return os.environ[var_name] + except KeyError: + msg = f"Environment variable '{var_name}' is not set" + raise ValueError(msg) from None + + return _ENV_VAR_PATTERN.sub(replacer, value) + + +def _process_env_vars(data: object) -> object: + """Recursively walk parsed YAML and substitute env vars in strings.""" + if isinstance(data, str): + return _substitute_env_vars(data) + if isinstance(data, dict): + return {k: _process_env_vars(v) for k, v in data.items()} + if isinstance(data, list): + return [_process_env_vars(item) for item in data] + return data + + +def _find_project_root() -> Path: + """Walk up from this file's directory looking for config.yaml.""" + current = Path(__file__).resolve().parent + while current != current.parent: + if (current / "config.yaml").exists(): + return current + current = current.parent + msg = "Could not find config.yaml in any parent directory" + raise FileNotFoundError(msg) + + +def _resolve_paths(config: Config, root: Path) -> Config: + """Resolve relative directory paths against the project root.""" + watch = config.directories.watch + output = config.directories.output + if not watch.is_absolute(): + watch = (root / watch).resolve() + if not output.is_absolute(): + output = (root / output).resolve() + return config.model_copy( + update={ + "directories": config.directories.model_copy( + update={"watch": watch, "output": output} + ) + } + ) + + +def _validate_config(config: Config) -> None: + """Validate runtime constraints that depend on the environment.""" + if not config.directories.watch.exists(): + msg = f"Watch directory does not exist: {config.directories.watch}" + raise FileNotFoundError(msg) + + +def load_config(config_path: Path | None = None) -> Config: + """Load, parse, validate, and cache the configuration.""" + global _config + if _config is not None: + if config_path is None: + return _config + msg = "Configuration is already loaded. Call _reset_config() first to reload." + raise RuntimeError(msg) + + load_dotenv() + + if config_path is None: + root = _find_project_root() + config_path = root / "config.yaml" + else: + config_path = config_path.resolve() + + root = config_path.parent + + if not config_path.is_file(): + msg = f"Configuration file not found: {config_path}" + raise FileNotFoundError(msg) + + try: + text = config_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError) as e: + msg = f"Failed to read configuration file {config_path}: {e}" + raise ValueError(msg) from e + + try: + raw = yaml.safe_load(text) + except yaml.YAMLError as e: + msg = f"Failed to parse configuration file {config_path}: {e}" + raise ValueError(msg) from e + + if not isinstance(raw, dict): + msg = f"Configuration file is empty or invalid: {config_path}" + raise ValueError(msg) + + try: + processed = _process_env_vars(raw) + except ValueError as e: + msg = f"Invalid configuration in {config_path}: {e}" + raise ValueError(msg) from e + + try: + config = Config.model_validate(processed) + except ValidationError as e: + msg = f"Invalid configuration in {config_path}: {e}" + raise ValueError(msg) from e + + config = _resolve_paths(config, root) + _validate_config(config) + + _config = config + return _config + + +def get_config() -> Config: + """Return cached config, loading it if necessary.""" + if _config is None: + return load_config() + return _config + + +def _reset_config() -> None: + """Clear the singleton cache (for tests).""" + global _config + _config = None diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..9f5003b --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,254 @@ +"""Tests for the configuration system.""" + +from pathlib import Path +from unittest import mock + +import pytest +import yaml + +from docproc.config import ( + Config, + _find_project_root, + _process_env_vars, + _reset_config, + _substitute_env_vars, + get_config, + load_config, +) + +MINIMAL_CONFIG = { + "directories": {"watch": "./inbox", "output": "./output"}, + "deepfellow": { + "base_url": "http://localhost:8000", + "responses_endpoint": "/v1/responses", + "api_key": "test-key", + "vision_model": "gpt-4-vision", + "llm_model": "deepseek", + "rag_collection": "documents", + }, + "recipients": [{"name": "Test User", "tags": ["tag1", "tag2"]}], +} + + +@pytest.fixture(autouse=True) +def reset_config(): + _reset_config() + yield + _reset_config() + + +@pytest.fixture() +def config_dir(tmp_path: Path) -> Path: + """Create a temporary directory with config.yaml and required dirs.""" + (tmp_path / "inbox").mkdir() + (tmp_path / "output").mkdir() + config_path = tmp_path / "config.yaml" + config_path.write_text(yaml.dump(MINIMAL_CONFIG)) + return tmp_path + + +# --- _substitute_env_vars --- + + +@pytest.mark.parametrize( + "template,env,expected", + [ + ("${MY_VAR}", {"MY_VAR": "hello"}, "hello"), + ("prefix-${MY_VAR}-suffix", {"MY_VAR": "mid"}, "prefix-mid-suffix"), + ("no vars here", {}, "no vars here"), + ("${A}and${B}", {"A": "1", "B": "2"}, "1and2"), + ], +) +def test_substitute_env_vars_replaces_patterns(template, env, expected): + with mock.patch.dict("os.environ", env, clear=True): + assert _substitute_env_vars(template) == expected + + +def test_substitute_env_vars_raises_on_missing_var(): + with ( + mock.patch.dict("os.environ", {}, clear=True), + pytest.raises(ValueError, match="MISSING_VAR"), + ): + _substitute_env_vars("${MISSING_VAR}") + + +# --- _process_env_vars --- + + +def test_process_env_vars_handles_nested_dict(): + data = {"outer": {"inner": "${VAR}"}} + with mock.patch.dict("os.environ", {"VAR": "value"}): + result = _process_env_vars(data) + assert result == {"outer": {"inner": "value"}} + + +def test_process_env_vars_handles_list(): + data = ["${VAR}", "plain"] + with mock.patch.dict("os.environ", {"VAR": "value"}): + result = _process_env_vars(data) + assert result == ["value", "plain"] + + +def test_process_env_vars_passes_non_strings_through(): + assert _process_env_vars(42) == 42 + assert _process_env_vars(3.14) == 3.14 + assert _process_env_vars(True) is True + assert _process_env_vars(None) is None + + +# --- load_config --- + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_returns_typed_config(mock_load_dotenv, config_dir): + config = load_config(config_dir / "config.yaml") + assert isinstance(config, Config) + assert config.deepfellow.base_url == "http://localhost:8000" + assert config.deepfellow.api_key == "test-key" + assert mock_load_dotenv.call_count == 1 + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_resolves_relative_paths(mock_load_dotenv, config_dir): + config = load_config(config_dir / "config.yaml") + assert config.directories.watch.is_absolute() + assert config.directories.output.is_absolute() + assert config.directories.watch == (config_dir / "inbox").resolve() + assert config.directories.output == (config_dir / "output").resolve() + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_loads_recipients(mock_load_dotenv, config_dir): + config = load_config(config_dir / "config.yaml") + assert len(config.recipients) == 1 + assert config.recipients[0].name == "Test User" + assert config.recipients[0].tags == ("tag1", "tag2") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_caches_singleton(mock_load_dotenv, config_dir): + config1 = load_config(config_dir / "config.yaml") + config2 = load_config() + assert config1 is config2 + + +@mock.patch("docproc.config.load_dotenv") +def test_reset_config_clears_cache(mock_load_dotenv, config_dir): + config1 = load_config(config_dir / "config.yaml") + _reset_config() + config2 = load_config(config_dir / "config.yaml") + assert config1 is not config2 + + +# --- validation errors --- + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_missing_watch_dir(mock_load_dotenv, tmp_path): + (tmp_path / "output").mkdir() + config_path = tmp_path / "config.yaml" + config_path.write_text(yaml.dump(MINIMAL_CONFIG)) + with pytest.raises(FileNotFoundError, match="Watch directory"): + load_config(config_path) + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_empty_recipients(mock_load_dotenv, config_dir): + config_data = {**MINIMAL_CONFIG, "recipients": []} + (config_dir / "config.yaml").write_text(yaml.dump(config_data)) + with pytest.raises(ValueError, match="Invalid configuration"): + load_config(config_dir / "config.yaml") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_missing_env_var(mock_load_dotenv, config_dir): + config_data = { + **MINIMAL_CONFIG, + "deepfellow": {**MINIMAL_CONFIG["deepfellow"], "api_key": "${NONEXISTENT}"}, + } + (config_dir / "config.yaml").write_text(yaml.dump(config_data)) + with ( + mock.patch.dict("os.environ", {}, clear=True), + pytest.raises(ValueError, match="NONEXISTENT"), + ): + load_config(config_dir / "config.yaml") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_blank_api_key(mock_load_dotenv, config_dir): + config_data = { + **MINIMAL_CONFIG, + "deepfellow": {**MINIMAL_CONFIG["deepfellow"], "api_key": " "}, + } + (config_dir / "config.yaml").write_text(yaml.dump(config_data)) + with pytest.raises(ValueError, match="Invalid configuration"): + load_config(config_dir / "config.yaml") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_empty_yaml(mock_load_dotenv, config_dir): + (config_dir / "config.yaml").write_text("") + with pytest.raises(ValueError, match="empty or invalid"): + load_config(config_dir / "config.yaml") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_missing_config_file(mock_load_dotenv, tmp_path): + with pytest.raises(FileNotFoundError, match="Configuration file not found"): + load_config(tmp_path / "nonexistent.yaml") + + +@mock.patch("docproc.config.load_dotenv") +def test_load_config_raises_on_invalid_yaml(mock_load_dotenv, config_dir): + (config_dir / "config.yaml").write_text("{{invalid: yaml: [") + with pytest.raises(ValueError, match="Failed to parse"): + load_config(config_dir / "config.yaml") + + +# --- get_config --- + + +@mock.patch("docproc.config.load_dotenv") +def test_get_config_returns_cached_instance(mock_load_dotenv, config_dir): + config1 = load_config(config_dir / "config.yaml") + config2 = get_config() + assert config1 is config2 + + +@mock.patch("docproc.config.load_dotenv") +def test_get_config_loads_on_first_call(mock_load_dotenv, config_dir): + import docproc.config as config_module + + with mock.patch.object( + config_module, "_find_project_root", return_value=config_dir + ): + config = get_config() + assert isinstance(config, Config) + + +# --- _find_project_root --- + + +def test_find_project_root_raises_when_no_config_found(tmp_path): + fake_file = tmp_path / "sub" / "deep" / "file.py" + fake_file.parent.mkdir(parents=True) + fake_file.touch() + import docproc.config as config_module + + with ( + mock.patch.object(config_module, "__file__", str(fake_file)), + pytest.raises(FileNotFoundError, match="config.yaml"), + ): + _find_project_root() + + +def test_find_project_root_finds_config_in_parent(tmp_path): + (tmp_path / "config.yaml").touch() + fake_file = tmp_path / "src" / "pkg" / "module.py" + fake_file.parent.mkdir(parents=True) + fake_file.touch() + import docproc.config as config_module + + with mock.patch.object(config_module, "__file__", str(fake_file)): + root = _find_project_root() + assert root == tmp_path diff --git a/tests/test_init.py b/tests/test_init.py index 50de518..ec200c9 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -2,4 +2,4 @@ def test_version_matches_expected(): - assert __version__ == "0.1.1" + assert __version__ == "0.1.2" diff --git a/uv.lock b/uv.lock index 46a95c8..c394fe8 100644 --- a/uv.lock +++ b/uv.lock @@ -184,7 +184,7 @@ wheels = [ [[package]] name = "docproc" -version = "0.1.1" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "gradio" },