diff --git a/packages/cli/src/repowise/cli/commands/init_cmd.py b/packages/cli/src/repowise/cli/commands/init_cmd.py index 96b01dc..5d4ccb6 100644 --- a/packages/cli/src/repowise/cli/commands/init_cmd.py +++ b/packages/cli/src/repowise/cli/commands/init_cmd.py @@ -408,6 +408,7 @@ async def _make_cost_tracker() -> CostTracker: progress=gen_callback, resume=resume, cost_tracker=cost_tracker, + generation_config=gen_config, ) ) @@ -977,6 +978,7 @@ def init_command( # Merge exclude_patterns from config.yaml and --exclude/-x flags config = load_config(repo_path) + language = config.get("language", "en") exclude_patterns: list[str] = list(config.get("exclude_patterns") or []) + list(exclude) # Resolve commit limit: CLI flag → config.yaml → default (500) @@ -1038,6 +1040,8 @@ def init_command( f" Provider: [cyan]{provider.provider_name}[/cyan] / Model: [cyan]{provider.model_name}[/cyan]" ) console.print(f" Embedder: [cyan]{embedder_name_resolved}[/cyan]") + if language != "en": + console.print(f" Language: [cyan]{language}[/cyan]") # Validate provider connection from repowise.core.providers.llm.base import ProviderError @@ -1149,8 +1153,7 @@ def init_command( # Cost estimation from repowise.core.generation import GenerationConfig - - gen_config = GenerationConfig(max_concurrency=concurrency) + gen_config = GenerationConfig(max_concurrency=concurrency, language=language) plans = build_generation_plan( result.parsed_files, result.graph_builder, gen_config, skip_tests, skip_infra ) @@ -1291,6 +1294,7 @@ async def _make_cost_tracker() -> CostTracker: progress=gen_callback, resume=resume, cost_tracker=cost_tracker, + generation_config=gen_config, ) ) diff --git a/packages/cli/src/repowise/cli/commands/update_cmd.py b/packages/cli/src/repowise/cli/commands/update_cmd.py index d228f1c..7676a75 100644 --- a/packages/cli/src/repowise/cli/commands/update_cmd.py +++ b/packages/cli/src/repowise/cli/commands/update_cmd.py @@ -88,7 +88,7 @@ def _on_done(result: "RepoUpdateResult") -> 
None: ) from repowise.core.workspace import RepoUpdateResult - + results = run_async( update_workspace( ws_root, @@ -121,6 +121,7 @@ def _on_done(result: "RepoUpdateResult") -> None: @click.option("--provider", "provider_name", default=None, help="LLM provider name.") @click.option("--model", default=None, help="Model identifier override.") @click.option("--since", default=None, help="Base git ref to diff from (overrides state).") +@click.option("--concurrency", type=int, default=5, help="Max concurrent LLM calls.") @click.option( "--cascade-budget", type=int, @@ -152,6 +153,7 @@ def update_command( dry_run: bool, workspace: bool, repo_alias: str | None, + concurrency: int = 5, ) -> None: """Incrementally update wiki pages for files changed since last sync.""" start = time.monotonic() @@ -209,11 +211,12 @@ def update_command( from repowise.core.generation import ContextAssembler, GenerationConfig, PageGenerator from repowise.core.ingestion import ASTParser, FileTraverser, GraphBuilder - config = GenerationConfig() + cfg = load_config(repo_path) + language = cfg.get("language", "en") + config = GenerationConfig(max_concurrency=concurrency, language=language) # Read exclude patterns from config (set during init or via web UI) - repo_config = load_config(repo_path) - exclude_patterns: list[str] = list(repo_config.get("exclude_patterns") or []) + exclude_patterns: list[str] = list(cfg.get("exclude_patterns") or []) # Full re-ingest for graph (needed for cascade analysis) traverser = FileTraverser(repo_path, extra_exclude_patterns=exclude_patterns or None) @@ -252,8 +255,8 @@ def update_command( try: from repowise.core.ingestion.git_indexer import GitIndexer - _commit_limit = repo_config.get("commit_limit") - _follow_renames = repo_config.get("follow_renames", False) + _commit_limit = cfg.get("commit_limit") + _follow_renames = cfg.get("follow_renames", False) git_indexer = GitIndexer( repo_path, commit_limit=_commit_limit, @@ -329,7 +332,7 @@ def update_command( # 
Generate affected pages assembler = ContextAssembler(config) - generator = PageGenerator(provider, assembler, config) + generator = PageGenerator(provider, assembler, config, language=config.language) repo_name = repo_path.name generated_pages = run_async( diff --git a/packages/core/src/repowise/core/generation/models.py b/packages/core/src/repowise/core/generation/models.py index f326b36..2cddf8c 100644 --- a/packages/core/src/repowise/core/generation/models.py +++ b/packages/core/src/repowise/core/generation/models.py @@ -82,7 +82,7 @@ class GenerationConfig: max_pages_pct: float = 0.10 # hard cap: total pages ≤ max(50, N_files * this) jobs_dir: str = ".repowise/jobs" large_file_source_pct: float = 0.4 # use structural summary when source tokens > budget * this - + language: str = "en" # --------------------------------------------------------------------------- # GeneratedPage diff --git a/packages/core/src/repowise/core/generation/page_generator.py b/packages/core/src/repowise/core/generation/page_generator.py index 40bf352..027ea2f 100644 --- a/packages/core/src/repowise/core/generation/page_generator.py +++ b/packages/core/src/repowise/core/generation/page_generator.py @@ -37,6 +37,24 @@ compute_page_id, compute_source_hash, ) +# Language name mapping for prompt clarity +_LANGUAGE_NAMES = { + "en": "English", + "ru": "Russian", + "es": "Spanish", + "fr": "French", + "de": "German", + "zh": "Chinese", + "ja": "Japanese", + "ko": "Korean", + "it": "Italian", + "pt": "Portuguese", + "nl": "Dutch", + "pl": "Polish", + "tr": "Turkish", + "ar": "Arabic", + "hi": "Hindi", +} log = structlog.get_logger(__name__) @@ -135,12 +153,14 @@ def __init__( assembler: ContextAssembler, config: GenerationConfig, jinja_env: jinja2.Environment | None = None, - vector_store: Any | None = None, # VectorStore | None + vector_store: Any | None = None, + language: str = "en", ) -> None: self._provider = provider self._assembler = assembler self._config = config self._vector_store = 
vector_store + self._language = language self._cache: dict[str, GeneratedResponse] = {} if jinja_env is None: @@ -152,7 +172,6 @@ def __init__( autoescape=False, ) self._jinja_env = jinja_env - # ------------------------------------------------------------------ # Per-type generation methods # ------------------------------------------------------------------ @@ -921,21 +940,41 @@ async def _generate_file_page_from_ctx( ) page.metadata["hallucination_warnings"] = hal_warnings return page - + async def _call_provider( self, page_type: str, user_prompt: str, request_id: str, ) -> GeneratedResponse: - """Call the provider with caching.""" key = self._compute_cache_key(page_type, user_prompt) if self._config.cache_enabled and key in self._cache: log.debug("Cache hit", page_type=page_type, key=key[:8]) return self._cache[key] + base_system = SYSTEM_PROMPTS[page_type] + + # Validate and sanitize language + lang_code = self._language.lower().strip() if self._language else "en" + # Remove any newlines or control characters (prevent prompt injection) + lang_code = ''.join(ch for ch in lang_code if ch.isalnum() or ch == '_') + if lang_code not in _LANGUAGE_NAMES: + log.warning("Unknown language code, falling back to English", language=lang_code) + lang_code = "en" + lang_name = _LANGUAGE_NAMES[lang_code] + + if lang_code != "en": + language_instruction = ( + f"Generate all documentation content in {lang_name}. " + "Keep all code, file paths, and symbol names in their original form. 
" + "Do not translate them.\n\n" + ) + system_prompt = language_instruction + base_system + else: + system_prompt = base_system + response = await self._provider.generate( - SYSTEM_PROMPTS[page_type], + system_prompt, user_prompt, max_tokens=self._config.max_tokens, temperature=self._config.temperature, @@ -946,10 +985,10 @@ async def _call_provider( self._cache[key] = response return response - + def _compute_cache_key(self, page_type: str, user_prompt: str) -> str: - """Return SHA256(model + page_type + user_prompt) as cache key.""" - raw = f"{self._provider.model_name}:{page_type}:{user_prompt}" + """Return SHA256(model + language + page_type + user_prompt) as cache key.""" + raw = f"{self._provider.model_name}:{self._language}:{page_type}:{user_prompt}" return hashlib.sha256(raw.encode()).hexdigest() def _build_generated_page( @@ -1299,6 +1338,7 @@ def _validate_symbol_references( continue # Check against known names base = ref.split(".")[-1] + if ref in known or base in known: continue # Skip if the ref is a substring of any known symbol (covers partial diff --git a/packages/core/src/repowise/core/pipeline/orchestrator.py b/packages/core/src/repowise/core/pipeline/orchestrator.py index 65535cd..773c557 100644 --- a/packages/core/src/repowise/core/pipeline/orchestrator.py +++ b/packages/core/src/repowise/core/pipeline/orchestrator.py @@ -321,6 +321,7 @@ async def _ingestion_stage() -> tuple: concurrency=concurrency, progress=progress, resume=resume, + generation_config=config, ) # ---- Execution flow tracing ----------------------------------------------- @@ -721,6 +722,7 @@ async def run_generation( vector_store: Any | None, concurrency: int, progress: ProgressCallback | None, + generation_config: Any, resume: bool = False, cost_tracker: Any | None = None, ) -> list[Any]: @@ -741,7 +743,9 @@ async def run_generation( if cost_tracker is not None and llm_client is not None and hasattr(llm_client, "_cost_tracker"): llm_client._cost_tracker = cost_tracker - config 
= GenerationConfig(max_concurrency=concurrency) + # Create a new config based on the passed generation_config, but with desired max_concurrency + from dataclasses import replace + config = replace(generation_config, max_concurrency=concurrency) assembler = ContextAssembler(config) # Resolve embedder and vector store @@ -750,8 +754,6 @@ async def run_generation( if vector_store is None: vector_store = InMemoryVectorStore(embedder_impl) - generator = PageGenerator(llm_client, assembler, config, vector_store=vector_store) - # Job system — use a temp-like dir under repo_path for checkpoints jobs_dir = repo_path / ".repowise" / "jobs" jobs_dir.mkdir(parents=True, exist_ok=True) @@ -778,6 +780,14 @@ def on_total_known(total: int) -> None: if progress: progress.on_phase_start("generation", total) + generator = PageGenerator( + llm_client, + assembler, + config, + vector_store=vector_store, + language=config.language, + ) + generated_pages = await generator.generate_all( parsed_files, source_map,