Codeturion · Codeturion · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 30, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,3 +38,8 @@ codesurface = "codesurface.server:main"
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/codesurface"]
+
+[dependency-groups]  # PEP 735 groups: local development only, not published package metadata
+dev = [
+    "pytest>=9.0.2",  # test runner
+]
diff --git a/src/codesurface/db.py b/src/codesurface/db.py
@@ -143,11 +143,14 @@ def delete_by_files(conn: sqlite3.Connection, file_paths: list[str]) -> int:
 
 
 def search(conn: sqlite3.Connection, query: str, n: int = 10,
-           member_type: str | None = None) -> list[dict]:
+           member_type: str | None = None,
+           file_path: str | None = None) -> list[dict]:
     """Full-text search with BM25 ranking + PascalCase-aware matching.
 
     Column weights: member_name (10x) > class_name (5x) > search_text (4x) > signature (3x) > fqn/summary (1x)
     Type bonus: class/struct/enum defs rank higher than same-named members.
+
+    file_path: optional path prefix or exact file to scope results.
     """
     clean = _escape_fts(query)
     if not clean.strip():
@@ -157,27 +160,33 @@ def search(conn: sqlite3.Connection, query: str, n: int = 10,
     ranking = """bm25(api_fts, 1.0, 5.0, 10.0, 0.5, 3.0, 4.0)
                 + CASE WHEN r.member_type = 'type' THEN -1.0 ELSE 0.0 END"""
 
-    if member_type:
-        sql = f"""
-            SELECT r.*, {ranking} AS rank
-            FROM api_fts f
-            JOIN api_records r ON r.rowid = f.rowid
-            WHERE api_fts MATCH ? AND r.member_type = ?
-            ORDER BY rank
-            LIMIT ?
-        """
-        rows = conn.execute(sql, (clean, member_type, n)).fetchall()
-    else:
-        sql = f"""
-            SELECT r.*, {ranking} AS rank
-            FROM api_fts f
-            JOIN api_records r ON r.rowid = f.rowid
-            WHERE api_fts MATCH ?
-            ORDER BY rank
-            LIMIT ?
-        """
-        rows = conn.execute(sql, (clean, n)).fetchall()
+    conditions = ["api_fts MATCH ?"]
+    params: list = [clean]
 
+    if member_type:
+        conditions.append("r.member_type = ?")
+        params.append(member_type)
+
+    if file_path:
+        # LIKE-escape the path so "_" and "%" in it match literally, not as wildcards.
+        esc = file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+        like = "r.file_path LIKE ? ESCAPE '\\'"
+        is_dir = file_path.endswith("/")
+        conditions.append(like if is_dir else f"(r.file_path = ? OR {like})")
+        params.extend([esc + "%"] if is_dir else [file_path, esc + "/%"])
+
+    where = " AND ".join(conditions)
+    params.append(n)
+
+    sql = f"""
+        SELECT r.*, {ranking} AS rank
+        FROM api_fts f
+        JOIN api_records r ON r.rowid = f.rowid
+        WHERE {where}
+        ORDER BY rank
+        LIMIT ?
+    """
+    rows = conn.execute(sql, params).fetchall()
     return [dict(row) for row in rows]
 
 
@@ -190,19 +199,29 @@ def get_by_fqn(conn: sqlite3.Connection, fqn: str) -> dict | None:
 
 
 def get_class_members(conn: sqlite3.Connection, class_name: str,
-                      namespace: str | None = None) -> list[dict]:
-    """Get all members of a class by class name, optionally filtered by namespace."""
+                      namespace: str | None = None,
+                      file_path: str | None = None) -> list[dict]:
+    """Get all members of a class by class name, optionally filtered by namespace and/or file_path."""
+    conditions = ["class_name = ?"]
+    params: list = [class_name]
+
     if namespace is not None:
-        rows = conn.execute(
-            "SELECT * FROM api_records WHERE class_name = ? AND namespace = ? "
-            "ORDER BY member_type, member_name",
-            (class_name, namespace),
-        ).fetchall()
-    else:
-        rows = conn.execute(
-            "SELECT * FROM api_records WHERE class_name = ? ORDER BY member_type, member_name",
-            (class_name,),
-        ).fetchall()
+        conditions.append("namespace = ?")
+        params.append(namespace)
+
+    if file_path:
+        # LIKE-escape the path so "_" and "%" in it match literally, not as wildcards.
+        esc = file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+        like = "file_path LIKE ? ESCAPE '\\'"
+        is_dir = file_path.endswith("/")
+        conditions.append(like if is_dir else f"(file_path = ? OR {like})")
+        params.extend([esc + "%"] if is_dir else [file_path, esc + "/%"])
+
+    where = " AND ".join(conditions)
+    rows = conn.execute(
+        f"SELECT * FROM api_records WHERE {where} ORDER BY member_type, member_name",
+        params,
+    ).fetchall()
     return [dict(row) for row in rows]
 
 

diff --git a/src/codesurface/filters.py b/src/codesurface/filters.py
@@ -0,0 +1,126 @@
+"""Path filtering for codesurface indexing.
+
+Handles default exclusions (worktrees, submodules, vendored/build dirs)
+and user-configured exclusions (.codesurfaceignore, --exclude CLI flag).
+"""
+from __future__ import annotations
+
+import fnmatch
+from pathlib import Path
+
+# Basenames pruned everywhere: vendored deps, build output, VCS internals, IDE config.
+# NOTE(review): "packages"/"examples"/"env" may also hold user source (JS monorepos) - confirm.
+_DEFAULT_EXCLUDED_DIRS: frozenset[str] = frozenset({
+    # JS / Node
+    "node_modules", "bower_components",
+    # Python
+    ".venv", "venv", "env", "__pycache__", ".tox", ".mypy_cache",
+    ".pytest_cache", "site-packages",
+    # Go
+    "vendor", "testdata", "third_party", "examples", "example",
+    # .NET / Java
+    "bin", "obj", "packages", ".gradle", ".mvn",
+    "generated", "generated-sources", "generated-test-sources",
+    # Build output / caches
+    "dist", "build", "out", "target", ".next", ".nuxt", ".nx",
+    # VCS / IDE
+    ".git", ".hg", ".svn",
+    ".idea", ".vscode", ".vs",
+    # Misc
+    ".yarn", ".pnp", "coverage", ".turbo", ".cache", ".worktrees",
+})
+
+
+def _read_git_file(path: Path) -> str | None:
+    """Return stripped ``path/.git`` text if it is a file, else None (dir, missing, unreadable)."""
+    git = path / ".git"
+    if not git.is_file():
+        return None
+    try:
+        return git.read_text(encoding="utf-8", errors="replace").strip()  # bad bytes never raise
+    except OSError:
+        return None
+
+
+def _is_git_worktree(git_content: str) -> bool:
+    """Report whether the .git file text contains a ``/worktrees/`` path segment."""
+    return git_content.find("/worktrees/") >= 0
+
+
+def _is_git_submodule(git_content: str) -> bool:
+    """Report whether the .git file text contains a ``/modules/`` path segment."""
+    return git_content.find("/modules/") >= 0
+
+
+def _read_ignore_file(project_root: Path) -> list[str]:
+    """Return the patterns in ``.codesurfaceignore`` (blank and ``#`` lines dropped).
+
+    Decoded as UTF-8 (BOM tolerated) rather than the platform default encoding.
+    """
+    ignore_path = project_root / ".codesurfaceignore"
+    if not ignore_path.is_file():
+        return []
+    text = ignore_path.read_text(encoding="utf-8-sig", errors="replace")
+    stripped = (line.strip() for line in text.splitlines())
+    return [s for s in stripped if s and not s.startswith("#")]
+
+
+class PathFilter:
+    """Decide which directories and files the indexer should skip.
+
+    Directories are skipped (independently of user globs) when:
+    - their basename is in _DEFAULT_EXCLUDED_DIRS (e.g. node_modules, .worktrees)
+    - they contain a .git FILE referencing /worktrees/ (git worktree)
+    - they contain a .git FILE referencing /modules/ (submodule),
+      unless include_submodules=True
+
+    Files are skipped when their root-relative "/"-separated path fnmatch-es a
+    pattern from exclude_globs (CLI) or .codesurfaceignore (project file).
+    """
+
+    def __init__(
+        self,
+        project_root: Path,
+        exclude_globs: list[str] | None = None,
+        include_submodules: bool = False,
+    ) -> None:
+        self._root = project_root
+        self._include_submodules = include_submodules
+        # Copy first so extending below never mutates the caller's list.
+        self._globs: list[str] = list(exclude_globs or [])
+        self._globs.extend(_read_ignore_file(project_root))
+
+    def is_dir_excluded_name(self, name: str) -> bool:
+        """Fast check using only the directory basename (no I/O)."""
+        return name in _DEFAULT_EXCLUDED_DIRS
+
+    def is_dir_excluded(self, path: Path) -> bool:
+        """Return True if this directory should be skipped entirely."""
+        if self.is_dir_excluded_name(path.name):
+            return True
+
+        # A .git FILE (not a directory) marks a worktree or submodule checkout.
+        git_content = _read_git_file(path)
+        if git_content is None:
+            return False
+        if _is_git_worktree(git_content):
+            return True
+        return _is_git_submodule(git_content) and not self._include_submodules
+
+    def is_file_excluded(self, path: Path) -> bool:
+        """Return True if this file matches any user exclusion glob.
+
+        Paths that are not under the project root are never excluded.
+        """
+        if not self._globs:
+            return False
+        try:
+            # as_posix() keeps the "/"-separated form on every platform.
+            rel = path.relative_to(self._root).as_posix()
+        except ValueError:
+            return False
+        return self.is_file_excluded_rel(rel)
+
+    def is_file_excluded_rel(self, rel_path: str) -> bool:
+        """Return True if a relative path matches any user exclusion glob."""
+        return any(fnmatch.fnmatch(rel_path, g) for g in self._globs)
diff --git a/src/codesurface/parsers/__init__.py b/src/codesurface/parsers/__init__.py
@@ -2,10 +2,12 @@
 
 from __future__ import annotations
 
+import os
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from ..filters import PathFilter
     from .base import BaseParser
 
 _REGISTRY: dict[str, type[BaseParser]] = {}
@@ -28,22 +30,42 @@ def get_parser(lang: str) -> BaseParser:
     return cls()
 
 
-def detect_languages(project_dir: Path) -> list[str]:
-    """Detect which registered languages are present in *project_dir*."""
+def detect_languages(
+    project_dir: Path,
+    path_filter: "PathFilter | None" = None,
+) -> list[str]:
+    """Detect which registered languages are present in *project_dir*.
+
+    Uses os.walk with *path_filter* pruning so vendored directories
+    (node_modules, .git, etc.) are skipped during detection.
+    """
+    exts = tuple(_EXT_TO_LANG.keys())
     found: set[str] = set()
-    for ext, lang in _EXT_TO_LANG.items():
-        # Quick check: does at least one file with this extension exist?
-        try:
-            next(project_dir.rglob(f"*{ext}"))
-            found.add(lang)
-        except StopIteration:
-            pass
+
+    n_langs = len(set(_EXT_TO_LANG.values()))  # `found` is drawn from this mapping
+    for root, dirs, files in os.walk(project_dir):
+        if path_filter is not None:
+            # Prune in place so os.walk never descends into excluded dirs.
+            parent = Path(root)
+            dirs[:] = [d for d in dirs if not path_filter.is_dir_excluded(parent / d)]
+
+        for filename in files:
+            if filename.endswith(exts):  # tuple form: one call rejects most files
+                found.add(next(_EXT_TO_LANG[e] for e in exts if filename.endswith(e)))
+
+        # Stop early once every language an extension can map to has been seen.
+        if len(found) == n_langs:
+            break
+
     return sorted(found)
 
 
-def get_parsers_for_project(project_dir: Path) -> list[BaseParser]:
+def get_parsers_for_project(
+    project_dir: Path,
+    path_filter: "PathFilter | None" = None,
+) -> list[BaseParser]:
     """Return parser instances for every language detected in *project_dir*."""
-    return [get_parser(lang) for lang in detect_languages(project_dir)]
+    return list(map(get_parser, detect_languages(project_dir, path_filter)))
 
 
 def all_extensions() -> list[str]: