Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4f87c3c
docs: add filtering and worktree support design
michael-howell-island Mar 29, 2026
54376ca
docs: add filtering implementation plan
michael-howell-island Mar 29, 2026
6c299b2
feat: add PathFilter with default worktree/submodule skip rules
michael-howell-island Mar 29, 2026
81cc39c
feat: add .codesurfaceignore and --exclude glob support to PathFilter
michael-howell-island Mar 30, 2026
b6815e4
feat: thread PathFilter through parse_directory for dir and file excl…
michael-howell-island Mar 30, 2026
d13cb01
fix: add path_filter support to Go, Java, Python parse_directory over…
michael-howell-island Mar 30, 2026
c2dd5ff
feat: add --exclude and --include-submodules CLI args, wire PathFilte…
michael-howell-island Mar 30, 2026
67fe469
feat: add file_path scoping to search, get_signature, get_class tools
michael-howell-island Mar 30, 2026
4f08f53
fix: Path(rel) in incremental reindex, apply file_path to get_class_m…
michael-howell-island Mar 30, 2026
f7116b5
docs: add filtering features, install instructions for fork
michael-howell-island Mar 30, 2026
0385cc6
fix: replace rglob with os.walk to prune excluded dirs before descent
michael-howell-island Mar 31, 2026
cc24549
Merge pull request #1 from michael-howell-island/feat/filtering-and-w…
michael-howell-island Mar 31, 2026
8fd7a72
docs: add startup progress reporting design
michael-howell-island Mar 31, 2026
15d5c6b
docs: add startup progress implementation plan
michael-howell-island Mar 31, 2026
b56ef8a
feat: add on_progress callback to BaseParser.parse_directory
michael-howell-island Mar 31, 2026
8929c53
feat: forward on_progress callback in Python, Go, Java parser overrides
michael-howell-island Mar 31, 2026
be30dec
feat: stream indexing progress to stderr with file count and percentage
michael-howell-island Mar 31, 2026
3753853
fix: remove duplicate done line in main, strengthen progress test ass…
michael-howell-island Mar 31, 2026
075f619
fix: apply is_file_excluded in _count_files to match parser behavior
michael-howell-island Mar 31, 2026
b918d8b
fix: call on_progress even on parse failure, move sys imports to modu…
michael-howell-island Mar 31, 2026
260e019
perf: optimize indexing speed and fix startup hang on large JS/TS repos
michael-howell-island Apr 1, 2026
4dc4af7
fix: expand tilde in --project path to support ~/work/cloud style args
michael-howell-island Apr 1, 2026
84432c4
chore: remove fork-specific README additions and planning docs
michael-howell-island Apr 1, 2026
8075185
Merge PR #9 with conflict resolution
Codeturion Apr 1, 2026
77af855
Align C++ parser with BaseParser, restore TS test file skip
Codeturion Apr 1, 2026
e5380c6
Fix progress count accuracy and eliminate third directory walk
Codeturion Apr 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,8 @@ codesurface = "codesurface.server:main"

[tool.hatch.build.targets.wheel]
packages = ["src/codesurface"]

[dependency-groups]
dev = [
"pytest>=9.0.2",
]
85 changes: 52 additions & 33 deletions src/codesurface/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,14 @@ def delete_by_files(conn: sqlite3.Connection, file_paths: list[str]) -> int:


def search(conn: sqlite3.Connection, query: str, n: int = 10,
member_type: str | None = None) -> list[dict]:
member_type: str | None = None,
file_path: str | None = None) -> list[dict]:
"""Full-text search with BM25 ranking + PascalCase-aware matching.

Column weights: member_name (10x) > class_name (5x) > search_text (4x) > signature (3x) > fqn/summary (1x)
Type bonus: class/struct/enum defs rank higher than same-named members.

file_path: optional path prefix or exact file to scope results.
"""
clean = _escape_fts(query)
if not clean.strip():
Expand All @@ -157,27 +160,33 @@ def search(conn: sqlite3.Connection, query: str, n: int = 10,
ranking = """bm25(api_fts, 1.0, 5.0, 10.0, 0.5, 3.0, 4.0)
+ CASE WHEN r.member_type = 'type' THEN -1.0 ELSE 0.0 END"""

if member_type:
sql = f"""
SELECT r.*, {ranking} AS rank
FROM api_fts f
JOIN api_records r ON r.rowid = f.rowid
WHERE api_fts MATCH ? AND r.member_type = ?
ORDER BY rank
LIMIT ?
"""
rows = conn.execute(sql, (clean, member_type, n)).fetchall()
else:
sql = f"""
SELECT r.*, {ranking} AS rank
FROM api_fts f
JOIN api_records r ON r.rowid = f.rowid
WHERE api_fts MATCH ?
ORDER BY rank
LIMIT ?
"""
rows = conn.execute(sql, (clean, n)).fetchall()
conditions = ["api_fts MATCH ?"]
params: list = [clean]

if member_type:
conditions.append("r.member_type = ?")
params.append(member_type)

if file_path:
if file_path.endswith("/"):
conditions.append("r.file_path LIKE ?")
params.append(file_path + "%")
else:
conditions.append("(r.file_path = ? OR r.file_path LIKE ?)")
params.extend([file_path, file_path + "/%"])
Comment on lines +170 to +176
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for constructing the file_path SQL condition is duplicated here and in get_class_members (lines 212-218). Consider extracting this into a shared utility function to ensure consistency and reduce maintenance overhead.


where = " AND ".join(conditions)
params.append(n)

sql = f"""
SELECT r.*, {ranking} AS rank
FROM api_fts f
JOIN api_records r ON r.rowid = f.rowid
WHERE {where}
ORDER BY rank
LIMIT ?
"""
rows = conn.execute(sql, params).fetchall()
return [dict(row) for row in rows]


Expand All @@ -190,19 +199,29 @@ def get_by_fqn(conn: sqlite3.Connection, fqn: str) -> dict | None:


def get_class_members(conn: sqlite3.Connection, class_name: str,
                      namespace: str | None = None,
                      file_path: str | None = None) -> list[dict]:
    """Return all records of a class, optionally narrowed by namespace and file path.

    Args:
        conn: Open SQLite connection whose rows can be converted with ``dict(row)``.
        class_name: Exact ``class_name`` value to look up.
        namespace: When not ``None``, only rows with this exact namespace.
        file_path: Optional scope. A value ending in ``/`` is a directory
            prefix; any other value matches that exact file, or is treated
            as a directory and matches everything below it.

    Returns:
        Matching rows as dicts, ordered by ``member_type`` then ``member_name``.
    """
    conditions = ["class_name = ?"]
    params: list = [class_name]

    if namespace is not None:
        conditions.append("namespace = ?")
        params.append(namespace)

    if file_path:
        is_dir_prefix = file_path.endswith("/")
        prefix = file_path if is_dir_prefix else file_path + "/"
        # "_" and "%" are LIKE wildcards but perfectly ordinary path
        # characters; escape them so "my_mod/" cannot match "myXmod/".
        like_pattern = (
            prefix.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        ) + "%"
        if is_dir_prefix:
            conditions.append("file_path LIKE ? ESCAPE '\\'")
            params.append(like_pattern)
        else:
            conditions.append("(file_path = ? OR file_path LIKE ? ESCAPE '\\')")
            params.extend([file_path, like_pattern])

    where = " AND ".join(conditions)
    # Only fixed condition fragments are interpolated; every value is bound.
    rows = conn.execute(
        f"SELECT * FROM api_records WHERE {where} ORDER BY member_type, member_name",
        params,
    ).fetchall()
    return [dict(row) for row in rows]


Expand Down
126 changes: 126 additions & 0 deletions src/codesurface/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Path filtering for codesurface indexing.

Handles default exclusions (worktrees, submodules, vendored/build dirs)
and user-configured exclusions (.codesurfaceignore, --exclude CLI flag).
"""
from __future__ import annotations

import fnmatch
from pathlib import Path

# Directories excluded by name in every project — vendored deps, build
# output, VCS internals, and IDE config that never contain user source.
_DEFAULT_EXCLUDED_DIRS: frozenset[str] = frozenset({
# JS / Node
"node_modules", "bower_components",
# Python
".venv", "venv", "env", "__pycache__", ".tox", ".mypy_cache",
".pytest_cache", "site-packages",
# Go
"vendor", "testdata", "third_party", "examples", "example",
# .NET / Java
"bin", "obj", "packages", ".gradle", ".mvn",
"generated", "generated-sources", "generated-test-sources",
# Build output / caches
"dist", "build", "out", "target", ".next", ".nuxt", ".nx",
# VCS / IDE
".git", ".hg", ".svn",
".idea", ".vscode", ".vs",
# Misc
".yarn", ".pnp", "coverage", ".turbo", ".cache", ".worktrees",
})


def _read_git_file(path: Path) -> str | None:
"""Read .git FILE content if present. Returns None if .git is a directory."""
git = path / ".git"
if git.is_file():
try:
return git.read_text().strip()
except OSError:
return None
return None


def _is_git_worktree(git_content: str) -> bool:
"""True if .git file references a worktrees/ path."""
return "/worktrees/" in git_content


def _is_git_submodule(git_content: str) -> bool:
"""True if .git file references a modules/ path."""
return "/modules/" in git_content


def _read_ignore_file(project_root: Path) -> list[str]:
"""Read .codesurfaceignore and return non-empty, non-comment lines."""
ignore_path = project_root / ".codesurfaceignore"
if not ignore_path.is_file():
return []
lines = []
for line in ignore_path.read_text().splitlines():
stripped = line.strip()
if stripped and not stripped.startswith("#"):
lines.append(stripped)
return lines


class PathFilter:
    """Decide which directories and files are skipped during indexing.

    Directory exclusions (``is_dir_excluded``):
      - any directory whose basename is in ``_DEFAULT_EXCLUDED_DIRS``
        (this includes ``.worktrees``)
      - any directory holding a ``.git`` FILE that references ``/worktrees/``
        (git worktree)
      - any directory holding a ``.git`` FILE that references ``/modules/``
        (submodule), unless ``include_submodules=True``

    File exclusions (``is_file_excluded`` / ``is_file_excluded_rel``):
      - fnmatch-style globs from ``exclude_globs`` (CLI) followed by the
        lines of the project's ``.codesurfaceignore``, matched against the
        file path relative to the project root.

    User globs are consulted for files only; they never prune a directory.
    """

    def __init__(
        self,
        project_root: Path,
        exclude_globs: list[str] | None = None,
        include_submodules: bool = False,
    ) -> None:
        """Collect the exclusion globs for *project_root*.

        Args:
            project_root: Root that file paths are made relative to.
            exclude_globs: Extra glob patterns; the list is copied, not kept.
            include_submodules: When True, submodule directories are kept.
        """
        self._root = project_root
        self._include_submodules = include_submodules
        self._globs: list[str] = list(exclude_globs or [])
        self._globs.extend(_read_ignore_file(project_root))

    def is_dir_excluded_name(self, name: str) -> bool:
        """Fast check using only the directory basename (no I/O)."""
        return name in _DEFAULT_EXCLUDED_DIRS

    def is_dir_excluded(self, path: Path) -> bool:
        """Return True if the directory *path* should be skipped entirely.

        Checks the basename first, then reads ``path/.git`` (if it is a
        file) to detect git worktrees and submodules.
        """
        if self.is_dir_excluded_name(path.name):
            return True

        # A .git FILE (rather than a directory) marks a worktree or submodule.
        git_content = _read_git_file(path)
        if git_content is None:
            return False
        if _is_git_worktree(git_content):
            return True
        return _is_git_submodule(git_content) and not self._include_submodules

    def is_file_excluded(self, path: Path) -> bool:
        """Return True if the file *path* matches any user exclusion glob.

        *path* is made relative to the project root first; a path outside
        the root is never excluded. Callers that already hold a
        root-relative, forward-slash path can call ``is_file_excluded_rel``
        directly and skip the ``relative_to`` work in hot loops.
        """
        if not self._globs:
            return False
        try:
            # as_posix() only converts real path separators; a blanket
            # replace("\\", "/") would also rewrite a backslash that is part
            # of a POSIX file name.
            rel = path.relative_to(self._root).as_posix()
        except ValueError:
            return False
        return self.is_file_excluded_rel(rel)

    def is_file_excluded_rel(self, rel_path: str) -> bool:
        """Return True if the relative path *rel_path* matches any user exclusion glob."""
        if not self._globs:
            return False
        return any(fnmatch.fnmatch(rel_path, pattern) for pattern in self._globs)
44 changes: 33 additions & 11 deletions src/codesurface/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

from __future__ import annotations

import os
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from ..filters import PathFilter
from .base import BaseParser

_REGISTRY: dict[str, type[BaseParser]] = {}
Expand All @@ -28,22 +30,42 @@ def get_parser(lang: str) -> BaseParser:
return cls()


def detect_languages(
    project_dir: Path,
    path_filter: "PathFilter | None" = None,
) -> list[str]:
    """Return the sorted registered languages that have files under *project_dir*.

    The tree is traversed with os.walk. When *path_filter* is given, the
    directories it excludes are removed from the walk before descent, so
    their contents are never inspected.
    """
    known_exts = tuple(_EXT_TO_LANG)
    languages: set[str] = set()

    for current, subdirs, filenames in os.walk(project_dir):
        if path_filter is not None:
            base = Path(current)
            # In-place assignment is what makes os.walk skip the pruned dirs.
            subdirs[:] = [
                name for name in subdirs
                if not path_filter.is_dir_excluded(base / name)
            ]

        for fname in filenames:
            # First extension (in mapping order) that the file name ends with.
            hit = next((e for e in known_exts if fname.endswith(e)), None)
            if hit is not None:
                languages.add(_EXT_TO_LANG[hit])

        # Nothing more to learn once the count equals the registry size.
        if len(languages) == len(_REGISTRY):
            break

    return sorted(languages)


def get_parsers_for_project(
    project_dir: Path,
    path_filter: "PathFilter | None" = None,
) -> list[BaseParser]:
    """Build one parser instance per language detected in *project_dir*.

    *path_filter* is passed through unchanged to ``detect_languages``.
    """
    detected = detect_languages(project_dir, path_filter)
    return list(map(get_parser, detected))


def all_extensions() -> list[str]:
Expand Down
Loading