monkscode · monkscode · Apr 24, 2026 · Apr 23, 2026 · Apr 24, 2026 · coderabbitai
diff --git a/.env.example b/.env.example
@@ -7,21 +7,15 @@
 #   - fastapi-latest / browser-service-latest (stable from main)
 #   - fastapi-develop / browser-service-develop (latest from develop)
 #   - fastapi-1001 / browser-service-1001 (specific version)
-FASTAPI_IMAGE_TAG=monkscode/nlrf:fastapi-pr-50
-BROWSER_SERVICE_IMAGE_TAG=monkscode/nlrf:browser-service-pr-50
-TEST_RUNNER_IMAGE_TAG=monkscode/nlrf:test-runner-pr-50
+FASTAPI_IMAGE_TAG=monkscode/nlrf:fastapi-local
+BROWSER_SERVICE_IMAGE_TAG=monkscode/nlrf:browser-service-local
+TEST_RUNNER_IMAGE_TAG=monkscode/nlrf:test-runner-local
 
-# --- Docker-in-Docker Configuration ---
-# CRITICAL: Set this to the host machine's absolute path to robot_tests directory
-# This is required for test execution in spawned Docker containers
-# 
-# Linux/Mac: Use $(pwd)/robot_tests
-# Windows: Use ${PWD}/robot_tests or full path like C:/Users/YourName/project/robot_tests
-#
-# Example values:
-#   Linux/Mac: /home/user/Natural-Language-to-Robot-Framework/robot_tests
-#   Windows: C:/Users/<username>/Documents/GitHub/Natural-Language-to-Robot-Framework/robot_tests
-HOST_ROBOT_TESTS_DIR=./robot_tests
+# --- Docker-in-Docker bind mount ---
+# HOST_ROBOT_TESTS_DIR is auto-set by docker-compose.yml to ${PWD}/robot_tests
+# and by run.sh to <script directory>/robot_tests (absolute). You normally do not need to set
+# it here. Override only if you run FastAPI outside compose (e.g., `docker run`)
+# and need a custom absolute host path for test-runner bind mounts.
 
 # Maximum time in seconds to wait for a test container to finish execution
 # Default: 1800 (30 minutes). Increase for very long test suites.

diff --git a/run.sh b/run.sh
@@ -8,6 +8,10 @@ export PYTHONUTF8=1
 # This uses pure Python protobuf parsing (slower but compatible)
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
+# Add Docker Desktop binaries to PATH so docker-credential-desktop is resolvable
+# by the Python docker SDK when authenticating against Docker Hub on Windows.
+export PATH="/c/Program Files/Docker/Docker/resources/bin:$PATH"
+
 # Check for .env file
 if [ ! -f "src/backend/.env" ]; then
     echo "Error: src/backend/.env file not found."
@@ -20,11 +24,25 @@ set -a
 source src/backend/.env
 set +a
 
+# --- Local-dev overrides (run.sh process only, never containers) ---
+# These override values from src/backend/.env for processes launched by this script.
+# Docker Compose reads src/backend/.env directly and does NOT source run.sh, so the
+# on-disk value (BROWSER_HEADLESS=true) remains authoritative for containers.
+export BROWSER_HEADLESS=false
+export LOG_FORMAT=console          # human-readable colored logs
+# export CREWAI_VERBOSE=true         # show agent reasoning in console
+
 # Support both APP_PORT (new) and PORT (legacy) variables with a sane default
 APP_PORT="${APP_PORT:-${PORT:-5000}}"
 export APP_PORT
 export PORT="$APP_PORT"
 
+# When running locally (not inside Docker Compose), HOST_ROBOT_TESTS_DIR must be
+# an absolute host path (Windows-style via `pwd -W` when available, plain `pwd` otherwise) so the Docker daemon can resolve the bind mount.
+# The .env value (./robot_tests) is relative and only valid inside Docker Compose.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -W 2>/dev/null || pwd)"
+export HOST_ROBOT_TESTS_DIR="${SCRIPT_DIR}/robot_tests"
+
 # Cross-platform venv activation and path handling
 VENV_DIR="venv"
 if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" || "$OSTYPE" == "win32" ]]; then

diff --git a/src/backend/.env.example b/src/backend/.env.example
@@ -150,6 +150,13 @@ OPTIMIZATION_CONTEXT_PRUNING_ENABLED=true
 # Default: 0.6
 OPTIMIZATION_CONTEXT_PRUNING_THRESHOLD=0.6
 
+# --- Log Format ---
+# Controls log output format for both FastAPI and BrowserUse services.
+# Options:
+#   console — human-readable colored output (recommended for local development)
+#   (unset, empty, or any other value) — JSON structured output (recommended for production/log aggregation)
+LOG_FORMAT=
+
 # --- LLM Observability ---
 # Controls where OTel traces are exported. Options:
 #   sqlite  — Built-in SQLite trace store at data/llm_traces.db (zero infra, default)

diff --git a/src/backend/config/logging_config.py b/src/backend/config/logging_config.py
@@ -67,7 +67,7 @@ def setup_logging(log_dir: str = "logs", log_level: str = "INFO") -> None:
     )
 
     renderer = (
-        structlog.dev.ConsoleRenderer()
+        structlog.dev.ConsoleRenderer(colors=sys.stdout.isatty())
         if os.environ.get("LOG_FORMAT", "").lower() == "console"
         else structlog.processors.JSONRenderer()
     )

diff --git a/src/backend/crew_ai/crew.py b/src/backend/crew_ai/crew.py
@@ -416,6 +416,16 @@ def run_crew(query: str, model_provider: str, model_name: str, library_type: str
             if optimization_metrics:
                 logger.info("📊 Optimization metrics collected")
 
+            # CrewAI's event bus dispatches sync handlers via ThreadPoolExecutor (fire-and-forget).
+            # TaskCompletedEvent for task 3 is submitted to the pool but may execute AFTER
+            # kickoff() returns and unregister_workflow() clears the routing maps — causing
+            # _resolve() to return None and silently drop the 100% event.
+            # Pushing 100% here (success path only) guarantees delivery before unregistration.
+            # _push_if_forward deduplicates: if the thread pool wins the race, this is a no-op.
+            if progress_queue is not None:
+                from src.backend.crew_ai.progress_events import _push_if_forward
+                _push_if_forward(workflow_id, progress_queue, "🎉 Test generation complete", 100)
+
             return result, crew, optimization_metrics, hint_metadata, agents.llm._monitor
 
         except Exception as e:

diff --git a/src/backend/crew_ai/library_context/browser_context.py b/src/backend/crew_ai/library_context/browser_context.py
@@ -87,7 +87,9 @@ def core_rules(self) -> str:
    text > role > data-testid > id > css > xpath
    - text=<value> → Most stable
    - role=<role>[name="<name>"] → Accessibility-first
-   - CSS selectors need no prefix
+   - css=<selector> → Always prefix CSS selectors with `css=` (e.g. `css=#searchBox`, `css=.btn`).
+     A bare `#` at the start of a variable value or argument is parsed as a Robot Framework
+     comment and the locator becomes empty at runtime.
 
 6. **COMMON PITFALLS:**
    ❌ Missing viewport config → Elements not found
@@ -205,7 +207,8 @@ def code_assembly_context(self) -> str:
 2. MUST include "New Context    viewport=None" for proper element detection
 3. Browser Library uses 'browser' and 'headless' parameters (NOT 'options')
 4. Browser Library auto-waits, so explicit waits are rarely needed
-5. Locators can be CSS selectors without prefix
+5. Always prefix CSS selectors with `css=` (e.g. `css=#searchBox`, `css=.btn`) — a bare `#` at the
+   start of a variable value is parsed as a Robot Framework comment and the locator becomes empty
 6. Text and role selectors are preferred for stability
 
 **KEYWORD REFERENCE:**

diff --git a/src/backend/crew_ai/optimization/nl_feedback_engine.py b/src/backend/crew_ai/optimization/nl_feedback_engine.py
@@ -42,6 +42,10 @@
 # Scope auto-determination — maps triage category to hint scope
 # ---------------------------------------------------------------------------
 
+# Scope map covers the full planned triage taxonomy, not just Phase 1.
+# "assertion" is pre-wired for DAY_08's wrong_assertion pattern (not yet implemented).
+# "positive"/"negative" are reserved for future LLM-triage paths that may bypass
+# the empty-text short-circuit. Do not remove — additions happen in Phase 3.
 _SCOPE_BY_CATEGORY = {
     "structural": "domain",
     "keyword": "global",
@@ -204,6 +208,14 @@ class NLFeedbackEngine(LearningEngine):
     _CROSS_REF_BOOST = 0.20
     _MAX_CONFIDENCE = 1.0
 
+    # Scope WHERE clause shared by get_hints and update_hint_effectiveness.
+    # Parameter order is (domain, url).
+    _SCOPE_WHERE = (
+        "scope = 'global' "
+        "OR (scope = 'domain' AND domain = ?) "
+        "OR (scope = 'url' AND url = ?)"
+    )
+
     def __init__(self, db_conn: sqlite3.Connection = None):
         """
         Initialize NLFeedbackEngine.
@@ -272,20 +284,13 @@ def process_feedback(
         best = matches[0]
         pattern_count = len(matches)
 
-        # Calculate confidence
-        confidence = min(
-            self._BASE_CONFIDENCE + (pattern_count * self._PATTERN_BOOST),
-            self._MAX_CONFIDENCE - self._CROSS_REF_BOOST,  # leave room for boost
-        )
-
-        # Cross-reference with error message
+        # Base + per-pattern boost, add cross-ref boost when error confirms, cap once.
+        confidence = self._BASE_CONFIDENCE + (pattern_count * self._PATTERN_BOOST)
         if error_message and self._error_confirms_feedback(
             best["taxonomy"], error_message
         ):
-            confidence = min(
-                confidence + self._CROSS_REF_BOOST,
-                self._MAX_CONFIDENCE,
-            )
+            confidence += self._CROSS_REF_BOOST
+        confidence = min(confidence, self._MAX_CONFIDENCE)
 
         result = {
             "category": best["category"],
@@ -331,7 +336,7 @@ def learn_from_feedback(self, record, feedback_insight) -> None:
 
         category = feedback_insight.get("category", "uncategorized")
         # Skip purely informational triage results (no text correction)
-        if category in ("positive",):
+        if category == "positive":
             return
 
         scope = _SCOPE_BY_CATEGORY.get(category, "domain")
@@ -422,11 +427,7 @@ def get_hints(
                 "       success_count, evidence_count "
                 "FROM nl_feedback_corrections "
                 "WHERE is_active = 1 "
-                "AND ( "
-                "    scope = 'global' "
-                "    OR (scope = 'domain' AND domain = ?) "
-                "    OR (scope = 'url' AND url = ?) "
-                ") "
+                f"AND ({self._SCOPE_WHERE}) "
                 "ORDER BY success_count DESC, "
                 "         evidence_count DESC, "
                 "         last_seen DESC "
@@ -450,12 +451,24 @@ def get_hints(
 
         # Format as hint strings (cap at 5)
         hints = [
-            f"\u26a0\ufe0f USER FEEDBACK: {h['feedback_text']}"
+            self._format_feedback_hint(h['feedback_text'])
             for h in deduped[:5]
         ]
 
         return hints if hints else None
 
+    def _format_feedback_hint(self, feedback_text: str) -> str:
+        # Pass feedback through verbatim with a universal warning header.
+        # The header travels with every hint to every agent (Planner, Assembler,
+        # Validator), so the "do not copy literally" guardrail applies even for
+        # agents whose task prompts lack an explicit disclaimer. Preserving the
+        # original line order retains the user's reasoning structure.
+        return (
+            "\u26a0\ufe0f USER FEEDBACK (reference context from a past test — "
+            "do NOT copy any code literally, treat as guidance only):\n"
+            f"{feedback_text.strip()}"
+        )
+
     def update_hint_effectiveness(
         self,
         domain: Optional[str],
@@ -487,11 +500,7 @@ def update_hint_effectiveness(
                 "       applied_count, success_count, failure_count "
                 "FROM nl_feedback_corrections "
                 "WHERE is_active = 1 "
-                "AND ( "
-                "    scope = 'global' "
-                "    OR (scope = 'domain' AND domain = ?) "
-                "    OR (scope = 'url' AND url = ?) "
-                ")",
+                f"AND ({self._SCOPE_WHERE})",
                 (domain, url),
             ).fetchall()
 

diff --git a/src/backend/crew_ai/progress_events.py b/src/backend/crew_ai/progress_events.py
@@ -174,8 +174,14 @@ def _push_if_forward(
     """Push a progress event only if it advances the current progress value.
 
     Thread-safe. Discards the event silently if progress would not increase.
+    Also ignores events for workflows whose queue has been unregistered — this
+    prevents a late thread-pool-dispatched TaskCompletedEvent handler from enqueuing a duplicate
+    100% event after crew.py's manual push + unregister_workflow() have run,
+    and from re-creating the _current_progress entry (memory leak).
     """
     with _lock:
+        if _workflow_queues.get(workflow_id) is not queue:
+            return
         current = _current_progress.get(workflow_id, 0)
         if progress <= current:
             return