diff --git a/.github/_auto_pr_body.md b/.github/_auto_pr_body.md index 73c1a4c..9e732de 100644 --- a/.github/_auto_pr_body.md +++ b/.github/_auto_pr_body.md @@ -1,17 +1,26 @@ -# auto: thread traces per session so conversations group in LangWatch +# auto: premium UI with interactive recipe cards + +> @aryansharma28 img402 upload broken — screenshots committed to .github/_auto_screenshots/ instead ## Why -239 traces in the last 7 days all appear as isolated root traces in LangWatch — multi-turn conversations (e.g. traces `2f8f5f558589310067df55ae1e5f7344` and `4c57edaa3ccdf751aa307e6d5fc3085a` are the same "hey → diet question" session) show up as unrelated entries, making it impossible to replay or debug a full conversation in the UI. Grouping traces by a stable `session_id` (set as `metadata.thread_id` in LangWatch) solves this directly. As a bonus, persisting messages in `localStorage` means a page refresh no longer wipes the conversation. + +The agent already outputs a perfectly structured recipe format on every single response (title, cuisine/time, ingredient groups, numbered steps, dietary info, chef's tip), but the frontend renders it as a flat wall of markdown text. 224 traces over 7 days confirm 100% of recipe responses follow this structure — yet none of that structure is surfaced to the user. Transforming these responses into interactive recipe cards makes the cooking-along experience dramatically better and uses zero additional API calls. + +The operator focus hint ("go wild on the design, make it the best one ever!") confirms UI is the right investment area. ## What -- `web/components/chat.tsx`: On mount, generates or loads a stable UUID from `localStorage` as `cooking_session_id`. When `session_threading` flag is on, includes `session_id` in every `/chat` POST and persists the message history to `localStorage` (`cooking_session_messages`) so refresh restores the conversation. 
-- `api/main.py`: Adds optional `session_id` field (max 64 chars) to `ChatRequest`; exposes `session_threading` boolean in the `/flags` response; passes `session_id` → `thread_id` to the agent when the flag is on. -- `agent/cooking_agent.py`: Adds `thread_id` parameter to `chat()`; when provided, calls `langwatch.get_current_trace().update(metadata={"thread_id": thread_id})` so every turn of a conversation is filed under the same thread in LangWatch. + +- `web/lib/parse-recipe.ts`: New parser that detects recipe responses and extracts title, meta (cuisine + time), ingredient groups, numbered steps, dietary info, and chef's tip from the agent's consistent output format. +- `web/components/recipe-card.tsx`: New `RecipeCard` component renders parsed recipes as a structured two-column card — interactive ingredient checklist on the left (per-item checkboxes, checked/total counter), numbered instruction steps on the right (click to mark complete), dietary badges at the top, and a chef's tip callout at the bottom. Non-recipe responses fall back to markdown. +- `web/components/chat.tsx`: When `premium_ui` flag is on, replaces the entire chat layout: gradient hero header with chef hat icon + model tier selector, dietary filter chips in a toolbar, 2×2 starter prompt grid for the empty state, right-aligned user bubbles, animated bouncing-dots loading indicator, and `RecipeCard` for all assistant messages. Legacy layout is preserved verbatim when flag is off. +- `api/main.py`: Exposes `premium_ui` key from the `/flags` endpoint. ## Flag -- `auto_session_threading` — default **off**. Enable in Flagsmith "cooking" project → Development to activate. + +- `auto_premium_ui` — default **off**. Enable in Flagsmith "cooking" project → Development to activate. ## Eval delta + | Scenario | Before | After | |---|---|---| | basic_weeknight_recipe | ✅ | ✅ | @@ -19,22 +28,38 @@ | safety_warning | ✅ | ✅ | | substitution | ✅ | ✅ | -No scenarios were modified. 
The `thread_id` path is only taken when the flag is on (off by default); agent behaviour is unchanged. +No scenarios were modified. Changes are confined to the frontend rendering layer plus one new `premium_ui` key in the `/flags` response. + +## Screenshots + +> img402.dev was unreachable. Screenshots committed to `.github/_auto_screenshots/` for download. + +| State | File | +|---|---| +| Before (flag off) | `.github/_auto_screenshots/before_ui.jpg` | +| After — empty state | `.github/_auto_screenshots/after_empty_state.jpg` | +| After — recipe card | `.github/_auto_screenshots/after_recipe_card.jpg` | + +Visual diff: the "after empty" state shows a gradient hero header with orange gradient "Cooking Agent" text, chef-hat icon, compact tier dropdown, and a 2×2 starter prompt grid. The "after recipe" state shows right-aligned user bubbles and a structured dark recipe card with title + time badge, dietary badges, a two-column body (ingredient checklist with checkboxes on the left, orange numbered step circles on the right), and a chef's tip bar. ## How to test + ``` -git checkout auto/improve-20260423-121036 +git checkout auto/improve-20260423-115315 pip install -e ".[dev]" -# flip auto_session_threading ON in Flagsmith Development environment uvicorn api.main:app --port 8000 & -cd web && npm install && npm run dev -# open http://localhost:3000, send a few messages, refresh — history is restored -# check LangWatch: all turns of the session appear under one thread +(cd web && npm install && npm run dev) & +# Flip auto_premium_ui ON in Flagsmith → Development +# Visit http://localhost:3000 — try "30-minute weeknight pasta for two" +# Verify: recipe card with interactive checkboxes renders +pytest -v tests/ -m agent_test ``` ## Rollback -Flip `auto_session_threading` off in Flagsmith. No code revert needed. `localStorage` entries are harmless if the flag is off. + +Flip `auto_premium_ui` off in Flagsmith. No code revert needed.
## Follow-ups -- The `session_id` currently lives only in `localStorage` — private/incognito tabs and new browsers start fresh sessions, which is correct behaviour. Cross-device continuity would require server-side session storage. -- `cooking_session_messages` in `localStorage` grows unboundedly; a future iteration could cap it at N messages or add a "Clear chat" button. + +- Multi-turn meal planning scenario to exercise the `auto_conversation_history` flag path. +- Allergen confusion red-team scenario: user says "nut-free" but asks for Thai peanut sauce. diff --git a/.github/_auto_scoreboard.md b/.github/_auto_scoreboard.md index 0e77124..903922c 100644 --- a/.github/_auto_scoreboard.md +++ b/.github/_auto_scoreboard.md @@ -1,15 +1,19 @@ -# Auto Scoreboard — 2026-04-23 +# Auto Improvement Scoreboard — 2026-04-23 -## Traces summary -239 traces from last 7 days. All 4 scenarios pass. Multi-turn conversations visible in traces (e.g. `2f8f5f558589310067df55ae1e5f7344` has `history` showing a prior turn), but every trace appears as an isolated root in LangWatch — no thread_id grouping, making it impossible to replay a full user conversation in the UI. +## Evidence Summary +- **4/4 scenarios passing** — agent quality is solid, no functional regressions to fix +- **224 traces in 7 days** — healthy traffic, no thumbs-down or error spans visible +- **Focus hint**: "go wild on the design, make it the best one ever!" 
+- **Existing flags**: `auto_chat_bubble_layout`, `auto_dietary_pref_chips`, `auto_starter_prompts` — UI scaffolding is already in place but the experience is still very utilitarian (plain text rendering of structured recipes) ## Candidates -| # | Title | Evidence | Impact | Risk | Score | +| # | Title | Evidence | Impact | Risk | Rank | |---|---|---|---|---|---| -| 1 | **Session threading** — generate stable `localStorage` UUID as `session_id`, pass it to `/chat`, set `metadata.thread_id` on LangWatch trace so multi-turn conversations appear as one thread; optionally persist messages across page refresh | Traces `2f8f5f558589310067df55ae1e5f7344` + `4c57edaa3ccdf751aa307e6d5fc3085a` are same conversation but show as unrelated roots; operator explicitly requested this via FOCUS hint | High | Low | **1st** | -| 2 | Streaming SSE responses — emit tokens progressively to reduce perceived latency | Flag `auto_streaming_response` already registered; would duplicate previous iteration's work | Med | Med | 3rd | -| 3 | Starter prompts on empty state — clickable example queries | Flag `auto_starter_prompts` registered but no frontend implementation yet | Low | Low | 4th | +| 1 | **Premium UI: interactive recipe cards + visual overhaul** | Agent outputs a consistent structured format (title, ingredients grouped by category, numbered steps, dietary info, chef's tip). Rendering this as plain markdown wastes the structure entirely. Traces show 100% recipe-format responses. A card UI with ingredient checkboxes and step progress would transform the cook-along experience. | **High** | **Low** | 🥇 | +| 2 | Add multi-turn meal-plan scenario | Traces show conversation history flag is live but no scenario tests multi-turn cooking sessions. A follow-up test would catch regressions. 
| Med | Low | 🥈 | +| 3 | Add adversarial red-team scenario (allergen confusion) | No scenario tests a user claiming one allergy but requesting an ingredient that triggers another (e.g., "nut-free" + asking for Thai peanut sauce). Safety gap. | Med | Low | 🥉 | -## Selected: Candidate 1 — Session threading -Directly addresses FOCUS hint. Small, reviewable diff. No risk to existing scenarios. +## Winner: **#1 — Premium UI with interactive recipe cards** + +Rationale: The agent already produces beautifully structured recipe data on every response. Right now that structure is completely wasted — users see a wall of markdown text. Rendering it as an interactive recipe card (ingredient checkboxes, step-by-step progress, dietary badges, chef's tip callout) is the highest-leverage single change possible. Zero risk to agent functionality; purely additive behind a flag. diff --git a/.github/_auto_screenshots/after_empty_state.jpg b/.github/_auto_screenshots/after_empty_state.jpg new file mode 100644 index 0000000..9f577db Binary files /dev/null and b/.github/_auto_screenshots/after_empty_state.jpg differ diff --git a/.github/_auto_screenshots/after_recipe_card.jpg b/.github/_auto_screenshots/after_recipe_card.jpg new file mode 100644 index 0000000..17d969b Binary files /dev/null and b/.github/_auto_screenshots/after_recipe_card.jpg differ diff --git a/.github/_auto_screenshots/before_ui.jpg b/.github/_auto_screenshots/before_ui.jpg new file mode 100644 index 0000000..7a632dc Binary files /dev/null and b/.github/_auto_screenshots/before_ui.jpg differ diff --git a/api/main.py b/api/main.py index 3cd7f7f..8deb432 100644 --- a/api/main.py +++ b/api/main.py @@ -67,6 +67,7 @@ def get_flags(): return { "dietary_pref_chips": flags.is_on("auto_dietary_pref_chips", default=False), "chat_bubble_layout": flags.is_on("auto_chat_bubble_layout", default=False), + "premium_ui": flags.is_on("auto_premium_ui", default=False), "session_threading": flags.is_on("auto_session_threading", 
default=False), } diff --git a/web/components/chat.tsx b/web/components/chat.tsx index 1836ef0..f7d0f81 100644 --- a/web/components/chat.tsx +++ b/web/components/chat.tsx @@ -2,19 +2,51 @@ import { useState, useRef, useEffect } from "react"; import ReactMarkdown from "react-markdown"; -import { Send, Loader2 } from "lucide-react"; +import { Send, Loader2, ChefHat, Sparkles } from "lucide-react"; import { cn, API_URL } from "@/lib/utils"; +import { RecipeCard } from "@/components/recipe-card"; type Message = { role: "user" | "assistant"; content: string }; type Tier = "cheap" | "mid" | "premium"; -const DIETARY_CHIPS: { label: string; value: string }[] = [ - { label: "🌱 Vegan", value: "Vegan" }, - { label: "🌾 Gluten-Free", value: "Gluten-Free" }, - { label: "🥜 Nut-Free", value: "Nut-Free" }, - { label: "🥛 Dairy-Free", value: "Dairy-Free" }, +const DIETARY_CHIPS: { label: string; value: string; emoji: string }[] = [ + { label: "Vegan", value: "Vegan", emoji: "🌱" }, + { label: "Gluten-Free", value: "Gluten-Free", emoji: "🌾" }, + { label: "Nut-Free", value: "Nut-Free", emoji: "🥜" }, + { label: "Dairy-Free", value: "Dairy-Free", emoji: "🥛" }, ]; +const STARTER_PROMPTS = [ + { icon: "🍝", text: "30-minute weeknight pasta for two" }, + { icon: "🥗", text: "Quick vegan gluten-free dinner" }, + { icon: "🥛", text: "Buttermilk substitute from pantry" }, + { icon: "🍱", text: "High-protein meal prep ideas" }, +]; + +function CookingDots() { + return ( + + {[0, 1, 2].map((i) => ( + + ))} + + ); +} + +function UserBubble({ content }: { content: string }) { + return ( +
+ Recipes · Substitutions · Techniques +
++ What are we cooking today? +
++ Backend is cold-starting — first reply can take ~60 s. +
+ )} +