Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
c3ef0a4
created model, ingesting of pdf, chunking, embedding, and running the…
mdeekshita Mar 18, 2026
4d028cf
chore: created and updated sqlalchemy models for food, foodlog, tag a…
Jrodrigo06 Mar 17, 2026
fe7f837
feat/ Added similarity search and chunk_repository to query database
SiennaHarg9876 Mar 18, 2026
72cbbcb
WIP
Jrodrigo06 Mar 19, 2026
f5573e8
WIP: 2
Jrodrigo06 Mar 19, 2026
342cada
Added RAG Tagging Service
keshavgoel787 Mar 23, 2026
a24eade
Create Tests
keshavgoel787 Mar 23, 2026
aa5e999
feat: ingest pipeline — pdf parsing, chunking, embedding, vector stor…
Jrodrigo06 Mar 24, 2026
d5cd769
feat: created script to run different strategies and edited existing …
SiennaHarg9876 Mar 23, 2026
72757b3
feat: created script to run different strategies and edited existing …
SiennaHarg9876 Mar 23, 2026
c03cad7
3 different chunking + embedding models to compare
mdeekshita Mar 25, 2026
165c503
testing? potentially?
mdeekshita Mar 25, 2026
f9b071e
fix: resolve rebase conflicts and restore passing tests
Jrodrigo06 Mar 30, 2026
3d4f364
feat/Added import for semantic chunking
SiennaHarg9876 Mar 31, 2026
92f6c01
fix: updated gemini and setup env variable through container
Jrodrigo06 Mar 31, 2026
9f8a344
fix: update RAGTaggingService tests and config for google-genai SDK
Jrodrigo06 Apr 4, 2026
538d619
chore: restore package-lock.json to match main
Jrodrigo06 Apr 4, 2026
d0b7f42
Merge branch 'main' into feat/rag-knowledge-base
Jrodrigo06 Apr 4, 2026
5392c88
Fix
Jrodrigo06 Apr 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"permissions": {
"allow": [
"Bash(git -C \"f:/ADriveStuff/Generate/remetra\" diff HEAD -- backend/services/pdfconvert.py)"
]
}
}
Binary file added backend/data/raw/AIP Avoids.pdf
Binary file not shown.
Binary file added backend/data/raw/AIP Foods List.pdf
Binary file not shown.
Binary file added backend/data/raw/FDA 9 Examples.pdf
Binary file not shown.
Binary file added backend/data/raw/Histamine Chart.pdf
Binary file not shown.
Binary file added backend/data/raw/Low and High FODMAP Chart.pdf
Binary file not shown.
Binary file added backend/data/raw/NHS Low FODMAP Diet.pdf
Binary file not shown.
Binary file added backend/data/raw/PMC food intolerances.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions backend/models/food_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ class FoodLog(Base):
created_at = Column(DateTime(timezone=True), server_default=func.now())

food = relationship("Food", back_populates="food_logs")
food_log_tags = relationship("FoodLogTag", back_populates="food_log")
16 changes: 16 additions & 0 deletions backend/models/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ class Tag(Base):
name = Column(String, unique=True, nullable=False)
description = Column(String, nullable=True)
is_system = Column(Boolean, nullable=False, default=False)
llm_suggested = Column(Boolean, nullable=False, default=False)
created_at = Column(DateTime(timezone=True), server_default=func.now())

food_tags = relationship("FoodTag", back_populates="tag")
food_log_tags = relationship("FoodLogTag", back_populates="tag")


class FoodTag(Base):
Expand All @@ -36,3 +38,17 @@ class FoodTag(Base):
tag = relationship("Tag", back_populates="food_tags")

__table_args__ = (UniqueConstraint("food_id", "tag_id"),)


class FoodLogTag(Base):
    """Join table linking food logs to tags.

    One row per (food log, tag) application. Both foreign keys declare
    ``ondelete="CASCADE"``, so the database removes link rows when either
    the food log or the tag is deleted.
    """

    __tablename__ = "food_log_tags"

    # Surrogate primary key, generated client-side via uuid4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Parent food log; link row is removed by the DB when the log is deleted.
    food_log_id = Column(UUID(as_uuid=True), ForeignKey("food_logs.id", ondelete="CASCADE"), nullable=False)
    # Applied tag; link row is removed by the DB when the tag is deleted.
    tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False)
    # Populated by the database at insert time.
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # NOTE(review): unlike FoodTag, this table has no
    # UniqueConstraint("food_log_id", "tag_id"), so the same tag can be
    # attached to one food log multiple times — confirm that is intended.
    tag = relationship("Tag", back_populates="food_log_tags")
    food_log = relationship("FoodLog", back_populates="food_log_tags")
3 changes: 2 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ dependencies = [
"scipy>=1.17.1",
"sqlalchemy>=2.0.46",
"uvicorn[standard]>=0.40.0",
"google-generativeai>=0.8.0",
"google-genai>=1.0.0",
"torch",
"sentence-transformers>=5.3.0",
"pypdf>=6.9.2",
"semantic-text-splitter>=0.29.0",
]

[tool.uv.sources]
Expand Down
1 change: 1 addition & 0 deletions backend/schemas/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class TagBase(BaseModel):
name: str
description: Optional[str] = None
is_system: bool = False
llm_suggested: bool = False


class TagCreate(TagBase):
Expand Down
33 changes: 33 additions & 0 deletions backend/scripts/seed_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent))

from database import SessionLocal
from repositories.chunk_repository import ChunkRepository
from services.ingest import ingest_pdf


def main() -> None:
    """Re-seed the RAG knowledge base: wipe all stored chunks, then ingest
    every PDF under ``backend/data/raw/``.

    Command line:
        --strategy  Chunking strategy forwarded to ``ingest_pdf``
                    (default: "fixed").
    """
    # Parse CLI arguments before touching the database so that a bad
    # invocation (or ``--help``, which makes argparse exit) never opens a
    # session that would only be cleaned up at interpreter teardown.
    parser = argparse.ArgumentParser(description="Seed the knowledge base from raw PDFs.")
    parser.add_argument("--strategy", type=str, default="fixed")
    args = parser.parse_args()

    db_session = SessionLocal()
    try:
        chunk_repo = ChunkRepository()
        # Full wipe: every run replaces the entire chunk store.
        chunk_repo.clear_chunks(db_session)

        # NOTE(review): path is relative to the current working directory, so
        # this script must be launched from the repository root.
        for pdf_path in Path("backend/data/raw/").glob("*.pdf"):
            source = pdf_path.name
            with open(pdf_path, "rb") as file:
                chunks = ingest_pdf(file, source, args.strategy, model_type="all-MiniLM-L6-v2")
            chunk_repo.create_chunks(db_session, source, chunks)
    finally:
        db_session.close()


if __name__ == "__main__":
    main()
7 changes: 3 additions & 4 deletions backend/services/RAGTaggingService.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
from typing import Optional

import google.generativeai as genai
from google import genai
from sqlalchemy.orm import Session

from schemas.tag import (
Expand All @@ -25,8 +25,7 @@ class RAGTaggingService:
"""

def __init__(self):
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
self.llm = genai.GenerativeModel("gemini-1.5-flash")
self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def suggest(
self,
Expand Down Expand Up @@ -86,7 +85,7 @@ def suggest(

# 3. Call LLM
try:
response = self.llm.generate_content(prompt)
response = self.client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
raw_text = response.text.strip()

# Strip markdown code fences if the model wraps the JSON
Expand Down
26 changes: 19 additions & 7 deletions backend/services/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,29 @@
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embed(texts: list[str], model_type: str = "all-MiniLM-L6-v2") -> list[list[float]]:
    """Embed a list of strings with the selected embedding model.

    Args:
        texts: Strings to embed.
        model_type: "all-MiniLM-L6-v2" (local SentenceTransformer, default)
            or one of the OpenAI models "text-embedding-ada-002" /
            "text-embedding-3-large".

    Returns:
        One embedding vector (list of floats) per input string.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    if model_type == "all-MiniLM-L6-v2":
        # `model` is the module-level SentenceTransformer instance.
        embeddings = model.encode(texts, convert_to_numpy=True)
        return embeddings.tolist()

    if model_type in ("text-embedding-ada-002", "text-embedding-3-large"):  # pragma: no cover
        # NOTE(review): `openai.Embedding.create` is the legacy (<1.0) SDK
        # surface; openai>=1.0 requires `OpenAI().embeddings.create(...)`.
        # `openai` is also not declared in pyproject.toml — confirm this
        # path is actually exercised before relying on it.
        import openai

        response = openai.Embedding.create(input=texts, model=model_type)
        return [item["embedding"] for item in response["data"]]

    raise ValueError(f"Embedding Model not known: {model_type!r}")


def ingest_pdf(
file, source: str, strategy: str = "semantic", model_type: str = "all-MiniLM-L6-v2"
) -> list[dict[str, Any]]:
"""Parse, chunk, and embed a PDF. Returns chunk dicts ready to persist."""
full_text = convert(file)
chunks = chunk_text(full_text)
vectors = embed(chunks)
chunks = chunk_text(full_text, strategy=strategy)
vectors = embed(chunks, model_type)

return [
{
Expand Down
31 changes: 26 additions & 5 deletions backend/services/pdfconvert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re

from pypdf import PdfReader
from semantic_text_splitter import TextSplitter


def convert(data) -> str:
Expand All @@ -12,13 +15,31 @@ def convert(data) -> str:
return "\n".join(pages_text)


def chunk_text(
    text: str,
    strategy: str = "fixed",
    size: int = 512,
    overlap: int = 50,
    sentence_group: int = 3,
) -> list[str]:
    """Split text into chunks using the selected strategy.

    Args:
        text: Full document text.
        strategy: "fixed" (overlapping word windows), "semantic"
            (token-budgeted splitting via semantic-text-splitter), or
            "sentence" (groups of ``sentence_group`` sentences).
        size: Window size in words ("fixed") or max tokens ("semantic").
        overlap: Word overlap between consecutive "fixed" chunks.
        sentence_group: Sentences per chunk for the "sentence" strategy.

    Returns:
        List of non-empty chunk strings.

    Raises:
        ValueError: If ``strategy`` is not a supported name (previously an
            unknown strategy silently produced an empty list, which hides
            typos in ``--strategy``).
    """
    chunks: list[str] = []

    if strategy == "fixed":
        words = text.split()
        step = size - overlap
        for i in range(0, len(words), step):
            chunk = " ".join(words[i : i + size])
            if chunk:
                chunks.append(chunk)

    elif strategy == "semantic":
        # Token-aware splitting; token counting matches gpt-3.5-turbo's
        # tiktoken encoding.
        splitter = TextSplitter.from_tiktoken_model("gpt-3.5-turbo", size)
        chunks = splitter.chunks(text)

    elif strategy == "sentence":  # pragma: no cover
        # Naive sentence boundary: split after '.', '!' or '?' followed by
        # one or more spaces.
        sentences = [s.strip() for s in re.split(r"(?<=[.!?]) +", text) if s.strip()]
        for i in range(0, len(sentences), sentence_group):
            chunks.append(" ".join(sentences[i : i + sentence_group]))

    else:
        raise ValueError(f"Unknown chunking strategy: {strategy!r}")

    return chunks
27 changes: 27 additions & 0 deletions backend/tests/integration/test_chunk_embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pathlib import Path

from sqlalchemy.orm import Session

from repositories.chunk_repository import ChunkRepository
from services.ingest import ingest_pdf

RAW_PDF_DIR = Path("backend/data/raw/")
CHUNKING_STRATEGIES = ["fixed", "semantic", "sentence"]
EMBEDDING_MODELS = ["all-MiniLM-L6-v2", "text-embedding-ada-002", "text-embedding-3-large"]


def seed_experiments(dbSession: Session) -> None:
    """Seed the chunk store with every (chunking strategy, embedding model)
    combination over all PDFs in ``backend/data/raw/``, for comparison runs.

    NOTE(review): this lives under tests/ but has no ``test_`` prefix, so
    pytest will not collect it — presumably it is meant to be invoked
    manually. Confirm.
    """
    chunkRepo = ChunkRepository()

    for strategy in CHUNKING_STRATEGIES:
        for model_type in EMBEDDING_MODELS:
            print(f"\nSeeding chunks for strategy='{strategy}', model='{model_type}'")

            # Clear previous chunks for this new combination
            # NOTE(review): chunks below are ingested with source=pdf_path.stem,
            # not f"{strategy}_{model_type}", so this clear likely matches
            # nothing — verify against ChunkRepository.clear_chunks.
            chunkRepo.clear_chunks(dbSession, source=f"{strategy}_{model_type}")

            for pdf_path in RAW_PDF_DIR.glob("*.pdf"):
                source = pdf_path.stem
                with open(pdf_path, "rb") as file:
                    chunks_to_store = ingest_pdf(file, source=source, strategy=strategy, model_type=model_type)
                    # NOTE(review): seed_knowledge.py calls
                    # create_chunks(session, source, chunks) with three args —
                    # one of the two call sites disagrees with the repository
                    # signature; confirm which.
                    chunkRepo.create_chunks(dbSession, chunks_to_store)
42 changes: 21 additions & 21 deletions backend/tests/integration/test_rag_tagging_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ class TestRAGTaggingService:
@patch("services.RAGTaggingService.genai")
def test_suggest_with_ingredients_returns_structured_response(self, mock_genai, db_session):
"""Given a food with populated ingredients, returns non-empty suggested_ingredients with buckets."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "wheat flour", "buckets": ["gluten", "wheat"]}],
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "wheat flour", "buckets": ["gluten", "wheat"]}],
"suggested_buckets": [{"name": "gluten", "description": "contains wheat flour"}]}"""
)

Expand All @@ -33,10 +33,10 @@ def test_suggest_with_ingredients_returns_structured_response(self, mock_genai,
@patch("services.RAGTaggingService.genai")
def test_suggest_with_no_ingredients_returns_suggestions_from_food_name(self, mock_genai, db_session):
"""Given a food with ingredients=None, returns conservative suggestions without hallucinating."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],
"suggested_buckets": [{"name": "dairy", "description": "ice cream typically contains milk"}]}"""
)

Expand All @@ -54,10 +54,10 @@ def test_suggest_does_not_persist_to_db(self, mock_genai, db_session):
"""Suggestions must never be saved — no new rows in tags or food_tags tables."""
from models.tag import FoodTag, Tag

mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients":
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients":
[{"name": "cheese", "buckets": ["dairy"]}], "suggested_buckets":
[{"name": "dairy", "description": "contains cheese"}]}"""
)
Expand All @@ -74,9 +74,9 @@ def test_suggest_does_not_persist_to_db(self, mock_genai, db_session):
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_llm_json_parse_failure_gracefully(self, mock_genai, db_session):
"""If LLM returns malformed JSON, returns empty lists rather than crashing."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(text="this is not json")
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(text="this is not json")

service = RAGTaggingService()
result = service.suggest(db_session, "mystery food", ["unknown"])
Expand All @@ -88,9 +88,9 @@ def test_suggest_handles_llm_json_parse_failure_gracefully(self, mock_genai, db_
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_llm_call_failure_gracefully(self, mock_genai, db_session):
"""If the LLM call itself throws, returns empty lists rather than crashing."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.side_effect = Exception("API unavailable")
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.side_effect = Exception("API unavailable")

service = RAGTaggingService()
result = service.suggest(db_session, "some food", ["ingredient"])
Expand All @@ -102,15 +102,15 @@ def test_suggest_handles_llm_call_failure_gracefully(self, mock_genai, db_sessio
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_markdown_wrapped_json(self, mock_genai, db_session):
"""If LLM wraps JSON in markdown code fences, it should still parse correctly."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
json_text = (
"```json\n"
'{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],'
' "suggested_buckets": [{"name": "dairy", "description": "contains milk"}]}'
"\n```"
)
mock_llm.generate_content.return_value = MagicMock(text=json_text)
mock_client.models.generate_content.return_value = MagicMock(text=json_text)

service = RAGTaggingService()
result = service.suggest(db_session, "latte", ["milk", "espresso"])
Expand Down
Loading
Loading