Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
c3ef0a4
created model, ingesting of pdf, chunking, embedding, and running the…
mdeekshita Mar 18, 2026
4d028cf
chore: created and updated sqlalchemy models for food, foodlog, tag a…
Jrodrigo06 Mar 17, 2026
fe7f837
feat/ Added similarity search and chunk_repository to query database
SiennaHarg9876 Mar 18, 2026
72cbbcb
WIP
Jrodrigo06 Mar 19, 2026
f5573e8
WIP: 2
Jrodrigo06 Mar 19, 2026
342cada
Added RAG Tagging Service
keshavgoel787 Mar 23, 2026
a24eade
Create Tests
keshavgoel787 Mar 23, 2026
aa5e999
feat: ingest pipeline — pdf parsing, chunking, embedding, vector stor…
Jrodrigo06 Mar 24, 2026
d5cd769
feat: created script to run different strategies and edited existing …
SiennaHarg9876 Mar 23, 2026
72757b3
feat: created script to run different strategies and edited existing …
SiennaHarg9876 Mar 23, 2026
c03cad7
3 different chunking + embedding models to compare
mdeekshita Mar 25, 2026
165c503
testing? potentially?
mdeekshita Mar 25, 2026
f9b071e
fix: resolve rebase conflicts and restore passing tests
Jrodrigo06 Mar 30, 2026
3d4f364
feat/Added import for semantic chunking
SiennaHarg9876 Mar 31, 2026
92f6c01
fix: updated gemini and setup env variable through container
Jrodrigo06 Mar 31, 2026
9f8a344
fix: update RAGTaggingService tests and config for google-genai SDK
Jrodrigo06 Apr 4, 2026
538d619
chore: restore package-lock.json to match main
Jrodrigo06 Apr 4, 2026
d0b7f42
Merge branch 'main' into feat/rag-knowledge-base
Jrodrigo06 Apr 4, 2026
5392c88
Fix
Jrodrigo06 Apr 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"permissions": {
"allow": [
"Bash(git -C \"f:/ADriveStuff/Generate/remetra\" diff HEAD -- backend/services/pdfconvert.py)"
]
}
}
Binary file added backend/data/raw/AIP Avoids.pdf
Binary file not shown.
Binary file added backend/data/raw/AIP Foods List.pdf
Binary file not shown.
Binary file added backend/data/raw/FDA 9 Examples.pdf
Binary file not shown.
Binary file added backend/data/raw/Histamine Chart.pdf
Binary file not shown.
Binary file added backend/data/raw/Low and High FODMAP Chart.pdf
Binary file not shown.
Binary file added backend/data/raw/NHS Low FODMAP Diet.pdf
Binary file not shown.
Binary file added backend/data/raw/PMC food intolerances.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions backend/models/food_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ class FoodLog(Base):
created_at = Column(DateTime(timezone=True), server_default=func.now())

food = relationship("Food", back_populates="food_logs")
food_log_tags = relationship("FoodLogTag", back_populates="food_log")
16 changes: 16 additions & 0 deletions backend/models/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ class Tag(Base):
name = Column(String, unique=True, nullable=False)
description = Column(String, nullable=True)
is_system = Column(Boolean, nullable=False, default=False)
llm_suggested = Column(Boolean, nullable=False, default=False)
created_at = Column(DateTime(timezone=True), server_default=func.now())

food_tags = relationship("FoodTag", back_populates="tag")
food_log_tags = relationship("FoodLogTag", back_populates="tag")


class FoodTag(Base):
Expand All @@ -36,3 +38,17 @@ class FoodTag(Base):
tag = relationship("Tag", back_populates="food_tags")

__table_args__ = (UniqueConstraint("food_id", "tag_id"),)


class FoodLogTag(Base):
    """Join table linking food logs to tags.

    One row per (food log, tag) application. Both foreign keys declare
    ``ondelete="CASCADE"``, so the database removes link rows when either
    the food log or the tag is deleted.
    """

    __tablename__ = "food_log_tags"

    # Surrogate primary key, generated client-side via uuid4.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Parent food log; link row is removed by the DB when the log is deleted.
    food_log_id = Column(UUID(as_uuid=True), ForeignKey("food_logs.id", ondelete="CASCADE"), nullable=False)
    # Applied tag; link row is removed by the DB when the tag is deleted.
    tag_id = Column(UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False)
    # Populated by the database at insert time.
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # NOTE(review): unlike FoodTag, this table has no
    # UniqueConstraint("food_log_id", "tag_id"), so the same tag can be
    # attached to one food log multiple times — confirm that is intended.
    tag = relationship("Tag", back_populates="food_log_tags")
    food_log = relationship("FoodLog", back_populates="food_log_tags")
3 changes: 2 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ dependencies = [
"scipy>=1.17.1",
"sqlalchemy>=2.0.46",
"uvicorn[standard]>=0.40.0",
"google-generativeai>=0.8.0",
"google-genai>=1.0.0",
"torch",
"sentence-transformers>=5.3.0",
"pypdf>=6.9.2",
"semantic-text-splitter>=0.29.0",
]

[tool.uv.sources]
Expand Down
1 change: 1 addition & 0 deletions backend/schemas/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class TagBase(BaseModel):
name: str
description: Optional[str] = None
is_system: bool = False
llm_suggested: bool = False


class TagCreate(TagBase):
Expand Down
33 changes: 33 additions & 0 deletions backend/scripts/seed_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import argparse
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent))

from database import SessionLocal
from repositories.chunk_repository import ChunkRepository
from services.ingest import ingest_pdf


def main() -> None:
    """Re-seed the RAG knowledge base: wipe all stored chunks, then ingest
    every PDF under ``backend/data/raw/``.

    Command line:
        --strategy  Chunking strategy forwarded to ``ingest_pdf``
                    (default: "fixed").
    """
    # Parse CLI arguments before touching the database so that a bad
    # invocation (or ``--help``, which makes argparse exit) never opens a
    # session that would only be cleaned up at interpreter teardown.
    parser = argparse.ArgumentParser(description="Seed the knowledge base from raw PDFs.")
    parser.add_argument("--strategy", type=str, default="fixed")
    args = parser.parse_args()

    db_session = SessionLocal()
    try:
        chunk_repo = ChunkRepository()
        # Full wipe: every run replaces the entire chunk store.
        chunk_repo.clear_chunks(db_session)

        # NOTE(review): path is relative to the current working directory, so
        # this script must be launched from the repository root.
        for pdf_path in Path("backend/data/raw/").glob("*.pdf"):
            source = pdf_path.name
            with open(pdf_path, "rb") as file:
                chunks = ingest_pdf(file, source, args.strategy, model_type="all-MiniLM-L6-v2")
            chunk_repo.create_chunks(db_session, source, chunks)
    finally:
        db_session.close()


if __name__ == "__main__":
    main()
7 changes: 3 additions & 4 deletions backend/services/RAGTaggingService.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
from typing import Optional

import google.generativeai as genai
from google import genai
from sqlalchemy.orm import Session

from schemas.tag import (
Expand All @@ -25,8 +25,7 @@ class RAGTaggingService:
"""

def __init__(self):
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
self.llm = genai.GenerativeModel("gemini-1.5-flash")
self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def suggest(
self,
Expand Down Expand Up @@ -86,7 +85,7 @@ def suggest(

# 3. Call LLM
try:
response = self.llm.generate_content(prompt)
response = self.client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
raw_text = response.text.strip()

# Strip markdown code fences if the model wraps the JSON
Expand Down
26 changes: 19 additions & 7 deletions backend/services/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,29 @@
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embed(texts: list[str], model_type: str = "all-MiniLM-L6-v2") -> list[list[float]]:
    """Embed a list of strings with the selected embedding model.

    Args:
        texts: Strings to embed.
        model_type: "all-MiniLM-L6-v2" (local SentenceTransformer, default)
            or one of the OpenAI models "text-embedding-ada-002" /
            "text-embedding-3-large".

    Returns:
        One embedding vector (list of floats) per input string.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    if model_type == "all-MiniLM-L6-v2":
        # `model` is the module-level SentenceTransformer instance.
        embeddings = model.encode(texts, convert_to_numpy=True)
        return embeddings.tolist()

    if model_type in ("text-embedding-ada-002", "text-embedding-3-large"):  # pragma: no cover
        # NOTE(review): `openai.Embedding.create` is the legacy (<1.0) SDK
        # surface; openai>=1.0 requires `OpenAI().embeddings.create(...)`.
        # `openai` is also not declared in pyproject.toml — confirm this
        # path is actually exercised before relying on it.
        import openai

        response = openai.Embedding.create(input=texts, model=model_type)
        return [item["embedding"] for item in response["data"]]

    raise ValueError(f"Embedding Model not known: {model_type!r}")


def ingest_pdf(
file, source: str, strategy: str = "semantic", model_type: str = "all-MiniLM-L6-v2"
) -> list[dict[str, Any]]:
"""Parse, chunk, and embed a PDF. Returns chunk dicts ready to persist."""
full_text = convert(file)
chunks = chunk_text(full_text)
vectors = embed(chunks)
chunks = chunk_text(full_text, strategy=strategy)
vectors = embed(chunks, model_type)

return [
{
Expand Down
31 changes: 26 additions & 5 deletions backend/services/pdfconvert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re

from pypdf import PdfReader
from semantic_text_splitter import TextSplitter


def convert(data) -> str:
Expand All @@ -12,13 +15,31 @@ def convert(data) -> str:
return "\n".join(pages_text)


def chunk_text(
    text: str,
    strategy: str = "fixed",
    size: int = 512,
    overlap: int = 50,
    sentence_group: int = 3,
) -> list[str]:
    """Split text into chunks using the selected strategy.

    Args:
        text: Full document text.
        strategy: "fixed" (overlapping word windows), "semantic"
            (token-budgeted splitting via semantic-text-splitter), or
            "sentence" (groups of ``sentence_group`` sentences).
        size: Window size in words ("fixed") or max tokens ("semantic").
        overlap: Word overlap between consecutive "fixed" chunks.
        sentence_group: Sentences per chunk for the "sentence" strategy.

    Returns:
        List of non-empty chunk strings.

    Raises:
        ValueError: If ``strategy`` is not a supported name (previously an
            unknown strategy silently produced an empty list, which hides
            typos in ``--strategy``).
    """
    chunks: list[str] = []

    if strategy == "fixed":
        words = text.split()
        step = size - overlap
        for i in range(0, len(words), step):
            chunk = " ".join(words[i : i + size])
            if chunk:
                chunks.append(chunk)

    elif strategy == "semantic":
        # Token-aware splitting; token counting matches gpt-3.5-turbo's
        # tiktoken encoding.
        splitter = TextSplitter.from_tiktoken_model("gpt-3.5-turbo", size)
        chunks = splitter.chunks(text)

    elif strategy == "sentence":  # pragma: no cover
        # Naive sentence boundary: split after '.', '!' or '?' followed by
        # one or more spaces.
        sentences = [s.strip() for s in re.split(r"(?<=[.!?]) +", text) if s.strip()]
        for i in range(0, len(sentences), sentence_group):
            chunks.append(" ".join(sentences[i : i + sentence_group]))

    else:
        raise ValueError(f"Unknown chunking strategy: {strategy!r}")

    return chunks
27 changes: 27 additions & 0 deletions backend/tests/integration/test_chunk_embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pathlib import Path

from sqlalchemy.orm import Session

from repositories.chunk_repository import ChunkRepository
from services.ingest import ingest_pdf

RAW_PDF_DIR = Path("backend/data/raw/")
CHUNKING_STRATEGIES = ["fixed", "semantic", "sentence"]
EMBEDDING_MODELS = ["all-MiniLM-L6-v2", "text-embedding-ada-002", "text-embedding-3-large"]


def seed_experiments(dbSession: Session) -> None:
    """Seed the chunk store with every (chunking strategy, embedding model)
    combination over all PDFs in ``backend/data/raw/``, for comparison runs.

    NOTE(review): this lives under tests/ but has no ``test_`` prefix, so
    pytest will not collect it — presumably it is meant to be invoked
    manually. Confirm.
    """
    chunkRepo = ChunkRepository()

    for strategy in CHUNKING_STRATEGIES:
        for model_type in EMBEDDING_MODELS:
            print(f"\nSeeding chunks for strategy='{strategy}', model='{model_type}'")

            # Clear previous chunks for this new combination
            # NOTE(review): chunks below are ingested with source=pdf_path.stem,
            # not f"{strategy}_{model_type}", so this clear likely matches
            # nothing — verify against ChunkRepository.clear_chunks.
            chunkRepo.clear_chunks(dbSession, source=f"{strategy}_{model_type}")

            for pdf_path in RAW_PDF_DIR.glob("*.pdf"):
                source = pdf_path.stem
                with open(pdf_path, "rb") as file:
                    chunks_to_store = ingest_pdf(file, source=source, strategy=strategy, model_type=model_type)
                    # NOTE(review): seed_knowledge.py calls
                    # create_chunks(session, source, chunks) with three args —
                    # one of the two call sites disagrees with the repository
                    # signature; confirm which.
                    chunkRepo.create_chunks(dbSession, chunks_to_store)
42 changes: 21 additions & 21 deletions backend/tests/integration/test_rag_tagging_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ class TestRAGTaggingService:
@patch("services.RAGTaggingService.genai")
def test_suggest_with_ingredients_returns_structured_response(self, mock_genai, db_session):
"""Given a food with populated ingredients, returns non-empty suggested_ingredients with buckets."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "wheat flour", "buckets": ["gluten", "wheat"]}],
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "wheat flour", "buckets": ["gluten", "wheat"]}],
"suggested_buckets": [{"name": "gluten", "description": "contains wheat flour"}]}"""
)

Expand All @@ -33,10 +33,10 @@ def test_suggest_with_ingredients_returns_structured_response(self, mock_genai,
@patch("services.RAGTaggingService.genai")
def test_suggest_with_no_ingredients_returns_suggestions_from_food_name(self, mock_genai, db_session):
"""Given a food with ingredients=None, returns conservative suggestions without hallucinating."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],
"suggested_buckets": [{"name": "dairy", "description": "ice cream typically contains milk"}]}"""
)

Expand All @@ -54,10 +54,10 @@ def test_suggest_does_not_persist_to_db(self, mock_genai, db_session):
"""Suggestions must never be saved — no new rows in tags or food_tags tables."""
from models.tag import FoodTag, Tag

mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients":
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(
text="""{"suggested_ingredients":
[{"name": "cheese", "buckets": ["dairy"]}], "suggested_buckets":
[{"name": "dairy", "description": "contains cheese"}]}"""
)
Expand All @@ -74,9 +74,9 @@ def test_suggest_does_not_persist_to_db(self, mock_genai, db_session):
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_llm_json_parse_failure_gracefully(self, mock_genai, db_session):
"""If LLM returns malformed JSON, returns empty lists rather than crashing."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.return_value = MagicMock(text="this is not json")
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.return_value = MagicMock(text="this is not json")

service = RAGTaggingService()
result = service.suggest(db_session, "mystery food", ["unknown"])
Expand All @@ -88,9 +88,9 @@ def test_suggest_handles_llm_json_parse_failure_gracefully(self, mock_genai, db_
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_llm_call_failure_gracefully(self, mock_genai, db_session):
"""If the LLM call itself throws, returns empty lists rather than crashing."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_llm.generate_content.side_effect = Exception("API unavailable")
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
mock_client.models.generate_content.side_effect = Exception("API unavailable")

service = RAGTaggingService()
result = service.suggest(db_session, "some food", ["ingredient"])
Expand All @@ -102,15 +102,15 @@ def test_suggest_handles_llm_call_failure_gracefully(self, mock_genai, db_sessio
@patch("services.RAGTaggingService.genai")
def test_suggest_handles_markdown_wrapped_json(self, mock_genai, db_session):
"""If LLM wraps JSON in markdown code fences, it should still parse correctly."""
mock_llm = MagicMock()
mock_genai.GenerativeModel.return_value = mock_llm
mock_client = MagicMock()
mock_genai.Client.return_value = mock_client
json_text = (
"```json\n"
'{"suggested_ingredients": [{"name": "milk", "buckets": ["dairy"]}],'
' "suggested_buckets": [{"name": "dairy", "description": "contains milk"}]}'
"\n```"
)
mock_llm.generate_content.return_value = MagicMock(text=json_text)
mock_client.models.generate_content.return_value = MagicMock(text=json_text)

service = RAGTaggingService()
result = service.suggest(db_session, "latte", ["milk", "espresso"])
Expand Down
Loading
Loading