From d789f1bd315e21d9ae5fe89bc873e7f804638826 Mon Sep 17 00:00:00 2001 From: azambrano Date: Wed, 26 Nov 2025 13:53:02 +0100 Subject: [PATCH] RAG build parallelization, Mongo connection, alternative UI and knowledge domain management --- .env.example | 20 + .gitignore | 13 + Dockerfile | 37 + README.md | 534 ++++++++++-- document_manager_ui.py | 777 ++++++++++++++++++ energy_bench.py | 9 +- kb_creator.py | 12 +- kb_creator_Europe.py | 8 +- kb_creator_Italy.py | 8 +- kb_creator_Switzerland.py | 8 +- .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../files_Europe/chroma_db/chroma.sqlite3 | Bin .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../files_Europe/graph_documents.joblib | Bin .../files_Europe/preprocessed_chunks.joblib | Bin .../{ => kbs}/files_Europe/raw_docs.joblib | Bin .../{ => kbs}/files_Europe/rdf_graph.ttl | 0 .../data_level0.bin | Bin .../header.bin | Bin .../index_metadata.pickle | Bin .../length.bin | Bin .../link_lists.bin | Bin .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../files_Generic/chroma_db/chroma.sqlite3 | Bin .../files_Generic/graph_documents.joblib | Bin .../files_Generic/preprocessed_chunks.joblib | Bin .../{ => kbs}/files_Generic/raw_docs.joblib | Bin .../{ => kbs}/files_Generic/rdf_graph.ttl | 0 .../data_level0.bin | Bin .../header.bin | Bin .../index_metadata.pickle | Bin .../length.bin | Bin .../link_lists.bin | Bin .../data_level0.bin | Bin .../header.bin | Bin .../index_metadata.pickle | Bin .../length.bin | Bin .../link_lists.bin | Bin .../data_level0.bin | Bin .../header.bin | Bin .../index_metadata.pickle | Bin .../length.bin | Bin .../link_lists.bin | Bin .../files_Italy/chroma_db/chroma.sqlite3 | Bin .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../files_Italy/graph_documents.joblib | Bin .../files_Italy/preprocessed_chunks.joblib | Bin .../{ => kbs}/files_Italy/raw_docs.joblib | Bin .../{ => kbs}/files_Italy/rdf_graph.ttl | 0 .../data_level0.bin | Bin .../header.bin | Bin .../index_metadata.pickle | Bin .../length.bin | Bin .../link_lists.bin | Bin .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../chroma_db/chroma.sqlite3 | Bin .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../data_level0.bin | Bin .../header.bin | Bin .../length.bin | Bin .../link_lists.bin | 0 .../files_Switzerland/graph_documents.joblib | Bin .../preprocessed_chunks.joblib | Bin .../files_Switzerland/raw_docs.joblib | Bin .../{ => kbs}/files_Switzerland/rdf_graph.ttl | 0 knowledge_base/knowledge_extractor.py | 539 +++++++++--- knowledge_base/knowledge_manager.py | 22 +- knowledge_base/utils/graph_helpers.py | 1 - llm/__init__.py | 2 +- llm/langchain.py | 101 ++- requirements.txt | 3 + run_document_manager.sh | 3 + src/__init__.py | 3 + src/chat_manager.py | 70 ++ src/database.py | 32 + src/document_manager.py | 143 ++++ src/domain_manager.py | 34 + src/kb_builder.py | 428 ++++++++++ src/orchestrator_manager.py | 34 + src/utils.py | 8 + streamlit_ui.py | 
42 +- 115 files changed, 2701 insertions(+), 190 deletions(-) create mode 100644 .env.example create mode 100644 Dockerfile create mode 100644 document_manager_ui.py rename knowledge_base/{ => kbs}/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/header.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/length.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/header.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/length.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/header.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/length.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/chroma.sqlite3 (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/header.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/length.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Europe/graph_documents.joblib (100%) rename knowledge_base/{ => kbs}/files_Europe/preprocessed_chunks.joblib (100%) rename knowledge_base/{ => kbs}/files_Europe/raw_docs.joblib (100%) rename knowledge_base/{ => kbs}/files_Europe/rdf_graph.ttl (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/header.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/index_metadata.pickle (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/length.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/header.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/length.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/data_level0.bin (100%) rename knowledge_base/{ => 
kbs}/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/header.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/length.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/header.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/length.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Generic/chroma_db/chroma.sqlite3 (100%) rename knowledge_base/{ => kbs}/files_Generic/graph_documents.joblib (100%) rename knowledge_base/{ => kbs}/files_Generic/preprocessed_chunks.joblib (100%) rename knowledge_base/{ => kbs}/files_Generic/raw_docs.joblib (100%) rename knowledge_base/{ => kbs}/files_Generic/rdf_graph.ttl (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/header.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/index_metadata.pickle (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/length.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/header.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/index_metadata.pickle (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/length.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/header.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/index_metadata.pickle (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/length.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/chroma.sqlite3 (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/header.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/length.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Italy/graph_documents.joblib (100%) rename knowledge_base/{ => kbs}/files_Italy/preprocessed_chunks.joblib (100%) rename knowledge_base/{ => 
kbs}/files_Italy/raw_docs.joblib (100%) rename knowledge_base/{ => kbs}/files_Italy/rdf_graph.ttl (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/header.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/index_metadata.pickle (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/length.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/header.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/length.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/chroma.sqlite3 (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/header.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/length.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/data_level0.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/header.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/length.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/link_lists.bin (100%) rename knowledge_base/{ => kbs}/files_Switzerland/graph_documents.joblib (100%) rename knowledge_base/{ => kbs}/files_Switzerland/preprocessed_chunks.joblib (100%) rename knowledge_base/{ => kbs}/files_Switzerland/raw_docs.joblib (100%) rename knowledge_base/{ => kbs}/files_Switzerland/rdf_graph.ttl (100%) create mode 100755 run_document_manager.sh create mode 100644 src/__init__.py create mode 100644 src/chat_manager.py create mode 100644 src/database.py create mode 100644 src/document_manager.py create mode 100644 src/domain_manager.py create mode 100644 src/kb_builder.py create mode 100644 src/orchestrator_manager.py create mode 100644 src/utils.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..fca89a7 --- /dev/null +++ b/.env.example @@ -0,0 +1,20 @@ +# LLM Configuration +# Set to "true" for local LLM (e.g., Ollama), "false" for remote LLM (e.g., OpenAI) +LLM_LOCAL=false + +# Base URL for local LLM API (only used when LLM_LOCAL=true) +LLM_BASE_URL=http://llmserver:11434/ + +# API Keys for different LLM providers +OPENAI_API_KEY=your-openai-api-key-here +OLLAMA_API_KEY= +ANTHROPIC_API_KEY= +DEEPSEEK_API_KEY= + +# MongoDB Configuration +MONGODB_URI=mongodb://localhost:27017/ +MONGODB_DATABASE=energenius +MONGODB_COLLECTION=documents +# Application Settings +STREAMLIT_SERVER_PORT=8501 
+STREAMLIT_SERVER_ADDRESS=0.0.0.0 diff --git a/.gitignore b/.gitignore index e5bb792..2298f5c 100644 --- a/.gitignore +++ b/.gitignore @@ -176,5 +176,18 @@ private_settings.py # Mac .DS_Store +# Temporary files +*.tmp +*.bak +*.log # Traing data /knowledge_base/files/ + + +*/.streamlit/ +*/.vscode/ +*/prompt.txt + +# Project-specific CI/CD and analysis files +sonar-project.properties +.gitlab-ci.yml \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9c6bbf4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.13-slim + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements files +COPY requirements.txt /app/ + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . /app/ + +# Create directories for knowledge base storage +RUN mkdir -p /app/knowledge_base/temp_pdfs + +# Expose Streamlit default port +EXPOSE 8501 + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV STREAMLIT_SERVER_PORT=8501 +ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl --fail http://localhost:8501/_stcore/health || exit 1 + +# Run the Streamlit app +CMD ["streamlit", "run", "document_manager_ui.py", "--server.port=8501", "--server.address=0.0.0.0"] diff --git a/README.md b/README.md index e6cc8f2..bac0408 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,511 @@ -# EnergeniusRAG +# EnergeniusRAG Platform -# Docs and Diagram +[![Python Version](https://img.shields.io/badge/python-3.13-blue.svg)](https://www.python.org/downloads/) +[![Streamlit](https://img.shields.io/badge/streamlit-1.45.0-FF4B4B.svg)](https://streamlit.io) +[![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) -The folder `docs` is prepared to include relevant documentation and diagrams for the project. +A comprehensive Retrieval-Augmented Generation (RAG) platform for energy efficiency consultation. EnergeniusRAG combines a document management system with an AI-powered conversational agent (GURU) to provide personalized energy efficiency recommendations. -Right now it contains the current _architecture diagram_. +## πŸ“‹ Table of Contents -## Setup +- [Overview](#overview) +- [Features](#features) +- [Architecture](#architecture) +- [Installation](#installation) +- [Configuration](#configuration) +- [Usage](#usage) +- [Knowledge Base Creation](#knowledge-base-creation) +- [API & Integration](#api--integration) +- [Development](#development) +- [Docker Deployment](#docker-deployment) +- [Troubleshooting](#troubleshooting) +- [Citation](#citation) +- [License](#license) -Install [conda](https://docs.conda.io/projects/conda/en/23.1.x/user-guide/install/) and [homebrew](https://brew.sh/) if needed. +## 🎯 Overview -_Note to self_: conda is only needed to use the same version of python as the server. Is that necessary? +GURU is an advanced platform designed to enhance human-AI collaboration for energy efficiency consulting. 
It combines: -create a conda environment using +- **Document Management**: Upload, organize, and manage energy-related documents (PDFs, URLs) by domain +- **Knowledge Base Creation**: Automatically build vector databases from documents using RAG techniques +- **AI Conversational Agent (GURU)**: Chat with an intelligent assistant powered by LLMs and your knowledge base +- **Multi-Provider Support**: Works with both OpenAI and local Ollama models +- **Domain-Based Organization**: Separate knowledge bases for different regions or topics (Europe, Italy, Switzerland, etc.) -```shell -conda create -n energenius python=3.13 +## ✨ Features + +### Document Manager +- πŸ“€ **Upload PDFs and URLs**: Add documents from multiple sources +- πŸ—‚οΈ **Domain Management**: Organize documents into logical domains (regions, topics) +- 🏷️ **Tagging System**: Tag and categorize documents for easy retrieval +- πŸ” **Search & Filter**: Find documents by name, type, or tags +- πŸ‘οΈ **PDF Preview**: In-browser PDF viewer +- ⚠️ **Duplicate Detection**: Prevents uploading duplicate files (by name or content) +- πŸ“Š **Statistics Dashboard**: View document counts and domain statistics + +### Knowledge Base Builder +- 🧠 **Automated KB Creation**: Build vector databases from domain documents +- ⚑ **Parallel Processing**: Optional multi-threaded processing for faster KB creation +- πŸ”„ **Rebuild & Update**: Refresh knowledge bases when documents change +- πŸ“ˆ **Progress Tracking**: Real-time logs and status updates +- 🎯 **Multi-Provider Support**: Compatible with OpenAI and Ollama embeddings +- πŸ’Ύ **Persistent Storage**: Knowledge bases stored in ChromaDB with RDF graph support + +### Energenius GURU (Chat Interface) +- πŸ’¬ **Conversational AI**: Natural language interaction with energy efficiency expert +- 🌐 **Multi-Language Support**: English and Spanish +- πŸ”§ **Configurable Models**: Choose from various LLM models and embeddings +- 🎚️ **Temperature Control**: Adjust response creativity +- πŸ“š **Knowledge-Enhanced Responses**: Leverage domain-specific knowledge bases +- πŸ’Ύ **Chat History**: Download and upload conversation histories +- πŸ”„ **Session Management**: Clear and restart conversations + +## πŸ—οΈ Architecture + +``` +EnergeniusRAG-shared/ +β”œβ”€β”€ document_manager_ui.py # Main Streamlit application +β”œβ”€β”€ src/ # Core application modules +β”‚ β”œβ”€β”€ database.py # MongoDB connection +β”‚ β”œβ”€β”€ domain_manager.py # Domain CRUD operations +β”‚ β”œβ”€β”€ document_manager.py # Document upload/management +β”‚ β”œβ”€β”€ kb_builder.py # Knowledge base creation +β”‚ β”œβ”€β”€ chat_manager.py # Chat session management +β”‚ └── orchestrator_manager.py # LLM orchestrator wrapper +β”œβ”€β”€ knowledge_base/ # KB extraction & management +β”‚ β”œβ”€β”€ knowledge_extractor.py +β”‚ β”œβ”€β”€ knowledge_manager.py +β”‚ └── kbs/ # Knowledge base storage +β”œβ”€β”€ orchestrator/ # Chat orchestration +β”œβ”€β”€ abstract_orchestrator.py +β”œβ”€β”€ live_orchestrator.py +β”œβ”€β”€ guru.py +β”œβ”€β”€ llm/ # LLM provider interfaces +β”œβ”€β”€ benchmark/ # Evaluation tools +β”œβ”€β”€ requirements.txt # Root dependencies +└── Dockerfile # Docker containerization +``` + +### Technology Stack + +- **Frontend**: Streamlit 1.45.0 +- **Database**: MongoDB (document storage) +- **Vector Store**: ChromaDB (embeddings) +- **LLM Providers**: OpenAI, Ollama +- **Knowledge Graph**: RDFLib +- **Document Processing**: pdfminer.six, pypdf +- **Web Scraping**: BeautifulSoup4, html2text +- **Orchestration**: LangChain 0.3.25 
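+For reference, below is a minimal sketch of how `src/database.py` could expose the MongoDB connection used by the other modules. It assumes `pymongo`; the helper names `get_mongodb_client` and `get_collection` match the imports used by `document_manager_ui.py`, and the defaults mirror `.env.example`, but the actual implementation in this patch may differ.
+
+```python
+"""Illustrative sketch only -- not necessarily the shipped src/database.py."""
+import os
+
+from dotenv import load_dotenv
+from pymongo import MongoClient
+
+load_dotenv()
+
+_client = None  # cache a single client per process
+
+
+def get_mongodb_client() -> MongoClient:
+    """Return a cached MongoClient built from MONGODB_URI."""
+    global _client
+    if _client is None:
+        _client = MongoClient(os.getenv("MONGODB_URI", "mongodb://localhost:27017/"))
+    return _client
+
+
+def get_collection():
+    """Return the documents collection named by MONGODB_DATABASE / MONGODB_COLLECTION."""
+    db = get_mongodb_client()[os.getenv("MONGODB_DATABASE", "energenius")]
+    return db[os.getenv("MONGODB_COLLECTION", "documents")]
+```
+
+With a helper like this, the UI layer can simply call `get_collection().find({"type": "document", "domain": "Italy"})` to list a domain's documents, keeping all connection details in one place.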
+ +## πŸš€ Installation + +### Prerequisites + +- Python 3.13 +- MongoDB (local or remote instance) +- Ollama (optional, for local LLM models) +- Conda (recommended for environment management) + +### Step 1: Clone Repository + +```bash +git clone https://github.com/DataSciencePolimi/EnergeniusRAG-shared.git +cd EnergeniusRAG-shared ``` -activate it using +### Step 2: Create Python Environment -```shell +Using Conda (recommended): + +```bash +conda create -n energenius python=3.13 conda activate energenius ``` -install the pip packages from the _requirements_ in the env +Or using venv: + +```bash +python3.13 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +``` + +### Step 3: Install Dependencies -```shell +```bash +# Install root dependencies pip install -r requirements.txt + +``` + +### Step 4: Setup MongoDB + +**Local MongoDB**: +```bash +# Install MongoDB (Ubuntu/Debian) +sudo apt-get install -y mongodb + +# Start MongoDB service +sudo systemctl start mongodb +``` + +**Or use MongoDB Atlas** (cloud): +- Sign up at [MongoDB Atlas](https://www.mongodb.com/cloud/atlas) +- Create a cluster and get your connection string + +### Step 5: Configure Environment + +Create a `.env` file in the root directory: + +```bash +# MongoDB Configuration +MONGODB_URI=mongodb://localhost:27017 +MONGODB_DATABASE=energenius +MONGODB_COLLECTION=documents + +# OpenAI Configuration (optional) +OPENAI_API_KEY=your_openai_api_key_here +OPENAI_BASE_URL=https://api.openai.com/v1 + +# Ollama Configuration (optional, for local models) +OLLAMA_BASE_URL=http://localhost:11434 +``` + +### Step 6: Setup Ollama (Optional) + +For local LLM models: + +```bash +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh + +# Pull recommended models +ollama pull llama3.2 +ollama pull mistral +ollama pull mxbai-embed-large +ollama pull nomic-embed-text +``` + +## βš™οΈ Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `MONGODB_URI` | MongoDB connection string | `mongodb://localhost:27017` | +| `MONGODB_DATABASE` | Database name | `energenius` | +| `MONGODB_COLLECTION` | Collection name | `documents` | +| `OPENAI_API_KEY` | OpenAI API key | - | +| `OPENAI_BASE_URL` | OpenAI API endpoint | `https://api.openai.com/v1` | +| `OLLAMA_BASE_URL` | Ollama server URL | `http://localhost:11434` | + +### Recommended Models + +**LLMs**: +- OpenAI: `gpt-4.1-nano`, `gpt-5-nano`, `gpt-4o-mini-2024-07-18` +- Ollama: `llama3.2`, `mistral`, `gpt-oss` + +**Embeddings**: +- OpenAI: `text-embedding-3-small`, `text-embedding-3-large` +- Ollama: `mxbai-embed-large`, `nomic-embed-text` + +## πŸ“– Usage + +### Starting the Application + +```bash +# Using the shell script +./run_document_manager.sh + +# Or directly with Streamlit +streamlit run document_manager_ui.py --server.port 8501 +``` + +Access the application at: `http://localhost:8501` + +### 1. Document Management + +#### Create a Domain +1. Navigate to **Document Manager** tab +2. In the sidebar, expand "βž• Create New Domain" +3. Enter domain name (e.g., "Italy", "Europe", "Switzerland") +4. Add optional description +5. Click "Create Domain" + +#### Upload Documents +1. Select **Upload Documents** tab +2. Choose your domain from dropdown +3. Upload PDF files or add URLs: + - **PDF**: Select file(s), add description/tags, click "Upload" + - **URL**: Enter URL, title, description, and tags, click "Add URL" + +#### Manage Documents +1. Select **Manage Documents** tab +2. 
Choose domain to view +3. Use search and filters to find documents +4. Actions available: + - πŸ‘οΈ Preview PDFs + - ⬇️ Download PDFs + - πŸ—‘οΈ Delete documents + +### 2. Knowledge Base Creation + +#### Build a Knowledge Base +1. Navigate to **Knowledge Base** tab +2. Select domain +3. Configure provider and models: + - **Provider**: OpenAI or Ollama + - **Model**: Choose LLM model + - **Embedding**: Select embedding model +4. Optionally configure performance settings: + - Enable parallel processing (recommended) + - Adjust max workers (4-8 recommended) +5. Click "πŸš€ Create Knowledge Base" + +#### Monitor Progress +- Real-time logs show progress +- Process runs in background (you can switch tabs) +- Check status anytime in the Knowledge Base tab + +#### Rebuild/Delete KB +- **Rebuild**: Updates KB with latest documents +- **Delete**: Removes KB but keeps documents + +### 3. Chat with GURU + +#### Start a Conversation +1. Navigate to **GURU Chat** tab +2. Configure in sidebar: + - **Domain**: Select knowledge base to use + - **Provider**: OpenAI or Ollama + - **Model**: Choose LLM + - **Language**: English or EspaΓ±ol + - **Temperature**: Adjust creativity (0.0-1.0) + - **Use Knowledge Base**: Toggle to use domain KB +3. Type your question in chat input +4. Receive streaming AI responses + +#### Chat Management +- **Clear Chat**: Reset conversation +- **Download Chat**: Save history as JSON +- **Upload Chat**: Continue previous conversation + +## 🧠 Knowledge Base Creation + +### Process Overview + +1. **Document Collection**: Fetches all documents from selected domain +2. **URL Processing**: Downloads and extracts text from web pages +3. **PDF Processing**: Extracts text from PDF files +4. **Text Chunking**: Splits content into manageable chunks +5. **Embedding Generation**: Creates vector embeddings +6. **Vector Storage**: Stores in ChromaDB +7. **Graph Creation**: Builds RDF knowledge graph + +### Performance Optimization + +**Parallel Processing** (Recommended): +- Processes multiple documents simultaneously +- 4-8 workers optimal for most systems +- Significantly faster KB creation + +**Sequential Processing**: +- Lower resource usage +- Better for limited hardware +- More stable for very large documents + +### Storage Structure + +``` +knowledge_base/kbs/ +└── files_{domain}/ + β”œβ”€β”€ raw_docs.joblib # Original documents + β”œβ”€β”€ preprocessed_chunks.joblib # Text chunks + β”œβ”€β”€ graph_documents.joblib # Graph structure + β”œβ”€β”€ rdf_graph.ttl # RDF knowledge graph + └── chroma_db/ # Vector embeddings + └── ... ``` -## Private Settings +## πŸ”Œ API & Integration -In order to run the server, you need to create a file called private_settings.py in the same directory as settings.py. 
This file should contain the following variables: +### MongoDB Schema -```python -PRIVATE_SETINGS = { - "LLM_LOCAL": True, # Set to True if you are using a local LLM or False if you are using a remote LLM - "LLM_KEY": { - "openai": "" # OpenAI API key - "ollama": "", # ollama API key - "anthropic": "", # Anthropic API key - "deepseek": "", # DeepSeeker API key - }, - "LLM_BASE_URL": "", # Base URL for the LLM local API +**Domain Document**: +```json +{ + "type": "domain", + "name": "Italy", + "description": "Italian energy regulations", + "created_at": "2025-01-01T12:00:00" } ``` -You can use standard urls for local deployment: +**Document Entry**: +```json +{ + "type": "document", + "domain": "Italy", + "doc_type": "pdf", // or "url" + "filename": "energy_guide.pdf", + "content": "", // For PDFs + "url": "https://...", // For URLs + "size": 1048576, + "description": "Energy efficiency guide", + "tags": ["solar", "residential"], + "uploaded_at": "2025-01-01T12:00:00" +} +``` + +**Knowledge Base Metadata**: +```json +{ + "type": "knowledge_base", + "domain": "Italy", + "folder": "files_Italy", + "provider": "ollama", + "model": "llama3.2", + "embedding": "mxbai-embed-large", + "status": "completed", // or "creating", "error" + "document_count": 15, + "pdf_count": 10, + "url_count": 5, + "created_at": "2025-01-01T12:00:00", + "completed_at": "2025-01-01T12:30:00", + "logs": ["Started...", "Processing..."], + "error_message": null +} +``` +## 🐳 Docker Deployment + +### Build Image + +```bash +docker build -t energenius-rag . +``` + +### Run Container + +```bash +docker run -d \ + -p 8501:8501 \ + -e MONGODB_URI=mongodb://host.docker.internal:27017 \ + -e OPENAI_API_KEY=your_key_here \ + -v $(pwd)/knowledge_base/kbs:/app/knowledge_base/kbs \ + --name energenius \ + energenius-rag +``` + +### Docker Compose -- Ollama: `"LLM_BASE_URL": "http://localhost:11434"` -- LM Studio: `"LLM_BASE_URL": "http://localhost:1234/v1"` +Create `docker-compose.yml`: -## Local LLMs +```yaml +version: '3.8' -Right now, test locally with [Ollama](https://ollama.com/) +services: + mongodb: + image: mongo:latest + ports: + - "27017:27017" + volumes: + - mongodb_data:/data/db -Models tried: + energenius: + build: . 
+ ports: + - "8501:8501" + environment: + - MONGODB_URI=mongodb://mongodb:27017 + - MONGODB_DATABASE=energenius + - OPENAI_API_KEY=${OPENAI_API_KEY} + volumes: + - ./knowledge_base/kbs:/app/knowledge_base/kbs + depends_on: + - mongodb -- gpt-oss -- llama3.2 -- mistral +volumes: + mongodb_data: +``` + +Run with: +```bash +docker-compose up -d +``` -Embeddings: +## πŸ”§ Troubleshooting -- mxbai-embed-large -- nomic-embed-text +### Common Issues -In order to run Ollama, launch the Ollama server in a separate terminal: +**MongoDB Connection Failed** +```bash +# Check if MongoDB is running +sudo systemctl status mongodb -```shell -ollama run gpt-oss #llama3.2 or mistral +# Test connection +mongosh mongodb://localhost:27017 ``` -## UI +**Ollama Models Not Loading** +```bash +# Check Ollama service +ollama list -To run the UI +# Restart Ollama +systemctl restart ollama -```shell -streamlit run streamlit_ui.py +# Pull model again +ollama pull llama3.2 ``` -# Citation +**Knowledge Base Creation Stuck** +- Check logs in the UI +- Ensure no other process is using ChromaDB +- Try rebuilding with parallel processing disabled +- Check available disk space +**PDF Upload Fails** +- Verify PDF is not corrupted +- Check file size (very large files may timeout) +- Ensure MongoDB has sufficient storage + +**Chat Not Using Knowledge Base** +- Verify KB status is "completed" in Knowledge Base tab +- Check embedding model matches KB embedding +- Ensure "Use Knowledge Base" toggle is enabled + +### Debug Mode + +Enable detailed logging: + +```bash +export STREAMLIT_LOG_LEVEL=debug +streamlit run document_manager_ui.py ``` + +### Performance Tuning + +**For Large Knowledge Bases**: +- Increase parallel workers: 6-8 +- Use faster embedding models +- Consider SSD storage for ChromaDB + +**For Limited Resources**: +- Disable parallel processing +- Reduce max workers to 2-3 +- Use smaller embedding models + +## πŸ“š Documentation + +Additional documentation available in: +- `README.md` - Components overview +- `README2.md` - Extended documentation +- `docs/` - Architecture diagrams and specifications + +## πŸ“ Citation + +If you use GURU in your research, please cite: + +```bibtex @article{Campi_Giudici_Pinciroli_Vago_Brambilla_Fraternali_2025, title={Enhancing Human-AI Collaboration through a Conversational Agent for Energy Efficiency}, author={Campi, Riccardo and Giudici, Mathyas and Pinciroli Vago, NicolΓ² Oreste and Brambilla, Marco and Fraternali, Piero}, @@ -96,3 +518,23 @@ streamlit run streamlit_ui.py DOI={10.1609/aaaiss.v5i1.35554} } ``` + +## πŸ“„ License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + +## 🀝 Contributing + +Contributions are welcome! Please: + +1. Fork the repository +2. Create a feature branch (`git checkout -b feature/amazing-feature`) +3. Commit your changes (`git commit -m 'Add amazing feature'`) +4. Push to the branch (`git push origin feature/amazing-feature`) +5. 
Open a Pull Request + +## πŸ“§ Contact + +For questions or support: +- **Repository**: [DataSciencePolimi/EnergeniusRAG-shared](https://github.com/DataSciencePolimi/EnergeniusRAG-shared) +- **Issues**: [GitHub Issues](https://github.com/DataSciencePolimi/EnergeniusRAG-shared/issues) \ No newline at end of file diff --git a/document_manager_ui.py b/document_manager_ui.py new file mode 100644 index 0000000..54a3198 --- /dev/null +++ b/document_manager_ui.py @@ -0,0 +1,777 @@ +"""Integrated UI - Document Manager and Energenius GURU.""" + +import streamlit as st +from datetime import datetime +import time + +# Import from src modules +from src.database import get_mongodb_client, get_collection +from src.domain_manager import create_domain, get_all_domains, delete_domain +from src.document_manager import ( + upload_pdf_to_mongodb, + add_url_to_mongodb, + get_documents_by_domain, + delete_document, + download_pdf_from_mongodb, + check_duplicate_pdf, + check_duplicate_url, + get_document_stats +) +from src.chat_manager import ( + initialize_chat_messages, + clear_chat_messages, + add_message, + get_chat_messages, + prepare_chat_download_data, + process_chat_upload +) +from src.orchestrator_manager import ( + cached_get_ollama_models, + cached_get_ollama_embeddings, + initialize_orchestrator +) +from src.kb_builder import ( + build_knowledge_base_for_domain, + get_knowledge_base_info, + delete_knowledge_base +) + +# Page Functions +def render_guru_page(): + """Render the Energenius GURU chat interface.""" + st.title("Energenius GURU") + + # Initialize chat messages + initialize_chat_messages() + + # Sidebar for model parameters + st.sidebar.title("Model Parameters") + + # First, get available domains and let user select + domains = get_all_domains() + domain_names = [d["name"] for d in domains] + default_domain_index = 0 + + region = st.sidebar.selectbox( + "Domain", domain_names, index=default_domain_index + ) + + # Get KB info for the selected domain to auto-select embedding model + kb_info = get_knowledge_base_info(region) + + # Provider selection + provider = st.sidebar.selectbox("Provider", ["openai", "ollama"], index=0) + + if provider == "ollama": + # Add refresh button for model list + col1, col2 = st.sidebar.columns([4, 1]) + with col1: + st.write("") # Spacing + with col2: + if st.button("πŸ”„", help="Refresh model list"): + st.cache_data.clear() + st.rerun() + + # Get available models dynamically + available_models = cached_get_ollama_models() + available_embeddings = cached_get_ollama_embeddings() + + # Try to find a default model, otherwise use the first one + default_model_index = 0 + + model = st.sidebar.selectbox("Model", available_models, index=default_model_index) + + # Auto-select embedding based on KB info if available + if kb_info and kb_info.get("embedding"): + embedding = kb_info.get("embedding") + if embedding not in available_embeddings: + st.sidebar.warning(f"⚠️ KB uses '{embedding}' which is not available") + else: + # Try to find a default embedding + if "mxbai-embed-large" in available_embeddings: + embedding = "mxbai-embed-large" + else: + embedding = available_embeddings[0] if available_embeddings else "mxbai-embed-large" + elif provider == "openai": + model = st.sidebar.selectbox("Model", ["gpt-5-nano", "gpt-4.1-nano"], index=1) + + # Auto-select embedding based on KB info if available + openai_embeddings = ["text-embedding-3-small", "text-embedding-3-large"] + if kb_info and kb_info.get("embedding"): + embedding = kb_info.get("embedding") + if embedding not in 
openai_embeddings: + st.sidebar.warning(f"⚠️ KB uses '{embedding}' which is not available for OpenAI") + embedding = openai_embeddings[0] # Fallback to default + else: + embedding = openai_embeddings[0] # Default to first embedding + else: + model = "None" + embedding = "None" + + language = st.sidebar.selectbox( + "Language", ["English", "EspaΓ±ol"], index=0 + ) + + # Fix temperature to 1.0 for gpt-5 models + if model.startswith("gpt-5"): + temperature = 1.0 + st.sidebar.slider( + "Temperature", min_value=0.0, max_value=1.0, value=1.0, step=0.1, disabled=True, + help="Temperature is fixed to 1.0 for gpt-5 models" + ) + else: + temperature = st.sidebar.slider( + "Temperature", min_value=0.0, max_value=1.0, value=0.75, step=0.1 + ) + + use_knowledge_base = st.sidebar.toggle( + "Use Knowledge Base", value=True, help="Use the knowledge base to answer questions." + ) + + # Show KB status for selected domain + if use_knowledge_base: + if kb_info: + kb_status = kb_info.get("status", "completed") + if kb_status == "completed": + st.sidebar.info(f"βœ… KB available for '{region}'") + elif kb_status == "creating": + st.sidebar.warning(f"⚠️ KB for '{region}' is still being created. Please wait or check Document Manager.") + elif kb_status == "error": + error_msg = kb_info.get("error_message", "Unknown error") + st.sidebar.error(f"❌ KB creation failed: {error_msg}") + else: + st.sidebar.info(f"βœ… KB available for '{region}'") + else: + st.sidebar.warning(f"⚠️ No KB found for '{region}'. Create one in Document Manager.") + + # Orchestrator initialization + orchestrator = initialize_orchestrator( + provider=provider, + model=model, + embedding=embedding, + language=language, + temperature=temperature, + user_type="Medium income", + house_type="Apartment", + region=region, + use_knowledge_base=use_knowledge_base, + ) + + # Support functions + st.sidebar.button( + "Clear chat", + icon=":material/delete:", + on_click=clear_chat_messages, + ) + + st.sidebar.download_button( + label="Download chat", + help="Download the chat history as a JSON file.", + icon=":material/download:", + file_name="chat.json", + mime="application/json", + data=prepare_chat_download_data(get_chat_messages()), + ) + + chat_upload = st.sidebar.file_uploader( + label="Upload chat", + help="Upload a chat file to continue from a conversation.", + type=["json"], + ) + + if chat_upload is not None: + messages, error = process_chat_upload(chat_upload) + + if error: + st.error(error) + elif messages: + # Erase the current chat and load uploaded messages + clear_chat_messages() + st.session_state["messages"] = messages + + # Call to the orchestrator to load the messages + orchestrator.load_past_messages(messages) + st.rerun() + + # Update the interface with the previous messages + for message in get_chat_messages(): + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + # Create the chat interface + if prompt := st.chat_input("Enter your query"): + # Store and display the current prompt + add_message("user", prompt) + st.chat_message("user").markdown(prompt) + + # Streaming: create an empty placeholder for assistant message + with st.chat_message("assistant"): + response_placeholder = st.empty() + full_response = "" + + # Show spinner while streaming response + with st.spinner(""): + for chunk in orchestrator.user_message(prompt): + full_response += chunk + response_placeholder.markdown(full_response) + + response_placeholder.markdown(full_response) + + # Save final assistant message to history + 
add_message("assistant", full_response) + + +def render_document_manager_page(): + """Render the Document Manager interface.""" + # Display connection status + try: + get_mongodb_client().server_info() + except Exception as e: + st.error(f"❌ Failed to connect to MongoDB: {str(e)}") + return + + # Sidebar for domain management + with st.sidebar: + st.header("πŸ—‚οΈ Domain Management") + + # Statistics button - toggle on/off + if st.button("πŸ“Š View Statistics", use_container_width=True): + st.session_state["show_stats"] = not st.session_state.get("show_stats", False) + st.rerun() + + # Show statistics in a dialog/popup + if st.session_state.get("show_stats", False): + with st.container(): + stats = get_document_stats() + st.metric("Total Domains", stats["total_domains"]) + st.metric("Total Documents", stats["total_docs"]) + st.metric("PDFs", stats["total_pdfs"]) + st.metric("URLs", stats["total_urls"]) + + st.divider() + + # Create new domain + with st.expander("βž• Create New Domain", expanded=False): + new_domain_name = st.text_input("Domain Name", key="new_domain_name") + new_domain_desc = st.text_area("Description (optional)", key="new_domain_desc") + + if st.button("Create Domain", type="primary"): + if new_domain_name: + try: + # Check if domain already exists + existing = get_collection().find_one({"type": "domain", "name": new_domain_name}) + if existing: + st.error(f"Domain '{new_domain_name}' already exists!") + else: + create_domain(new_domain_name, new_domain_desc) + st.success(f"Domain '{new_domain_name}' created successfully!") + st.rerun() + except Exception as e: + st.error(f"Error creating domain: {str(e)}") + else: + st.warning("Please enter a domain name") + + # List all domains + st.subheader("πŸ“ Existing Domains") + domains = get_all_domains() + + if not domains: + st.info("No domains found. 
Create one to get started!") + else: + for domain in domains: + doc_count = get_collection().count_documents({"type": "document", "domain": domain["name"]}) + st.write(f"**{domain['name']}** ({doc_count} docs)") + + # Main content area - tabs for different operations + tab1, tab2, tab3, tab4 = st.tabs(["πŸ“€ Upload Documents", "πŸ“‹ Manage Documents", "🧠 Knowledge Base", "πŸ—‘οΈ Delete Domain"]) + + # Tab 1: Upload Documents + with tab1: + domains = get_all_domains() + if not domains: + st.warning("⚠️ Please create a domain first before uploading documents!") + else: + domain_names = [d["name"] for d in domains] + selected_domain = st.selectbox("Select Domain", domain_names, key="upload_domain") + + doc_type = st.radio("Document Type", ["PDF File", "URL"], horizontal=True) + + if doc_type == "PDF File": + st.subheader("πŸ“„ Upload PDF") + uploaded_files = st.file_uploader( + "Choose PDF file(s)", + type=["pdf"], + accept_multiple_files=True, + key="pdf_uploader" + ) + + pdf_description = st.text_area("Description (optional)", key="pdf_desc") + pdf_tags = st.text_input("Tags (comma-separated)", key="pdf_tags") + + if st.button("Upload PDF(s)", type="primary"): + if uploaded_files: + tags_list = [tag.strip() for tag in pdf_tags.split(",")] if pdf_tags else [] + + # Check for duplicates before uploading + duplicates = [] + files_to_upload = [] + + for pdf_file in uploaded_files: + pdf_content = pdf_file.read() + pdf_file.seek(0) # Reset file pointer for actual upload + + dup_check = check_duplicate_pdf(selected_domain, pdf_file.name, pdf_content) + + if dup_check["is_duplicate"]: + duplicates.append({ + "name": pdf_file.name, + "type": dup_check["duplicate_type"], + "existing": dup_check["existing_doc"] + }) + else: + files_to_upload.append(pdf_file) + + # Show duplicate warnings + if duplicates: + st.warning(f"⚠️ Found {len(duplicates)} duplicate(s):") + for dup in duplicates: + if dup["type"] == "filename": + st.write(f"- **{dup['name']}**: File with same name already exists") + else: + st.write(f"- **{dup['name']}**: File with identical content already exists as '{dup['existing']['filename']}'") + + # Upload non-duplicate files + if files_to_upload: + progress_bar = st.progress(0) + uploaded_count = 0 + + for idx, pdf_file in enumerate(files_to_upload): + try: + upload_pdf_to_mongodb(selected_domain, pdf_file, pdf_description, tags_list) + uploaded_count += 1 + progress_bar.progress((idx + 1) / len(files_to_upload)) + except Exception as e: + st.error(f"Error uploading {pdf_file.name}: {str(e)}") + + if uploaded_count > 0: + st.success(f"βœ… Successfully uploaded {uploaded_count} PDF(s) to domain '{selected_domain}'!") + time.sleep(2) + st.rerun() + elif not files_to_upload and duplicates: + st.info("No new files to upload. 
All files are duplicates.") + else: + st.warning("Please select at least one PDF file") + + else: # URL + st.subheader("πŸ”— Add URL") + url_input = st.text_input("URL", key="url_input") + url_title = st.text_input("Title (optional)", key="url_title") + url_description = st.text_area("Description (optional)", key="url_desc") + url_tags = st.text_input("Tags (comma-separated)", key="url_tags") + + if st.button("Add URL", type="primary"): + if url_input: + # Check for duplicate URL + dup_check = check_duplicate_url(selected_domain, url_input) + + if dup_check["is_duplicate"]: + existing_doc = dup_check["existing_doc"] + st.warning(f"⚠️ This URL already exists in domain '{selected_domain}'") + st.write(f"**Title:** {existing_doc.get('title', 'N/A')}") + st.write(f"**Added:** {existing_doc.get('uploaded_at').strftime('%Y-%m-%d %H:%M')}") + st.info("URL was not added again.") + else: + try: + tags_list = [tag.strip() for tag in url_tags.split(",")] if url_tags else [] + add_url_to_mongodb(selected_domain, url_input, url_title, url_description, tags_list) + st.success(f"βœ… URL added successfully to domain '{selected_domain}'!") + time.sleep(2) + st.rerun() + except Exception as e: + st.error(f"Error adding URL: {str(e)}") + else: + st.warning("Please enter a URL") + + # Tab 2: View & Manage + with tab2: + domains = get_all_domains() + if not domains: + st.info("No domains found.") + else: + domain_names = [d["name"] for d in domains] + view_domain = st.selectbox("Select Domain to View", domain_names, key="view_domain") + + if view_domain: + documents = get_documents_by_domain(view_domain) + + if not documents: + st.info(f"No documents found in domain '{view_domain}'") + else: + st.write(f"**{len(documents)} document(s) in '{view_domain}'**") + + # Filter options + col1, col2, col3 = st.columns([2, 1, 1]) + with col1: + search_query = st.text_input("πŸ” Search documents", key="search_docs") + with col2: + filter_type = st.selectbox("Filter by type", ["All", "PDF", "URL"], key="filter_type") + with col3: + # Get all unique tags + all_tags = set() + for doc in documents: + if doc.get("tags"): + all_tags.update(doc.get("tags")) + tag_options = ["All"] + sorted(list(all_tags)) + filter_tag = st.selectbox("Filter by tag", tag_options, key="filter_tag") + + # Apply filters + filtered_docs = documents + if search_query: + filtered_docs = [ + doc for doc in filtered_docs + if search_query.lower() in doc.get("filename", "").lower() + or search_query.lower() in doc.get("title", "").lower() + or search_query.lower() in doc.get("description", "").lower() + ] + + if filter_type != "All": + filtered_docs = [doc for doc in filtered_docs if doc.get("doc_type") == filter_type.lower()] + + if filter_tag != "All": + filtered_docs = [doc for doc in filtered_docs if doc.get("tags") and filter_tag in doc.get("tags")] + + st.write(f"Showing {len(filtered_docs)} document(s)") + + # Display documents + for doc in filtered_docs: + with st.container(): + col1, col2, col3 = st.columns([5, 2, 2]) + + with col1: + if doc.get("doc_type") == "pdf": + st.write(f"πŸ“„ **{doc.get('filename')}**") + size_mb = doc.get('size', 0) / (1024 * 1024) + st.caption(f"Size: {size_mb:.2f} MB") + else: + st.write(f"πŸ”— **{doc.get('title', 'Untitled')}**") + st.caption(f"URL: {doc.get('url')}") + + if doc.get("description"): + st.write(doc.get("description")) + + if doc.get("tags"): + tags_html = " ".join([f"`{tag}`" for tag in doc.get("tags")]) + st.markdown(tags_html) + + with col2: + st.caption(f"Uploaded: 
{doc.get('uploaded_at').strftime('%Y-%m-%d %H:%M')}") + + with col3: + # For PDFs: Preview, Download, Delete buttons + if doc.get("doc_type") == "pdf": + button_col1, button_col2, button_col3 = st.columns(3) + + with button_col1: + if st.button("πŸ‘οΈ", key=f"preview_{doc['_id']}", help="Preview PDF"): + st.session_state[f"show_preview_{doc['_id']}"] = True + st.rerun() + + with button_col2: + pdf_data = download_pdf_from_mongodb(str(doc['_id'])) + if pdf_data: + st.download_button( + label="⬇️", + data=pdf_data["content"], + file_name=pdf_data["filename"], + mime="application/pdf", + key=f"download_{doc['_id']}", + help="Download PDF" + ) + + with button_col3: + if st.button("πŸ—‘οΈ", key=f"delete_{doc['_id']}", help="Delete document"): + try: + delete_document(str(doc['_id'])) + st.success("Deleted!") + st.rerun() + except Exception as e: + st.error(f"Error: {str(e)}") + else: + # For URLs: Only delete button + if st.button("πŸ—‘οΈ", key=f"delete_{doc['_id']}", help="Delete document"): + try: + delete_document(str(doc['_id'])) + st.success("Deleted!") + st.rerun() + except Exception as e: + st.error(f"Error: {str(e)}") + + st.divider() + + # Handle PDF preview dialogs + for doc in filtered_docs: + if doc.get("doc_type") == "pdf" and st.session_state.get(f"show_preview_{doc['_id']}", False): + @st.dialog(f"Preview: {doc.get('filename')}", width="large") + def show_pdf_preview(): + import base64 + + pdf_data = download_pdf_from_mongodb(str(doc['_id'])) + if pdf_data: + # Add download button at the top + st.download_button( + label="⬇️ Download PDF", + data=pdf_data["content"], + file_name=pdf_data["filename"], + mime="application/pdf", + key=f"dialog_download_{doc['_id']}", + use_container_width=True + ) + + # Encode PDF to base64 for embedding + base64_pdf = base64.b64encode(pdf_data["content"]).decode('utf-8') + + # Display PDF using iframe with base64 data (reduced height) + pdf_display = f''' + + ''' + st.markdown(pdf_display, unsafe_allow_html=True) + else: + st.error("Failed to load PDF") + + show_pdf_preview() + + # Tab 3: Knowledge Base + with tab3: + domains = get_all_domains() + if not domains: + st.warning("No domains found. Create a domain and add documents first.") + else: + domain_names = [d["name"] for d in domains] + kb_domain = st.selectbox("Select Domain", domain_names, key="kb_domain") + + if kb_domain: + # Check if KB already exists + kb_info = get_knowledge_base_info(kb_domain) + + if kb_info: + kb_status = kb_info.get("status", "completed") + + if kb_status == "completed": + st.success(f"βœ… Knowledge base exists for domain '{kb_domain}'") + elif kb_status == "creating": + st.warning(f"⏳ Knowledge base is being created for domain '{kb_domain}'. 
This may take several minutes...") + elif kb_status == "error": + error_msg = kb_info.get("error_message", "Unknown error") + st.error(f"❌ Knowledge base creation failed: {error_msg}") + + col1, col2 = st.columns(2) + with col1: + st.metric("Created", kb_info.get("created_at").strftime("%Y-%m-%d %H:%M")) + st.metric("Documents Processed", kb_info.get("document_count", 0)) + if kb_status == "completed" and kb_info.get("completed_at"): + st.metric("Completed", kb_info.get("completed_at").strftime("%Y-%m-%d %H:%M")) + with col2: + st.metric("Provider", kb_info.get("provider", "N/A")) + st.metric("Model", kb_info.get("model", "N/A")) + st.metric("Status", kb_status.upper()) + + st.write(f"**URLs:** {kb_info.get('url_count', 0)}") + st.write(f"**PDFs:** {kb_info.get('pdf_count', 0)}") + st.write(f"**Embedding:** {kb_info.get('embedding', 'N/A')}") + st.write(f"**Folder:** `{kb_info.get('folder', 'N/A')}`") + + st.divider() + + col1, col2 = st.columns(2) + with col1: + if st.button("πŸ”„ Rebuild Knowledge Base", type="secondary", use_container_width=True): + st.session_state["rebuild_kb"] = True + st.rerun() + with col2: + if st.button("πŸ—‘οΈ Delete Knowledge Base", type="secondary", use_container_width=True): + try: + delete_knowledge_base(kb_domain) + st.success("Knowledge base deleted!") + st.rerun() + except Exception as e: + st.error(f"Error deleting KB: {str(e)}") + else: + st.warning(f"No knowledge base found for domain '{kb_domain}'") + + # Show KB creation form + kb_is_creating = kb_info and kb_info.get("status") == "creating" + + if kb_is_creating and not st.session_state.get("rebuild_kb", False): + st.divider() + st.info("⏳ A knowledge base is currently being created. Please wait for it to complete.") + + # Display logs from database + if kb_info.get("logs"): + st.text_area("Progress Log", "\n".join(kb_info.get("logs")), height=300, key="kb_progress_log", disabled=True) + + # Auto-refresh every 5 seconds if creating + time.sleep(5) + st.rerun() + + if st.button("πŸ”„ Refresh Status", key="refresh_kb_status"): + st.rerun() + elif not kb_info or st.session_state.get("rebuild_kb", False): + st.divider() + st.subheader("Create Knowledge Base") + + # Count documents + documents = get_documents_by_domain(kb_domain) + url_count = sum(1 for doc in documents if doc.get("doc_type") == "url") + pdf_count = sum(1 for doc in documents if doc.get("doc_type") == "pdf") + + if not documents: + st.error("No documents found in this domain. 
Add some documents first!") + else: + st.info(f"This domain contains {len(documents)} document(s): {url_count} URL(s) and {pdf_count} PDF(s)") + + # Configuration + col1, col2 = st.columns(2) + with col1: + kb_provider = st.selectbox("Provider", ["openai", "ollama"], key="kb_provider") + with col2: + if kb_provider == "ollama": + available_models = cached_get_ollama_models() + kb_model = st.selectbox("Model", available_models, key="kb_model") + available_embeddings = cached_get_ollama_embeddings() + default_emb_idx = available_embeddings.index("mxbai-embed-large") if "mxbai-embed-large" in available_embeddings else 0 + kb_embedding = st.selectbox("Embedding", available_embeddings, index=default_emb_idx, key="kb_embedding") + else: + kb_model = st.selectbox("Model", ["gpt-5-nano", "gpt-4.1-nano", "gpt-4o-mini-2024-07-18"], key="kb_model", index=1) + kb_embedding = st.selectbox("Embedding", ["text-embedding-3-small", "text-embedding-3-large"], key="kb_embedding") + + # Performance options + with st.expander("⚑ Performance Options", expanded=False): + enable_parallel = st.checkbox( + "Enable Parallel Processing", + value=True, + help="Process multiple LLM calls and embeddings in parallel for faster KB creation. Recommended for most cases." + ) + max_workers = st.slider( + "Max Workers", + min_value=1, + max_value=8, + value=4, + help="Number of parallel workers. Higher values = faster but more resource intensive. Recommended: 4-8" + ) + if not enable_parallel: + st.info("πŸ’‘ Parallel processing is disabled. KB creation will be slower but use less resources.") + + if st.button("πŸš€ Create Knowledge Base", type="primary", use_container_width=True): + # Create a progress container + log_placeholder = st.empty() + + logs = [] + def log_progress(message): + logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] {message}") + log_placeholder.text_area("Progress Log", "\n".join(logs), height=200) + + # Use background thread so user can switch tabs + result = build_knowledge_base_for_domain( + kb_domain, + provider=kb_provider, + model=kb_model, + embedding=kb_embedding, + progress_callback=log_progress, + use_background_thread=True, + max_workers=max_workers, + enable_parallel=enable_parallel + ) + + + if result["status"] == "success": + st.success(result["message"]) + st.info("πŸ’‘ The knowledge base is being created in the background. 
You can safely switch tabs and check back later.") + st.session_state["rebuild_kb"] = False + st.rerun() + else: + st.error(result["message"]) + + # Tab 4: Delete Domain + with tab4: + st.warning("⚠️ **Warning:** Deleting a domain will permanently remove it and all its documents!") + + domains = get_all_domains() + if not domains: + st.info("No domains to delete.") + else: + domain_names = [d["name"] for d in domains] + delete_domain_name = st.selectbox("Select Domain to Delete", domain_names, key="delete_domain") + + if delete_domain_name: + doc_count = get_collection().count_documents({"type": "document", "domain": delete_domain_name}) + st.error(f"This will delete domain '{delete_domain_name}' and its {doc_count} document(s).") + + confirm_text = st.text_input(f"Type '{delete_domain_name}' to confirm deletion:", key="confirm_delete") + + if st.button("Delete Domain Permanently", type="primary"): + if confirm_text == delete_domain_name: + try: + delete_domain(delete_domain_name) + st.success(f"Domain '{delete_domain_name}' deleted successfully!") + st.rerun() + except Exception as e: + st.error(f"Error deleting domain: {str(e)}") + else: + st.error("Domain name doesn't match. Deletion cancelled.") + + +# Main application with navigation +def main(): + """Main application entry point with page navigation.""" + st.set_page_config( + page_title="Energenius Platform", + page_icon="⚑", + layout="wide" + ) + + # Hide Streamlit menu and reduce top padding + hide_streamlit_style = """ + + """ + st.markdown(hide_streamlit_style, unsafe_allow_html=True) + + # Create navigation buttons + col1, col2, col3 = st.columns([1, 1, 4]) + + with col1: + if st.button("πŸ’¬ GURU Chat", use_container_width=True, type="primary" if st.session_state.get("page", "guru") == "guru" else "secondary"): + st.session_state["page"] = "guru" + st.rerun() + + with col2: + if st.button("πŸ“š Document Manager", use_container_width=True, type="primary" if st.session_state.get("page", "guru") == "documents" else "secondary"): + st.session_state["page"] = "documents" + st.rerun() + + st.divider() + + # Initialize page selection + if "page" not in st.session_state: + st.session_state["page"] = "guru" + + # Render the selected page + if st.session_state["page"] == "guru": + render_guru_page() + else: + render_document_manager_page() + + +if __name__ == "__main__": + main() diff --git a/energy_bench.py b/energy_bench.py index 995319f..96b8f24 100644 --- a/energy_bench.py +++ b/energy_bench.py @@ -1,10 +1,14 @@ """Module to benchmark the energy consumption of the Guru orchestrator.""" +import os import pandas as pd +from dotenv import load_dotenv from benchmark import Benchmark from orchestrator import Guru -from private_settings import PRIVATE_SETINGS + +# Load environment variables from .env file +load_dotenv() if __name__ == "__main__": @@ -12,7 +16,8 @@ dataset = pd.read_csv("benchmark/backup/DatasetQA.csv") # Create the Guru instance - if PRIVATE_SETINGS["LLM_LOCAL"]: + llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" + if llm_local: guru = Guru("ollama", "gpt-oss:120b", "mxbai-embed-large", "english", 0, "Italy") else: guru = Guru("openai", "gpt-4", "text-embedding-3-small", "english", 0, "Italy") diff --git a/kb_creator.py b/kb_creator.py index 7ec13db..78fe57a 100644 --- a/kb_creator.py +++ b/kb_creator.py @@ -1,12 +1,16 @@ """Knowledge Base Creator""" +import os +from dotenv import load_dotenv from knowledge_base import KnowledgeExtractor -from private_settings import PRIVATE_SETINGS +# Load environment 
variables from .env file +load_dotenv() # Creating and running the knowledge base class based on the environment -if PRIVATE_SETINGS["LLM_LOCAL"]: - ke = KnowledgeExtractor("ollama", "gpt-oss", "mxbai-embed-large") +llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" +if llm_local: + ke = KnowledgeExtractor("ollama", "gemma3:12b-it-qat", "mxbai-embed-large") else: # Online ke = KnowledgeExtractor("openai", "gpt-4", "text-embedding-3-small") @@ -18,4 +22,6 @@ "https://www.agenziaentrate.gov.it/portale/web/guest/aree-tematiche/casa/agevolazioni/bonus-mobili-ed-elettrodomestici", "https://italiainclassea.enea.it/le-tecnologie/", ], + load_cached_docs=True, + load_cached_preprocessed_chunks=True ) diff --git a/kb_creator_Europe.py b/kb_creator_Europe.py index f04e6f8..50e06d5 100644 --- a/kb_creator_Europe.py +++ b/kb_creator_Europe.py @@ -1,11 +1,15 @@ """Knowledge Base Creator""" +import os +from dotenv import load_dotenv from knowledge_base import KnowledgeExtractor -from private_settings import PRIVATE_SETINGS +# Load environment variables from .env file +load_dotenv() # Creating and running the knowledge base class based on the environment -if PRIVATE_SETINGS["LLM_LOCAL"]: +llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" +if llm_local: ke = KnowledgeExtractor("ollama", "gpt-oss:120b", "mxbai-embed-large") else: # Online diff --git a/kb_creator_Italy.py b/kb_creator_Italy.py index 392c4f0..a8c6903 100644 --- a/kb_creator_Italy.py +++ b/kb_creator_Italy.py @@ -1,11 +1,15 @@ """Knowledge Base Creator""" +import os +from dotenv import load_dotenv from knowledge_base import KnowledgeExtractor -from private_settings import PRIVATE_SETINGS +# Load environment variables from .env file +load_dotenv() # Creating and running the knowledge base class based on the environment -if PRIVATE_SETINGS["LLM_LOCAL"]: +llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" +if llm_local: ke = KnowledgeExtractor("ollama", "gpt-oss:120b", "mxbai-embed-large") else: # Online diff --git a/kb_creator_Switzerland.py b/kb_creator_Switzerland.py index 627bd90..c83be84 100644 --- a/kb_creator_Switzerland.py +++ b/kb_creator_Switzerland.py @@ -1,11 +1,15 @@ """Knowledge Base Creator""" +import os +from dotenv import load_dotenv from knowledge_base import KnowledgeExtractor -from private_settings import PRIVATE_SETINGS +# Load environment variables from .env file +load_dotenv() # Creating and running the knowledge base class based on the environment -if PRIVATE_SETINGS["LLM_LOCAL"]: +llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" +if llm_local: ke = KnowledgeExtractor("ollama", "gpt-oss:120b", "mxbai-embed-large") else: # Online diff --git a/knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/data_level0.bin b/knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/data_level0.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/data_level0.bin rename to knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/data_level0.bin diff --git a/knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/header.bin b/knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/header.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/header.bin rename to knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/header.bin diff --git 
a/knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/length.bin b/knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/length.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/length.bin rename to knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/length.bin diff --git a/knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/link_lists.bin b/knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/link_lists.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/link_lists.bin rename to knowledge_base/kbs/files_Europe/chroma_db/35d3a63a-a234-48ab-bda8-430f4bd7c0f7/link_lists.bin diff --git a/knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/data_level0.bin b/knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/data_level0.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/data_level0.bin rename to knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/data_level0.bin diff --git a/knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/header.bin b/knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/header.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/header.bin rename to knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/header.bin diff --git a/knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/length.bin b/knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/length.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/length.bin rename to knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/length.bin diff --git a/knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/link_lists.bin b/knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/link_lists.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/link_lists.bin rename to knowledge_base/kbs/files_Europe/chroma_db/52d84313-b357-4a79-8d9e-978cf10dd750/link_lists.bin diff --git a/knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/data_level0.bin b/knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/data_level0.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/data_level0.bin rename to knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/data_level0.bin diff --git a/knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/header.bin b/knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/header.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/header.bin rename to knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/header.bin diff --git a/knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/length.bin b/knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/length.bin similarity 
index 100% rename from knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/length.bin rename to knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/length.bin diff --git a/knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/link_lists.bin b/knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/link_lists.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/link_lists.bin rename to knowledge_base/kbs/files_Europe/chroma_db/59f240be-98ad-4310-bd23-aa8c690b72e2/link_lists.bin diff --git a/knowledge_base/files_Europe/chroma_db/chroma.sqlite3 b/knowledge_base/kbs/files_Europe/chroma_db/chroma.sqlite3 similarity index 100% rename from knowledge_base/files_Europe/chroma_db/chroma.sqlite3 rename to knowledge_base/kbs/files_Europe/chroma_db/chroma.sqlite3 diff --git a/knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/data_level0.bin b/knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/data_level0.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/data_level0.bin rename to knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/data_level0.bin diff --git a/knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/header.bin b/knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/header.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/header.bin rename to knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/header.bin diff --git a/knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/length.bin b/knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/length.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/length.bin rename to knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/length.bin diff --git a/knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/link_lists.bin b/knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/link_lists.bin similarity index 100% rename from knowledge_base/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/link_lists.bin rename to knowledge_base/kbs/files_Europe/chroma_db/f3c723d9-c9da-4bab-8e7f-f22c845a9c85/link_lists.bin diff --git a/knowledge_base/files_Europe/graph_documents.joblib b/knowledge_base/kbs/files_Europe/graph_documents.joblib similarity index 100% rename from knowledge_base/files_Europe/graph_documents.joblib rename to knowledge_base/kbs/files_Europe/graph_documents.joblib diff --git a/knowledge_base/files_Europe/preprocessed_chunks.joblib b/knowledge_base/kbs/files_Europe/preprocessed_chunks.joblib similarity index 100% rename from knowledge_base/files_Europe/preprocessed_chunks.joblib rename to knowledge_base/kbs/files_Europe/preprocessed_chunks.joblib diff --git a/knowledge_base/files_Europe/raw_docs.joblib b/knowledge_base/kbs/files_Europe/raw_docs.joblib similarity index 100% rename from knowledge_base/files_Europe/raw_docs.joblib rename to knowledge_base/kbs/files_Europe/raw_docs.joblib diff --git a/knowledge_base/files_Europe/rdf_graph.ttl b/knowledge_base/kbs/files_Europe/rdf_graph.ttl similarity index 100% rename from 
knowledge_base/files_Europe/rdf_graph.ttl rename to knowledge_base/kbs/files_Europe/rdf_graph.ttl diff --git a/knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/data_level0.bin b/knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/data_level0.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/data_level0.bin rename to knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/data_level0.bin diff --git a/knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/header.bin b/knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/header.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/header.bin rename to knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/header.bin diff --git a/knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/index_metadata.pickle b/knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/index_metadata.pickle similarity index 100% rename from knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/index_metadata.pickle rename to knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/index_metadata.pickle diff --git a/knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/length.bin b/knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/length.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/length.bin rename to knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/length.bin diff --git a/knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/link_lists.bin b/knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/link_lists.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/link_lists.bin rename to knowledge_base/kbs/files_Generic/chroma_db/5c997f27-f387-4015-a27c-27b8cf5ea877/link_lists.bin diff --git a/knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/data_level0.bin b/knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/data_level0.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/data_level0.bin rename to knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/data_level0.bin diff --git a/knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/header.bin b/knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/header.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/header.bin rename to knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/header.bin diff --git a/knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/length.bin b/knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/length.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/length.bin rename to knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/length.bin diff --git 
a/knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/link_lists.bin b/knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/link_lists.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/link_lists.bin rename to knowledge_base/kbs/files_Generic/chroma_db/72e31084-777b-4f13-b5ec-59a858438f8a/link_lists.bin diff --git a/knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/data_level0.bin b/knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/data_level0.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/data_level0.bin rename to knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/data_level0.bin diff --git a/knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/header.bin b/knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/header.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/header.bin rename to knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/header.bin diff --git a/knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/length.bin b/knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/length.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/length.bin rename to knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/length.bin diff --git a/knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/link_lists.bin b/knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/link_lists.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/link_lists.bin rename to knowledge_base/kbs/files_Generic/chroma_db/a034a1de-d422-487f-8a96-2fb9ff1f64a4/link_lists.bin diff --git a/knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/data_level0.bin b/knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/data_level0.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/data_level0.bin rename to knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/data_level0.bin diff --git a/knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/header.bin b/knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/header.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/header.bin rename to knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/header.bin diff --git a/knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/length.bin b/knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/length.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/length.bin rename to knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/length.bin diff --git a/knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/link_lists.bin 
b/knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/link_lists.bin similarity index 100% rename from knowledge_base/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/link_lists.bin rename to knowledge_base/kbs/files_Generic/chroma_db/af18c57f-035c-4ea1-bb05-e707a5567325/link_lists.bin diff --git a/knowledge_base/files_Generic/chroma_db/chroma.sqlite3 b/knowledge_base/kbs/files_Generic/chroma_db/chroma.sqlite3 similarity index 100% rename from knowledge_base/files_Generic/chroma_db/chroma.sqlite3 rename to knowledge_base/kbs/files_Generic/chroma_db/chroma.sqlite3 diff --git a/knowledge_base/files_Generic/graph_documents.joblib b/knowledge_base/kbs/files_Generic/graph_documents.joblib similarity index 100% rename from knowledge_base/files_Generic/graph_documents.joblib rename to knowledge_base/kbs/files_Generic/graph_documents.joblib diff --git a/knowledge_base/files_Generic/preprocessed_chunks.joblib b/knowledge_base/kbs/files_Generic/preprocessed_chunks.joblib similarity index 100% rename from knowledge_base/files_Generic/preprocessed_chunks.joblib rename to knowledge_base/kbs/files_Generic/preprocessed_chunks.joblib diff --git a/knowledge_base/files_Generic/raw_docs.joblib b/knowledge_base/kbs/files_Generic/raw_docs.joblib similarity index 100% rename from knowledge_base/files_Generic/raw_docs.joblib rename to knowledge_base/kbs/files_Generic/raw_docs.joblib diff --git a/knowledge_base/files_Generic/rdf_graph.ttl b/knowledge_base/kbs/files_Generic/rdf_graph.ttl similarity index 100% rename from knowledge_base/files_Generic/rdf_graph.ttl rename to knowledge_base/kbs/files_Generic/rdf_graph.ttl diff --git a/knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/data_level0.bin b/knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/data_level0.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/data_level0.bin rename to knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/data_level0.bin diff --git a/knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/header.bin b/knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/header.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/header.bin rename to knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/header.bin diff --git a/knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/index_metadata.pickle b/knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/index_metadata.pickle similarity index 100% rename from knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/index_metadata.pickle rename to knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/index_metadata.pickle diff --git a/knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/length.bin b/knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/length.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/length.bin rename to knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/length.bin diff --git a/knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/link_lists.bin b/knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/link_lists.bin 
similarity index 100% rename from knowledge_base/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/link_lists.bin rename to knowledge_base/kbs/files_Italy/chroma_db/125b5628-3aad-4821-9cf0-4d625fde6b6d/link_lists.bin diff --git a/knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/data_level0.bin b/knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/data_level0.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/data_level0.bin rename to knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/data_level0.bin diff --git a/knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/header.bin b/knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/header.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/header.bin rename to knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/header.bin diff --git a/knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/index_metadata.pickle b/knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/index_metadata.pickle similarity index 100% rename from knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/index_metadata.pickle rename to knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/index_metadata.pickle diff --git a/knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/length.bin b/knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/length.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/length.bin rename to knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/length.bin diff --git a/knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/link_lists.bin b/knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/link_lists.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/link_lists.bin rename to knowledge_base/kbs/files_Italy/chroma_db/313a7e81-1680-4717-b906-a80ff3f06b3d/link_lists.bin diff --git a/knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/data_level0.bin b/knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/data_level0.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/data_level0.bin rename to knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/data_level0.bin diff --git a/knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/header.bin b/knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/header.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/header.bin rename to knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/header.bin diff --git a/knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/index_metadata.pickle b/knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/index_metadata.pickle similarity index 100% rename from knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/index_metadata.pickle rename to 
knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/index_metadata.pickle diff --git a/knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/length.bin b/knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/length.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/length.bin rename to knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/length.bin diff --git a/knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/link_lists.bin b/knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/link_lists.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/link_lists.bin rename to knowledge_base/kbs/files_Italy/chroma_db/38827c57-f211-4721-b00d-bca74efe69ac/link_lists.bin diff --git a/knowledge_base/files_Italy/chroma_db/chroma.sqlite3 b/knowledge_base/kbs/files_Italy/chroma_db/chroma.sqlite3 similarity index 100% rename from knowledge_base/files_Italy/chroma_db/chroma.sqlite3 rename to knowledge_base/kbs/files_Italy/chroma_db/chroma.sqlite3 diff --git a/knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/data_level0.bin b/knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/data_level0.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/data_level0.bin rename to knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/data_level0.bin diff --git a/knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/header.bin b/knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/header.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/header.bin rename to knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/header.bin diff --git a/knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/length.bin b/knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/length.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/length.bin rename to knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/length.bin diff --git a/knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/link_lists.bin b/knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/link_lists.bin similarity index 100% rename from knowledge_base/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/link_lists.bin rename to knowledge_base/kbs/files_Italy/chroma_db/eb1d395e-7e32-42ec-907d-732e365f958d/link_lists.bin diff --git a/knowledge_base/files_Italy/graph_documents.joblib b/knowledge_base/kbs/files_Italy/graph_documents.joblib similarity index 100% rename from knowledge_base/files_Italy/graph_documents.joblib rename to knowledge_base/kbs/files_Italy/graph_documents.joblib diff --git a/knowledge_base/files_Italy/preprocessed_chunks.joblib b/knowledge_base/kbs/files_Italy/preprocessed_chunks.joblib similarity index 100% rename from knowledge_base/files_Italy/preprocessed_chunks.joblib rename to knowledge_base/kbs/files_Italy/preprocessed_chunks.joblib diff --git a/knowledge_base/files_Italy/raw_docs.joblib b/knowledge_base/kbs/files_Italy/raw_docs.joblib similarity index 100% rename 
from knowledge_base/files_Italy/raw_docs.joblib rename to knowledge_base/kbs/files_Italy/raw_docs.joblib diff --git a/knowledge_base/files_Italy/rdf_graph.ttl b/knowledge_base/kbs/files_Italy/rdf_graph.ttl similarity index 100% rename from knowledge_base/files_Italy/rdf_graph.ttl rename to knowledge_base/kbs/files_Italy/rdf_graph.ttl diff --git a/knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/data_level0.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/data_level0.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/data_level0.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/data_level0.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/header.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/header.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/header.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/header.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/index_metadata.pickle b/knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/index_metadata.pickle similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/index_metadata.pickle rename to knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/index_metadata.pickle diff --git a/knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/length.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/length.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/length.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/length.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/link_lists.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/link_lists.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/link_lists.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/0f03843a-fd43-4d27-8b2d-5a2bd81df494/link_lists.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/data_level0.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/data_level0.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/data_level0.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/data_level0.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/header.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/header.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/header.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/header.bin diff --git 
a/knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/length.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/length.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/length.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/length.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/link_lists.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/link_lists.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/link_lists.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/984bbfa9-2b07-4d94-a385-ace07e7e2ebd/link_lists.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/chroma.sqlite3 b/knowledge_base/kbs/files_Switzerland/chroma_db/chroma.sqlite3 similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/chroma.sqlite3 rename to knowledge_base/kbs/files_Switzerland/chroma_db/chroma.sqlite3 diff --git a/knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/data_level0.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/data_level0.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/data_level0.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/data_level0.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/header.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/header.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/header.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/header.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/length.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/length.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/length.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/length.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/link_lists.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/link_lists.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/link_lists.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e2420058-0eed-403c-970c-9ca2d361f757/link_lists.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/data_level0.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/data_level0.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/data_level0.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/data_level0.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/header.bin 
b/knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/header.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/header.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/header.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/length.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/length.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/length.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/length.bin diff --git a/knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/link_lists.bin b/knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/link_lists.bin similarity index 100% rename from knowledge_base/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/link_lists.bin rename to knowledge_base/kbs/files_Switzerland/chroma_db/e9ff614d-0eb3-4162-8ebf-5d9f8e021cab/link_lists.bin diff --git a/knowledge_base/files_Switzerland/graph_documents.joblib b/knowledge_base/kbs/files_Switzerland/graph_documents.joblib similarity index 100% rename from knowledge_base/files_Switzerland/graph_documents.joblib rename to knowledge_base/kbs/files_Switzerland/graph_documents.joblib diff --git a/knowledge_base/files_Switzerland/preprocessed_chunks.joblib b/knowledge_base/kbs/files_Switzerland/preprocessed_chunks.joblib similarity index 100% rename from knowledge_base/files_Switzerland/preprocessed_chunks.joblib rename to knowledge_base/kbs/files_Switzerland/preprocessed_chunks.joblib diff --git a/knowledge_base/files_Switzerland/raw_docs.joblib b/knowledge_base/kbs/files_Switzerland/raw_docs.joblib similarity index 100% rename from knowledge_base/files_Switzerland/raw_docs.joblib rename to knowledge_base/kbs/files_Switzerland/raw_docs.joblib diff --git a/knowledge_base/files_Switzerland/rdf_graph.ttl b/knowledge_base/kbs/files_Switzerland/rdf_graph.ttl similarity index 100% rename from knowledge_base/files_Switzerland/rdf_graph.ttl rename to knowledge_base/kbs/files_Switzerland/rdf_graph.ttl diff --git a/knowledge_base/knowledge_extractor.py b/knowledge_base/knowledge_extractor.py index e488ada..8a5fcbf 100644 --- a/knowledge_base/knowledge_extractor.py +++ b/knowledge_base/knowledge_extractor.py @@ -3,17 +3,18 @@ import hashlib import os import re +from concurrent.futures import ThreadPoolExecutor, as_completed import joblib import numpy as np from langchain.embeddings import init_embeddings -from langchain_community.document_loaders import AsyncHtmlLoader, PyPDFLoader -from langchain_community.document_transformers import Html2TextTransformer +from langchain_community.document_loaders import AsyncHtmlLoader +from langchain_community.graphs.graph_document import GraphDocument from langchain_core.documents import Document from langchain_experimental.graph_transformers import LLMGraphTransformer from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_experimental.text_splitter import SemanticChunker -from rdflib import FOAF, OWL, RDF, RDFS, XSD, BNode, Graph, Literal, Namespace, URIRef +from rdflib import RDF, XSD, BNode, Literal from sklearn.metrics.pairwise import cosine_similarity from tqdm import tqdm import chromadb @@ -26,32 +27,35 @@ from llm import LLMHandler -from 
.utils.graph_prompt import entities_comparator, extract_descriptions_for_entities, extract_descriptions_for_triples, representative_entity_selector, translate_chunk, summarize_chunk -from .utils.graph_helpers import process_name_or_relationship, normalize_l2, sparql_query +from .utils.graph_prompt import entities_comparator, extract_descriptions_for_entities, extract_descriptions_for_triples, translate_chunk, summarize_chunk +from .utils.graph_helpers import process_name_or_relationship from .utils.energenius_graph import EnergeniusGraph -from itertools import permutations - from bs4 import BeautifulSoup class KnowledgeExtractor: """_Class to create a knowledge base from a text files._""" - def __init__(self, provider: str, model: str, embedding: str): + def __init__(self, provider: str, model: str, embedding: str, max_workers: int = 4): """_Initialize the KnowledgeExtractor._ Args: provider (str): _Description of the model provider._ model (str): _Description of the model name._ embedding (str): _Description of the embedding model name._ + max_workers (int): _Maximum number of parallel workers for LLM calls (default: 4)._ """ + # Set temperature based on model - gpt-5 models require temperature=1.0 + temperature = 1.0 if model.startswith("gpt-5") else 0.0 + # Initialize the LLMHandler and embedding model. self.llm_handler = LLMHandler( - provider=provider, model=model, temperature=0.0, language=None, keep_history=False + provider=provider, model=model, temperature=temperature, language=None, keep_history=False ) self.embeddings = init_embeddings(model=embedding, provider=provider) + self.max_workers = max_workers self.llm_graph_transformer = LLMGraphTransformer( llm=self.llm_handler.get_model(), @@ -83,6 +87,125 @@ def _get_first_sentence(self, text): sentences = re.split(r'(?<=[.!?])\s+', text.strip()) return sentences[0] if sentences else '' + def _process_chunk_parallel(self, i, chunk, prev_content, next_content): + """Process a single chunk for translation and summarization (parallel worker).""" + try: + if "language" not in chunk.metadata.keys(): + chunk.metadata["language"] = "na" + + # Translation + if "en" not in chunk.metadata["language"].lower(): + chunk.page_content = self._strip_quotes( + self.llm_handler.generate_response(translate_chunk(), f"{chunk.page_content}", False) + ) + + # Summarization + curr = chunk.page_content + context = "\n".join(filter(None, [ + self._get_last_sentence(prev_content) if prev_content else None, + curr, + self._get_first_sentence(next_content) if next_content else None + ])) + chunk.page_content = self._strip_quotes( + self.llm_handler.generate_response(summarize_chunk(), context, False) + ).replace("\n\n", "\n") + + return i, chunk + except Exception as e: + print(f"\n[Warning] Failed to process chunk {i}: {str(e)}") + return i, chunk # Return original chunk on error + + def _convert_to_graph_parallel(self, doc, max_retries=2): + """Convert a single document to graph format (parallel worker) with retry logic and timeout.""" + for attempt in range(max_retries + 1): + try: + result = self.llm_graph_transformer.convert_to_graph_documents([doc])[0] + return result + + except Exception as e: + error_msg = str(e) + + # Check if it's a token limit error + if "length limit was reached" in error_msg or "completion_tokens" in error_msg: + if attempt < max_retries: + # Try splitting the chunk further + current_size = len(doc.page_content) + new_size = int(current_size * 0.5) # Reduce by 50% + + print(f"\n[Warning] Chunk too large ({current_size} chars), 
splitting into smaller pieces (target: {new_size} chars)...") + + splitter = RecursiveCharacterTextSplitter( + chunk_size=new_size, + chunk_overlap=0, + length_function=len, + separators=["\n\n", "\n", ". ", " ", ""] + ) + + # Split and take only the first sub-chunk to avoid duplication + sub_chunks = splitter.split_text(doc.page_content) + if sub_chunks: + doc = doc.model_copy(update={"page_content": sub_chunks[0]}) + continue + + # If not a token limit error or final attempt, return empty + if attempt == max_retries: + print(f"\n[Error] Failed to convert chunk after {max_retries + 1} attempts: {error_msg}") + return GraphDocument(nodes=[], relationships=[], source=doc) + + # Fallback (should not reach here) + return GraphDocument(nodes=[], relationships=[], source=doc) + + def _generate_triple_description_parallel(self, row): + """Generate description for a single triple (parallel worker).""" + try: + chunk = f"{row['prev_chunk_content']}\n\n{row['chunk_content']}\n\n{row['next_chunk_content']}" + description = self._strip_quotes( + self.llm_handler.generate_response( + extract_descriptions_for_triples(f"{chunk}"), + f"{row['source_entity_name']} {row['relationship_name']} {row['target_entity_name']}", + False + ) + ).replace("\n\n", "\n") + return row['triple'], description + except Exception as e: + print(f"\n[Warning] Failed to generate triple description: {str(e)}") + return row['triple'], "" # Return empty description on error + + def _generate_entity_description_parallel(self, row, graph): + """Generate description for a single entity (parallel worker).""" + try: + types = graph.get_types(row["entity"]) + entity_description_from_triples = '\n'.join(f'{row["name"]} is {type["name"]}.' for _, type in types.iterrows()) + entity_description_from_triples += "\n".join(graph.get_entity_triples(row["entity"])["description"]) + + description = self._strip_quotes( + self.llm_handler.generate_response( + extract_descriptions_for_entities(f"{entity_description_from_triples}"), + f"{row['name']}", + False + ) + ).replace("\n\n", "\n") + return row["entity"], description + except Exception as e: + print(f"\n[Warning] Failed to generate entity description: {str(e)}") + return row["entity"], "" # Return empty description on error + + def _embed_item_parallel(self, item_data, item_type): + """Embed a single item (parallel worker).""" + try: + if item_type == "entity": + return item_data["entity"], self.embeddings.embed_query(item_data["name"]) + elif item_type == "type": + return item_data["type"], self.embeddings.embed_query(item_data["name"]) + elif item_type == "relationship": + return item_data["relationship"], self.embeddings.embed_query(item_data["name"]) + elif item_type == "triple": + return item_data["triple"], self.embeddings.embed_query(item_data["description"]) + except Exception as e: + print(f"\n[Warning] Failed to embed {item_type}: {str(e)}") + # Return zero vector on error + return list(item_data.values())[0], [0.0] * 1536 # Typical embedding dimension + def __extract_main_content(self, html): # Alternatively #return Html2TextTransformer().transform_documents(html_docs) @@ -179,6 +302,7 @@ def run( load_cached_triple_descriptions: bool = False, load_cached_entity_descriptions: bool = False, load_cached_embeddings: bool = False, + enable_parallel: bool = True, ) -> None: """_Main function to create the knowledge base._ Args: @@ -191,11 +315,12 @@ def run( load_cached_triple_descriptions (bool, optional): Whether to load cached triple descriptions. Defaults to False. 
load_cached_entity_descriptions (bool, optional): Whether to load cached entity descriptions. Defaults to False. load_cached_embeddings (bool, optional): Whether to load cached embeddings for the knowledge base. Defaults to False. + enable_parallel (bool, optional): Whether to use parallel processing for LLM calls and embeddings. Defaults to True. """ # Initialize the variables dir_path = os.path.dirname(os.path.realpath(__file__)) - path = os.path.join(dir_path, folder) + path = os.path.join(dir_path, "kbs", folder) # Checking if files folder is present if not os.path.exists(path): @@ -231,13 +356,20 @@ def run( # Load PDF documents and convert to HTML for pdf_url in pdf_urls: try: - # Download PDF from URL - response = requests.get(pdf_url, timeout=30) - response.raise_for_status() + # Check if it's a local file path or URL + if os.path.exists(pdf_url): + # Local file path + with open(pdf_url, 'rb') as f: + pdf_content = f.read() + else: + # Download PDF from URL + response = requests.get(pdf_url, timeout=30) + response.raise_for_status() + pdf_content = response.content # Extract text with layout preservation as HTML output_string = io.StringIO() - pdf_file = io.BytesIO(response.content) + pdf_file = io.BytesIO(pdf_content) extract_text_to_fp( pdf_file, @@ -305,9 +437,9 @@ def run( temp_chunks.append(chunk.model_copy(update={"page_content": text})) chunks = temp_chunks - # Size limiter 2 + # Size limiter 2 - Reduced from 1500 to 1200 to prevent token overflow in graph conversion size_limiter = RecursiveCharacterTextSplitter( - chunk_size=1500, + chunk_size=1200, chunk_overlap=0, length_function=len, separators=[".", "!", "?"], @@ -342,30 +474,57 @@ def run( else: preprocessed_chunks = chunks - for i in tqdm(range(len(preprocessed_chunks)), desc="Translation & summarization of the chunks: "): - - if "language" not in preprocessed_chunks[i].metadata.keys(): - preprocessed_chunks[i].metadata["language"] = "na" + if enable_parallel and self.max_workers > 1: + print(f"Using parallel processing with {self.max_workers} workers for chunk preprocessing") + + # Process chunks in parallel - pass prev/next content to avoid race conditions + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = { + executor.submit( + self._process_chunk_parallel, + i, + chunk, + preprocessed_chunks[i - 1].page_content if i > 0 else "", + preprocessed_chunks[i + 1].page_content if i < len(preprocessed_chunks) - 1 else "" + ): i + for i, chunk in enumerate(preprocessed_chunks) + } + + # Collect results with progress bar with timeout + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc="Translation & summarization of chunks: "): + try: + idx, processed_chunk = future.result(timeout=60) + preprocessed_chunks[idx] = processed_chunk + except Exception as e: + idx = futures[future] + print(f"\n[Error] Chunk {idx} failed: {str(e)}") + # Keep original chunk on failure + else: + # Sequential processing (original behavior) + for i in tqdm(range(len(preprocessed_chunks)), desc="Translation & summarization of the chunks: "): + + if "language" not in preprocessed_chunks[i].metadata.keys(): + preprocessed_chunks[i].metadata["language"] = "na" + + # Translation + if "en" not in preprocessed_chunks[i].metadata["language"].lower(): + preprocessed_chunks[i].page_content = self._strip_quotes( + self.llm_handler.generate_response(translate_chunk(), f"{preprocessed_chunks[i].page_content}", False) + ) - # Translation - if "en" not in 
preprocessed_chunks[i].metadata["language"].lower(): + # Summarization + prev = preprocessed_chunks[i - 1].page_content if i > 0 else "" + curr = preprocessed_chunks[i].page_content + next_ = preprocessed_chunks[i + 1].page_content if i < len(preprocessed_chunks) - 1 else "" + context = "\n".join(filter(None, [ + self._get_last_sentence(prev) if prev else None, + curr, + self._get_first_sentence(next_) if next_ else None + ])) + print(f"\n\n{context}") preprocessed_chunks[i].page_content = self._strip_quotes( - self.llm_handler.generate_response(translate_chunk(), f"{preprocessed_chunks[i].page_content}", False) - ) - - # Summarization - prev = preprocessed_chunks[i - 1].page_content if i > 0 else "" - curr = preprocessed_chunks[i].page_content - next_ = preprocessed_chunks[i + 1].page_content if i < len(preprocessed_chunks) - 1 else "" - context = "\n".join(filter(None, [ - self._get_last_sentence(prev) if prev else None, - curr, - self._get_first_sentence(next_) if next_ else None - ])) - print(f"\n\n{context}") - preprocessed_chunks[i].page_content = self._strip_quotes( - self.llm_handler.generate_response(summarize_chunk(), context, False) - ).replace("\n\n", "\n") + self.llm_handler.generate_response(summarize_chunk(), context, False) + ).replace("\n\n", "\n") joblib.dump(preprocessed_chunks, os.path.join(path, "preprocessed_chunks.joblib")) # Save @@ -380,18 +539,40 @@ def run( print("No existing knowledge base found.") return else: - - graph_documents = [] - for doc in tqdm(preprocessed_chunks, desc="Conversion to graph documents: "): # Nodes and relationships extraction - graph_from_chunk = self.llm_graph_transformer.convert_to_graph_documents([doc])[0] - print("\n".join([f"{rel.source.id} ({rel.source.type}), {rel.type}, {rel.target.id} ({rel.target.type})" for rel in graph_from_chunk.relationships])) - graph_documents.append(graph_from_chunk) + if enable_parallel and self.max_workers > 1: + print(f"Using parallel processing with {self.max_workers} workers for graph conversion") + + graph_documents = [] + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(self._convert_to_graph_parallel, doc): i for i, doc in enumerate(preprocessed_chunks)} + + # Collect results with progress bar, maintaining order + results = {} + for future in tqdm(as_completed(futures, timeout=600), total=len(futures), desc="Conversion to graph documents: "): + try: + idx = futures[future] + graph_from_chunk = future.result(timeout=120) + results[idx] = graph_from_chunk + except Exception as e: + idx = futures[future] + print(f"\n[Error] Graph conversion {idx} failed: {str(e)}") + results[idx] = GraphDocument(nodes=[], relationships=[], source=preprocessed_chunks[idx]) + + # Sort by index to maintain order + graph_documents = [results[i] for i in range(len(results))] + else: + # Sequential processing (original behavior) + graph_documents = [] + for doc in tqdm(preprocessed_chunks, desc="Conversion to graph documents: "): # Nodes and relationships extraction + graph_from_chunk = self._convert_to_graph_parallel(doc) # Use same retry logic + print("\n".join([f"{rel.source.id} ({rel.source.type}), {rel.type}, {rel.target.id} ({rel.target.type})" for rel in graph_from_chunk.relationships])) + graph_documents.append(graph_from_chunk) joblib.dump(graph_documents, os.path.join(path, "graph_documents.joblib")) # Save # --- Syntactic disambiguation --- - + print("Starting syntactic disambiguation...") def is_valid_text(text): # Check for non-empty alphanumeric content if not 
re.match(r'^(?=.*[a-zA-Z0-9]).+$', text): @@ -436,33 +617,86 @@ def is_valid_text(text): if re.match(r'^(?=.*[a-zA-Z0-9]).+$', rel.target.type): all_entities[rel.target.type] = rel.target.type + # Helper function for parallel entity comparison + def compare_entity_pair(i, j, ids, similarity_matrix): + """Compare a pair of entities to determine if they should be merged.""" + try: + if similarity_matrix[i][j] > 0.9 and not re.search(r'\d', ids[i]) and not re.search(r'\d', ids[j]): + same = self.llm_handler.generate_response(entities_comparator(), f"{ids[i]}\n{ids[j]}", False) == "Same" + if same: + def to_keep(s1, s2): + s1c = s1.count(' ') + s2c = s2.count(' ') + if s1c > s2c and s1c < 5: return s1 + elif s1c < s2c and s1c < 5: return s2 + else: return s1 if len(s1) <= len(s2) else s2 + ent = to_keep(ids[i], ids[j]) + print(f"{ids[i]} - {ids[j]} -> {ent}") + return (ids[i], ids[j], ent) + return None + except Exception as e: + print(f"\n[Warning] Failed to compare entities {ids[i]} and {ids[j]}: {str(e)}") + return None + # Compute cosine similarity matrix - merged_map = {} - for iterations in range(5): + for iterations in tqdm(range(5), desc="Syntactic disambiguation iterations: "): if not all_entities: break ids = list(all_entities.keys()) - embeddings = np.array([self.embeddings.embed_query(key) for key in ids]) + embeddings = np.array([self.embeddings.embed_query(key) for key in tqdm(ids, desc=f" Embedding entities (iteration {iterations+1}): ", leave=False)]) similarity_matrix = cosine_similarity(embeddings) # Group similar nodes new_merged_map = {} - for i in tqdm(range(len(ids)), desc="Syntactic disambiguation: "): - for j in range(i + 1, len(ids)): - if similarity_matrix[i][j] > 0.9 and not re.search(r'\d', ids[i]) and not re.search(r'\d', ids[j]): # No numbers - same = self.llm_handler.generate_response(entities_comparator(), f"{ids[i]}\n{ids[j]}", False) == "Same" # If they are not the same thing - if same: - def to_keep(s1, s2): - s1c = s1.count(' ') - s2c = s2.count(' ') - if s1c > s2c and s1c < 5: return s1 - elif s1c < s2c and s1c < 5: return s2 - else: return s1 if len(s1) <= len(s2) else s2 - ent = to_keep(ids[i],ids[j]) # Chose which of the two to keep - print(f"{ids[i]} - {ids[j]} -> {ent}") - # Merge j into i - new_merged_map[ids[i]] = ent - new_merged_map[ids[j]] = ent + + if enable_parallel and self.max_workers > 1: + print(f" Using parallel processing with {self.max_workers} workers for entity comparison") + + # Collect all candidate pairs + candidate_pairs = [] + for i in range(len(ids)): + for j in range(i + 1, len(ids)): + if similarity_matrix[i][j] > 0.9 and not re.search(r'\d', ids[i]) and not re.search(r'\d', ids[j]): + candidate_pairs.append((i, j)) + + print(f" Found {len(candidate_pairs)} candidate pairs to compare (iteration {iterations+1})") + + # Process pairs in parallel + with ThreadPoolExecutor(max_workers=min(6, self.max_workers)) as executor: + futures = { + executor.submit(compare_entity_pair, i, j, ids, similarity_matrix): (i, j) + for i, j in candidate_pairs + } + + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc=f" Comparing entity pairs (iteration {iterations+1}): ", leave=False): + try: + result = future.result(timeout=60) + if result: + id_i, id_j, ent = result + new_merged_map[id_i] = ent + new_merged_map[id_j] = ent + except Exception as e: + i, j = futures[future] + print(f"\n[Error] Entity comparison ({ids[i]}, {ids[j]}) failed: {str(e)}") + else: + # Sequential processing (original behavior) + for i in 
tqdm(range(len(ids)), desc=f" Comparing entity pairs (iteration {iterations+1}): ", leave=False): + for j in range(i + 1, len(ids)): + if similarity_matrix[i][j] > 0.9 and not re.search(r'\d', ids[i]) and not re.search(r'\d', ids[j]): # No numbers + same = self.llm_handler.generate_response(entities_comparator(), f"{ids[i]}\n{ids[j]}", False) == "Same" # Ask the LLM whether the two entities are the same + if same: + def to_keep(s1, s2): + s1c = s1.count(' ') + s2c = s2.count(' ') + if s1c > s2c and s1c < 5: return s1 + elif s1c < s2c and s1c < 5: return s2 + else: return s1 if len(s1) <= len(s2) else s2 + ent = to_keep(ids[i],ids[j]) # Choose which of the two to keep + print(f"{ids[i]} - {ids[j]} -> {ent}") + # Merge j into i + new_merged_map[ids[i]] = ent + new_merged_map[ids[j]] = ent + all_entities = {v: v for v in new_merged_map.values()} # Update graph_documents @@ -610,13 +844,29 @@ def to_keep(s1, s2): triples["chunk_content"] = triples["chunk_content"].str.replace("\n", " ", regex=False) triples["next_chunk_content"] = triples["next_chunk_content"].str.replace("\n", " ", regex=False) - for index, row in tqdm(list(triples.iterrows()), desc="Summarizing triples: "): - chunk = f"{row["prev_chunk_content"]}\n\n{row["chunk_content"]}\n\n{row["next_chunk_content"]}" - description = self._strip_quotes( - self.llm_handler.generate_response( - extract_descriptions_for_triples(f"{chunk}"), f"{row['source_entity_name']} {row["relationship_name"]} {row['target_entity_name']}", False) - ).replace("\n\n", "\n") - graph.rdf_graph.add((row["triple"], graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) + if enable_parallel and self.max_workers > 1: + print(f"Using parallel processing with {self.max_workers} workers for triple descriptions") + + # Max workers to avoid rate limits + with ThreadPoolExecutor(max_workers=min(6, self.max_workers)) as executor: + futures = {executor.submit(self._generate_triple_description_parallel, row): index for index, row in triples.iterrows()} + + for future in tqdm(as_completed(futures, timeout=600), total=len(futures), desc="Summarizing triples: "): + try: + triple_uri, description = future.result(timeout=120) + graph.rdf_graph.add((triple_uri, graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) + except Exception as e: + index = futures[future] + print(f"\n[Error] Triple {index} description failed: {str(e)}") + else: + # Sequential processing (original behavior) + for index, row in tqdm(list(triples.iterrows()), desc="Summarizing triples: "): + chunk = f"{row['prev_chunk_content']}\n\n{row['chunk_content']}\n\n{row['next_chunk_content']}" + description = self._strip_quotes( + self.llm_handler.generate_response( + extract_descriptions_for_triples(f"{chunk}"), f"{row['source_entity_name']} {row['relationship_name']} {row['target_entity_name']}", False) + ).replace("\n\n", "\n") + graph.rdf_graph.add((row["triple"], graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) graph.save_to_file(os.path.join(path, f"{file_name}.ttl")) # Save @@ -634,18 +884,35 @@ def to_keep(s1, s2): # Entities entities = graph.get_entities() - for index, row in tqdm(list(entities.iterrows()), desc="Summarizing entities: "): - # Types - types = graph.get_types(row["entity"]) - entity_description_from_triples = '\n'.join(f'{row["name"]} is {type["name"]}.' 
for _, type in types.iterrows()) - # Descriptions - entity_description_from_triples += "\n".join(graph.get_entity_triples(row["entity"])["description"]) - - description = self._strip_quotes( - self.llm_handler.generate_response( - extract_descriptions_for_entities(f"{entity_description_from_triples}"), f"{row["name"]}", False) - ).replace("\n\n", "\n") - graph.rdf_graph.add((row["entity"], graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) + + if enable_parallel and self.max_workers > 1: + print(f"Using parallel processing with {self.max_workers} workers for entity descriptions") + + # Max workers to avoid rate limits + with ThreadPoolExecutor(max_workers=min(6, self.max_workers)) as executor: + futures = {executor.submit(self._generate_entity_description_parallel, row, graph): index for index, row in entities.iterrows()} + + for future in tqdm(as_completed(futures, timeout=600), total=len(futures), desc="Summarizing entities: "): + try: + entity_uri, description = future.result(timeout=120) + graph.rdf_graph.add((entity_uri, graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) + except Exception as e: + index = futures[future] + print(f"\n[Error] Entity {index} description failed: {str(e)}") + else: + # Sequential processing (original behavior) + for index, row in tqdm(list(entities.iterrows()), desc="Summarizing entities: "): + # Types + types = graph.get_types(row["entity"]) + entity_description_from_triples = '\n'.join(f'{row["name"]} is {type["name"]}.' for _, type in types.iterrows()) + # Descriptions + entity_description_from_triples += "\n".join(graph.get_entity_triples(row["entity"])["description"]) + + description = self._strip_quotes( + self.llm_handler.generate_response( + extract_descriptions_for_entities(f"{entity_description_from_triples}"), f"{row["name"]}", False) + ).replace("\n\n", "\n") + graph.rdf_graph.add((row["entity"], graph.ONTO.hasDescription, Literal(description, datatype=XSD.string))) graph.save_to_file(os.path.join(path, f"{file_name}.ttl")) # Save @@ -664,33 +931,85 @@ def to_keep(s1, s2): if not load_cached_embeddings: - # Entities - #collection_entities.delete(ids=collection_entities.get()["ids"]) - entities = graph.get_entities() - for index, row in tqdm(list(entities.iterrows()), desc="Embedding entities: "): - emb = self.embeddings.embed_query(row["name"]) - collection_entities.add(ids=[row["entity"]], embeddings=[emb]) - - # Types - #collection_types.delete(ids=collection_types.get()["ids"]) - types = graph.get_types() - for index, row in tqdm(list(types.iterrows()), desc="Embedding types: "): - emb = self.embeddings.embed_query(row["name"]) - collection_types.add(ids=[row["type"]], embeddings=[emb]) - - # Relationships - #collection_relationships.delete(ids=collection_relationships.get()["ids"]) - relationships = graph.get_relationships() - for index, row in tqdm(list(relationships.iterrows()), desc="Embedding relationships: "): - emb = self.embeddings.embed_query(row["name"]) - collection_relationships.add(ids=[row["relationship"]], embeddings=[emb]) - - # Triples - #collection_triples.delete(ids=collection_triples.get()["ids"]) - triples = graph.get_triples() - for index, row in tqdm(list(triples.iterrows()), desc="Embedding triples: "): - emb = self.embeddings.embed_query(row["description"]) - collection_triples.add(ids=[row["triple"]], embeddings=[emb]) + if enable_parallel and self.max_workers > 1: + print(f"Using parallel processing with {self.max_workers} workers for embeddings") + + # Entities + entities = 
graph.get_entities() + entity_rows = [{"entity": row["entity"], "name": row["name"]} for _, row in entities.iterrows()] + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(self._embed_item_parallel, item, "entity"): item for item in entity_rows} + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc="Embedding entities: "): + try: + entity_id, emb = future.result(timeout=30) + collection_entities.add(ids=[entity_id], embeddings=[emb]) + except Exception as e: + item = futures[future] + print(f"\n[Error] Embedding entity {item.get('entity', 'unknown')} failed: {str(e)}") + + # Types + types = graph.get_types() + type_rows = [{"type": row["type"], "name": row["name"]} for _, row in types.iterrows()] + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(self._embed_item_parallel, item, "type"): item for item in type_rows} + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc="Embedding types: "): + try: + type_id, emb = future.result(timeout=30) + collection_types.add(ids=[type_id], embeddings=[emb]) + except Exception as e: + item = futures[future] + print(f"\n[Error] Embedding type {item.get('type', 'unknown')} failed: {str(e)}") + + # Relationships + relationships = graph.get_relationships() + rel_rows = [{"relationship": row["relationship"], "name": row["name"]} for _, row in relationships.iterrows()] + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(self._embed_item_parallel, item, "relationship"): item for item in rel_rows} + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc="Embedding relationships: "): + try: + rel_id, emb = future.result(timeout=30) + collection_relationships.add(ids=[rel_id], embeddings=[emb]) + except Exception as e: + item = futures[future] + print(f"\n[Error] Embedding relationship {item.get('relationship', 'unknown')} failed: {str(e)}") + + # Triples + triples = graph.get_triples() + triple_rows = [{"triple": row["triple"], "description": row["description"]} for _, row in triples.iterrows()] + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(self._embed_item_parallel, item, "triple"): item for item in triple_rows} + for future in tqdm(as_completed(futures, timeout=300), total=len(futures), desc="Embedding triples: "): + try: + triple_id, emb = future.result(timeout=30) + collection_triples.add(ids=[triple_id], embeddings=[emb]) + except Exception as e: + item = futures[future] + print(f"\n[Error] Embedding triple {item.get('triple', 'unknown')} failed: {str(e)}") + else: + # Sequential processing (original behavior) + # Entities + entities = graph.get_entities() + for index, row in tqdm(list(entities.iterrows()), desc="Embedding entities: "): + emb = self.embeddings.embed_query(row["name"]) + collection_entities.add(ids=[row["entity"]], embeddings=[emb]) + + # Types + types = graph.get_types() + for index, row in tqdm(list(types.iterrows()), desc="Embedding types: "): + emb = self.embeddings.embed_query(row["name"]) + collection_types.add(ids=[row["type"]], embeddings=[emb]) + + # Relationships + relationships = graph.get_relationships() + for index, row in tqdm(list(relationships.iterrows()), desc="Embedding relationships: "): + emb = self.embeddings.embed_query(row["name"]) + collection_relationships.add(ids=[row["relationship"]], embeddings=[emb]) + + # Triples + triples = graph.get_triples() + 
for index, row in tqdm(list(triples.iterrows()), desc="Embedding triples: "): + emb = self.embeddings.embed_query(row["description"]) + collection_triples.add(ids=[row["triple"]], embeddings=[emb]) # Chunks #collection_chunks.delete(ids=collection_chunks.get()["ids"]) diff --git a/knowledge_base/knowledge_manager.py b/knowledge_base/knowledge_manager.py index 02d77da..6dca3c2 100644 --- a/knowledge_base/knowledge_manager.py +++ b/knowledge_base/knowledge_manager.py @@ -51,10 +51,19 @@ def __init__(self, provider: str, model: str, embedding: str, language: str, kno language=None, keep_history=False ) - self.embeddings = init_embeddings( - model=embedding, - provider=provider - ) + + # Initialize embeddings with keep_alive for Ollama + if provider == "ollama": + self.embeddings = init_embeddings( + model=embedding, + provider=provider, + keep_alive=-1 # Keep model loaded indefinitely to prevent crashes + ) + else: + self.embeddings = init_embeddings( + model=embedding, + provider=provider + ) self.language = language self.knowledge_base_path = knowledge_base_path @@ -93,7 +102,7 @@ def user_message(self, message: str, user_type: str, house_type: str, region: st # --- Init --- # Initialize the variables - self.path = os.path.join(os.path.dirname(os.path.realpath(__file__)), self.knowledge_base_path) + self.path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "kbs", self.knowledge_base_path) # Load the RDF graph self.graph = EnergeniusGraph() @@ -124,7 +133,6 @@ def user_message(self, message: str, user_type: str, house_type: str, region: st #print(f"\n\n-----User message-----\n{message}") - # If the question is not well-formed if len(message) < 3: return wrong_answer_prompt(self.language) @@ -459,7 +467,7 @@ def user_message(self, message: str, user_type: str, house_type: str, region: st n_results=30, ) triples = [{"id": id, "distance": distance} for (id, distance) in zip(triples["ids"][0], triples["distances"][0]) if distance < 0.5] if triples else [] - print(f"\n\n-----Triples for {message}-----\n{"\n".join([f"{triple}" for triple in triples])}") + #print(f"\n\n-----Triples for {message}-----\n{"\n".join([f"{triple}" for triple in triples])}") # No triple found: return wrong answer prompt if not triples: diff --git a/knowledge_base/utils/graph_helpers.py b/knowledge_base/utils/graph_helpers.py index a71232b..9fdc21a 100644 --- a/knowledge_base/utils/graph_helpers.py +++ b/knowledge_base/utils/graph_helpers.py @@ -4,7 +4,6 @@ from rdflib import Graph from rdflib.plugins.sparql import prepareQuery -from nltk.corpus import wordnet as wn import re import unicodedata diff --git a/llm/__init__.py b/llm/__init__.py index f9e5bcb..4b5f7ea 100644 --- a/llm/__init__.py +++ b/llm/__init__.py @@ -1,2 +1,2 @@ """LLM module for handling different large language models.""" -from .langchain import LLMHandler +from .langchain import LLMHandler, get_ollama_models, get_ollama_embeddings diff --git a/llm/langchain.py b/llm/langchain.py index 678cd19..dd4a356 100644 --- a/llm/langchain.py +++ b/llm/langchain.py @@ -1,12 +1,78 @@ """LangChain LLM wrapper.""" import os +import requests +from dotenv import load_dotenv from langchain.chat_models import init_chat_model from langchain_core.language_models import BaseChatModel from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage -from private_settings import PRIVATE_SETINGS +# Load environment variables from .env file +load_dotenv() + + +def get_ollama_models(base_url: str = None, timeout: int = 5) -> list[str]: + 
"""Fetch available Ollama chat models from the API (excludes embedding models). + + Args: + base_url (str, optional): The base URL of the Ollama server. + If None, uses PRIVATE_SETINGS["LLM_BASE_URL"]. + timeout (int, optional): Request timeout in seconds. Defaults to 5. + + Returns: + list[str]: List of available chat model names, or error messages if request fails. + """ + try: + if base_url is None: + base_url = os.getenv("LLM_BASE_URL", "http://localhost:11434") + # Remove trailing slash if present + base_url = base_url.rstrip('/') + response = requests.get(f"{base_url}/api/tags", timeout=timeout) + if response.status_code == 200: + models_data = response.json() + # Filter out embedding models based on model family metadata + embedding_families = ["bert", "nomic-bert"] + chat_models = [ + model["name"] for model in models_data.get("models", []) + if model.get("details", {}).get("family", "").lower() not in embedding_families + ] + return chat_models if chat_models else ["No chat models found"] + else: + return ["Error: Could not connect to Ollama"] + except Exception as e: + return [f"Error: {str(e)}"] + + +def get_ollama_embeddings(base_url: str = None, timeout: int = 5) -> list[str]: + """Fetch available Ollama embedding models from the API. + + Args: + base_url (str, optional): The base URL of the Ollama server. + If None, uses PRIVATE_SETINGS["LLM_BASE_URL"]. + timeout (int, optional): Request timeout in seconds. Defaults to 5. + + Returns: + list[str]: List of available embedding model names, or fallback defaults if request fails. + """ + try: + if base_url is None: + base_url = os.getenv("LLM_BASE_URL", "http://localhost:11434") + base_url = base_url.rstrip('/') + response = requests.get(f"{base_url}/api/tags", timeout=timeout) + if response.status_code == 200: + models_data = response.json() + # Filter for embedding models based on model family metadata + embedding_families = ["bert", "nomic-bert"] + embeddings = [ + model["name"] for model in models_data.get("models", []) + if model.get("details", {}).get("family", "").lower() in embedding_families + ] + return embeddings if embeddings else ["mxbai-embed-large"] + else: + return ["mxbai-embed-large", "nomic-embed-text"] # Fallback defaults + except Exception: + return ["mxbai-embed-large", "nomic-embed-text"] # Fallback defaults class LLMHandler: @@ -43,19 +109,31 @@ def __init__( self.__env_creation(provider) # creation of the model - if not PRIVATE_SETINGS["LLM_LOCAL"]: + llm_local = os.getenv("LLM_LOCAL", "false").lower() == "true" + if not llm_local: self.model = init_chat_model( model=model, model_provider=provider, temperature=temperature, ) else: - self.model = init_chat_model( - model=model, - model_provider=provider, - temperature=temperature, - base_url=PRIVATE_SETINGS["LLM_BASE_URL"], - ) + # For local models (like Ollama), add keep_alive parameter + llm_base_url = os.getenv("LLM_BASE_URL", "http://localhost:11434") + if provider == "ollama": + self.model = init_chat_model( + model=model, + model_provider=provider, + temperature=temperature, + base_url=llm_base_url, + keep_alive=-1, # Keep model loaded indefinitely to prevent crashes + ) + else: + self.model = init_chat_model( + model=model, + model_provider=provider, + temperature=temperature, + base_url=llm_base_url, + ) def __env_creation(self, provider: str) -> None: """ @@ -67,7 +145,12 @@ def __env_creation(self, provider: str) -> None: """ if provider == "openai": - os.environ["OPENAI_API_KEY"] = PRIVATE_SETINGS["LLM_KEY"]["openai"] + # OPENAI_API_KEY should 
already be set in environment or .env file + # Only set it if not already present + if "OPENAI_API_KEY" not in os.environ: + openai_key = os.getenv("OPENAI_API_KEY") + if openai_key: + os.environ["OPENAI_API_KEY"] = openai_key def get_model(self) -> BaseChatModel: """Get the current BaseChatModel model diff --git a/requirements.txt b/requirements.txt index d59b1ae..fbd8a88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ joblib==1.5.0 jsonpatch==1.33 jsonpickle==4.0.5 jsonpointer==3.0.0 +json_repair==0.52.3 jsonschema==4.23.0 jsonschema-specifications==2025.4.1 kiwisolver==1.4.8 @@ -133,3 +134,5 @@ wcwidth==0.2.13 wheel==0.45.1 yarl==1.20.0 zstandard==0.23.0 +pymongo==4.3.3 +pdfminer.six==20221105 \ No newline at end of file diff --git a/run_document_manager.sh b/run_document_manager.sh new file mode 100755 index 0000000..cee5980 --- /dev/null +++ b/run_document_manager.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +cd "$(dirname "$0")" && streamlit run document_manager_ui.py --server.port 8501 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..2c3547f --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,3 @@ +"""Document Manager - Core modules for MongoDB document management.""" + +__version__ = "1.0.0" diff --git a/src/chat_manager.py b/src/chat_manager.py new file mode 100644 index 0000000..cecc6f4 --- /dev/null +++ b/src/chat_manager.py @@ -0,0 +1,70 @@ +"""Chat management functions - upload, download, and session handling.""" + +import json +from io import StringIO +import streamlit as st + + +def prepare_chat_download_data(messages): + """Prepare chat messages for download as JSON.""" + return json.dumps(messages, separators=(",", ": ")) + + +def process_chat_upload(uploaded_file): + """ + Process uploaded chat file and return messages. + + Returns: + tuple: (messages_list, error_message) + """ + if uploaded_file is None: + return None, None + + try: + # Convert to string based IO + stringio = StringIO(uploaded_file.getvalue().decode("utf-8")) + string_data = stringio.read() + + # Convert into JSON + chat_data = json.loads(string_data) + + # Validate format + if not isinstance(chat_data, list): + return None, "Invalid chat file format: expected a list of messages." + + # Validate each message + for message in chat_data: + if (not isinstance(message, dict) + or "role" not in message + or "content" not in message): + return None, "Invalid chat file format: each message must have 'role' and 'content'." 
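+            # A valid upload is a JSON array of objects such as
+            # [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
+            # (illustrative example; only the "role" and "content" keys are checked here).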
+ + return chat_data, None + + except json.JSONDecodeError as e: + return None, f"Error decoding JSON: {e}" + except Exception as e: + return None, f"Error processing file: {e}" + + +def initialize_chat_messages(): + """Initialize chat messages in session state if not present.""" + if "messages" not in st.session_state: + st.session_state["messages"] = [] + + +def clear_chat_messages(): + """Clear all chat messages from session state.""" + st.session_state.pop("messages", None) + + +def add_message(role, content): + """Add a message to the chat history.""" + if "messages" not in st.session_state: + st.session_state["messages"] = [] + st.session_state["messages"].append({"role": role, "content": content}) + + +def get_chat_messages(): + """Get all chat messages from session state.""" + return st.session_state.get("messages", []) diff --git a/src/database.py b/src/database.py new file mode 100644 index 0000000..4459bd3 --- /dev/null +++ b/src/database.py @@ -0,0 +1,32 @@ +"""MongoDB database connection and configuration.""" + +import os +import streamlit as st +from pymongo import MongoClient +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# MongoDB configuration +MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017") +MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "energenius") +MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION", "documents") + + +@st.cache_resource +def get_mongodb_client(): + """Get MongoDB client connection.""" + return MongoClient(MONGODB_URI) + + +def get_db(): + """Get MongoDB database.""" + client = get_mongodb_client() + return client[MONGODB_DATABASE] + + +def get_collection(): + """Get MongoDB collection.""" + db = get_db() + return db[MONGODB_COLLECTION] diff --git a/src/document_manager.py b/src/document_manager.py new file mode 100644 index 0000000..1d2428e --- /dev/null +++ b/src/document_manager.py @@ -0,0 +1,143 @@ +"""Document management functions for MongoDB - PDFs and URLs.""" + +from datetime import datetime +from bson.objectid import ObjectId +from .database import get_collection +from .utils import calculate_file_hash + + +def upload_pdf_to_mongodb(domain_name, pdf_file, description="", tags=None): + """Upload a PDF file to MongoDB.""" + collection = get_collection() + + # Read PDF content + pdf_content = pdf_file.read() + + # Calculate content hash for duplicate detection + content_hash = calculate_file_hash(pdf_content) + + document_doc = { + "type": "document", + "domain": domain_name, + "doc_type": "pdf", + "filename": pdf_file.name, + "description": description, + "tags": tags or [], + "content": pdf_content, + "content_hash": content_hash, + "size": len(pdf_content), + "uploaded_at": datetime.now(), + "updated_at": datetime.now() + } + + result = collection.insert_one(document_doc) + return result.inserted_id + + +def add_url_to_mongodb(domain_name, url, title="", description="", tags=None): + """Add a URL to MongoDB.""" + collection = get_collection() + + document_doc = { + "type": "document", + "domain": domain_name, + "doc_type": "url", + "url": url, + "title": title or url, + "description": description, + "tags": tags or [], + "uploaded_at": datetime.now(), + "updated_at": datetime.now() + } + + result = collection.insert_one(document_doc) + return result.inserted_id + + +def get_documents_by_domain(domain_name): + """Get all documents for a specific domain.""" + collection = get_collection() + documents = list(collection.find({"type": "document", "domain": domain_name}).sort("uploaded_at", 
-1)) + return documents + + +def delete_document(document_id): + """Delete a document from MongoDB.""" + collection = get_collection() + collection.delete_one({"_id": ObjectId(document_id)}) + + +def download_pdf_from_mongodb(document_id): + """Retrieve a PDF document from MongoDB.""" + collection = get_collection() + + document = collection.find_one({"_id": ObjectId(document_id), "doc_type": "pdf"}) + + if document and "content" in document: + return { + "content": document["content"], + "filename": document.get("filename", "document.pdf") + } + return None + + +def check_duplicate_pdf(domain_name, filename, file_content): + """Check if a PDF already exists in the domain by filename or content hash.""" + collection = get_collection() + content_hash = calculate_file_hash(file_content) + + # Check by filename + filename_match = collection.find_one({ + "type": "document", + "domain": domain_name, + "doc_type": "pdf", + "filename": filename + }) + + # Check by content hash + hash_match = collection.find_one({ + "type": "document", + "domain": domain_name, + "doc_type": "pdf", + "content_hash": content_hash + }) + + return { + "is_duplicate": bool(filename_match or hash_match), + "duplicate_type": "filename" if filename_match else ("content" if hash_match else None), + "existing_doc": filename_match or hash_match, + "content_hash": content_hash + } + + +def check_duplicate_url(domain_name, url): + """Check if a URL already exists in the domain.""" + collection = get_collection() + + existing = collection.find_one({ + "type": "document", + "domain": domain_name, + "doc_type": "url", + "url": url + }) + + return { + "is_duplicate": bool(existing), + "existing_doc": existing + } + + +def get_document_stats(): + """Get statistics about documents.""" + collection = get_collection() + total_domains = collection.count_documents({"type": "domain"}) + total_docs = collection.count_documents({"type": "document"}) + total_pdfs = collection.count_documents({"type": "document", "doc_type": "pdf"}) + total_urls = collection.count_documents({"type": "document", "doc_type": "url"}) + + return { + "total_domains": total_domains, + "total_docs": total_docs, + "total_pdfs": total_pdfs, + "total_urls": total_urls + } diff --git a/src/domain_manager.py b/src/domain_manager.py new file mode 100644 index 0000000..603de5f --- /dev/null +++ b/src/domain_manager.py @@ -0,0 +1,34 @@ +"""Domain management functions for MongoDB.""" + +from datetime import datetime +from .database import get_collection + + +def create_domain(domain_name, description=""): + """Create a new domain in MongoDB.""" + collection = get_collection() + domain_doc = { + "type": "domain", + "name": domain_name, + "description": description, + "created_at": datetime.now(), + "updated_at": datetime.now() + } + result = collection.insert_one(domain_doc) + return result.inserted_id + + +def get_all_domains(): + """Get all domains from MongoDB.""" + collection = get_collection() + domains = list(collection.find({"type": "domain"}).sort("name", 1)) + return domains + + +def delete_domain(domain_name): + """Delete a domain and all its documents.""" + collection = get_collection() + # Delete all documents in the domain + collection.delete_many({"domain": domain_name}) + # Delete the domain itself + collection.delete_one({"type": "domain", "name": domain_name}) diff --git a/src/kb_builder.py b/src/kb_builder.py new file mode 100644 index 0000000..8b57acd --- /dev/null +++ b/src/kb_builder.py @@ -0,0 +1,428 @@ +"""Knowledge Base Builder from MongoDB 
documents.""" + +import os +import gc +import threading +from datetime import datetime +from .database import get_collection +from .document_manager import get_documents_by_domain + +def _close_guru_sessions_for_domain(folder_name, log_progress): + """ + Close any GURU sessions that are using the specified knowledge base folder. + This releases ChromaDB file locks to allow rebuilding. + + Args: + folder_name (str): The knowledge base folder name (e.g., "files_Italy") + log_progress (callable): Logging callback + """ + import streamlit as st + + # Check if there's an active orchestrator in the session state + if "orchestrator" in st.session_state: + orchestrator = st.session_state["orchestrator"] + + # Check if the orchestrator's guru is using this domain's KB + if hasattr(orchestrator, 'guru') and hasattr(orchestrator.guru, 'know_base'): + kb = orchestrator.guru.know_base + if hasattr(kb, 'knowledge_base_path') and kb.knowledge_base_path == folder_name: + log_progress(f"Found active GURU session using {folder_name}") + + # Close ChromaDB client if it exists + if hasattr(kb, 'chromadbClient') and kb.chromadbClient is not None: + try: + log_progress("Closing ChromaDB connection...") + # ChromaDB doesn't have an explicit close method, but we can clear the reference + kb.chromadbClient = None + log_progress("ChromaDB connection cleared") + except Exception as e: + log_progress(f"Note: {str(e)}") + + # Clear the orchestrator to force reinitialization + log_progress("Clearing orchestrator from session...") + del st.session_state["orchestrator"] + log_progress("Orchestrator cleared successfully") + + # Force garbage collection to release file handles + gc.collect() + log_progress("Garbage collection completed") + + # Give the system a moment to fully release file handles + import time + time.sleep(1) + log_progress("Ready to proceed with KB creation") + + +def _build_kb_worker(domain_name, provider, model, embedding, max_workers, enable_parallel): + """ + Worker function that runs in a separate thread to build KB. + This allows the process to continue even if user switches tabs in Streamlit. 
+ """ + from knowledge_base import KnowledgeExtractor + + collection = get_collection() + temp_pdf_files = [] + + try: + print(f"[Thread] Starting KB creation for domain: {domain_name}") + + # Get all documents for the domain + documents = get_documents_by_domain(domain_name) + + # Separate URLs and PDFs + urls = [] + pdf_count = 0 + + for doc in documents: + if doc.get("doc_type") == "url": + urls.append(doc.get("url")) + elif doc.get("doc_type") == "pdf": + pdf_count += 1 + try: + pdf_content = doc.get("content") + if pdf_content: + kb_temp_dir = os.path.join( + os.path.dirname(__file__), + "..", + "knowledge_base", + "kbs", + "temp_pdfs" + ) + os.makedirs(kb_temp_dir, exist_ok=True) + + import hashlib + safe_filename = doc.get('filename', 'document.pdf').replace(' ', '_') + file_hash = hashlib.md5(pdf_content[:1024]).hexdigest()[:8] + temp_filename = f"{file_hash}_{safe_filename}" + temp_path = os.path.join(kb_temp_dir, temp_filename) + + with open(temp_path, 'wb') as f: + f.write(pdf_content) + + urls.append(temp_path) + temp_pdf_files.append(temp_path) + print(f"[Thread] Saved PDF '{doc.get('filename')}'") + except Exception as e: + print(f"[Thread] Warning: Could not process PDF '{doc.get('filename')}': {str(e)}") + + print(f"[Thread] Processing {len(urls)} documents") + + # Initialize Knowledge Extractor with parallel processing options + ke = KnowledgeExtractor(provider, model, embedding, max_workers=max_workers) + folder_name = f"files_{domain_name}" + + # Run the knowledge extraction + ke.run( + folder=folder_name, + file_name="rdf_graph", + html_links=urls, + load_cached_docs=False, + load_cached_preprocessed_chunks=False, + enable_parallel=enable_parallel + ) + + print(f"[Thread] KB creation completed for domain '{domain_name}'") + + # Update status to completed + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": {"status": "completed", "completed_at": datetime.now()}} + ) + print(f"[Thread] Status updated to completed") + + except Exception as e: + print(f"[Thread] Error creating KB: {str(e)}") + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": {"status": "error", "error_message": str(e), "error_at": datetime.now()}} + ) + finally: + # Clean up temporary PDF files + for temp_file in temp_pdf_files: + try: + if os.path.exists(temp_file): + os.remove(temp_file) + print(f"[Thread] Removed temp file: {os.path.basename(temp_file)}") + except Exception as e: + print(f"[Thread] Warning: Could not remove temp file: {str(e)}") + + +def build_knowledge_base_for_domain(domain_name, provider="ollama", model=None, embedding=None, progress_callback=None, use_background_thread=False, max_workers=4, enable_parallel=True): + """ + Build a knowledge base from all documents in a MongoDB domain. 
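+    URLs are handed to the extractor as-is; PDFs stored in MongoDB are first
+    written to knowledge_base/kbs/temp_pdfs and removed again when the build
+    finishes or fails.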
+ + Args: + domain_name (str): Name of the domain to build KB from + provider (str): LLM provider ("openai" or "ollama") + model (str): Model name (optional, will use defaults) + embedding (str): Embedding model name (optional, will use defaults) + progress_callback (callable): Optional callback function to report progress + use_background_thread (bool): Whether to run in background thread + max_workers (int): Maximum number of parallel workers (default: 4) + enable_parallel (bool): Whether to enable parallel processing (default: True) + + Returns: + dict: Result dictionary with status and message + """ + from knowledge_base import KnowledgeExtractor + + def log_progress(message): + """Helper to log progress.""" + if progress_callback: + progress_callback(message) + print(message) + + try: + log_progress(f"Starting KB creation for domain: {domain_name}") + + # Get all documents for the domain + documents = get_documents_by_domain(domain_name) + + if not documents: + return { + "status": "error", + "message": f"No documents found in domain '{domain_name}'" + } + + # Separate URLs and PDFs, save MongoDB PDFs to temp files + urls = [] + temp_pdf_files = [] + pdf_count = 0 + + for doc in documents: + if doc.get("doc_type") == "url": + urls.append(doc.get("url")) + elif doc.get("doc_type") == "pdf": + pdf_count += 1 + # Save PDF from MongoDB to temporary file with http-accessible path + try: + pdf_content = doc.get("content") + if pdf_content: + # Create temp directory in the KB folder for PDFs + kb_temp_dir = os.path.join( + os.path.dirname(__file__), + "..", + "knowledge_base", + "kbs", + "temp_pdfs" + ) + os.makedirs(kb_temp_dir, exist_ok=True) + + # Create a unique filename + import hashlib + safe_filename = doc.get('filename', 'document.pdf').replace(' ', '_') + file_hash = hashlib.md5(pdf_content[:1024]).hexdigest()[:8] + temp_filename = f"{file_hash}_{safe_filename}" + temp_path = os.path.join(kb_temp_dir, temp_filename) + + # Write PDF to file + with open(temp_path, 'wb') as f: + f.write(pdf_content) + + # Add the absolute file path as a URL (KnowledgeExtractor will handle it) + urls.append(temp_path) + temp_pdf_files.append(temp_path) + log_progress(f"Saved PDF '{doc.get('filename')}' to {temp_filename}") + except Exception as e: + log_progress(f"Warning: Could not process PDF '{doc.get('filename')}': {str(e)}") + + log_progress(f"Found {len(urls) - len(temp_pdf_files)} URLs and {pdf_count} PDFs ({len(temp_pdf_files)} successfully prepared)") + + # Set default models based on provider + if model is None: + if provider == "ollama": + model = "gemma3:12b-it-qat" + else: + model = "gpt-4" + + if embedding is None: + if provider == "ollama": + embedding = "mxbai-embed-large" + else: + embedding = "text-embedding-3-small" + + log_progress(f"Using provider: {provider}, model: {model}, embedding: {embedding}") + + # Close any existing GURU sessions using this domain's KB to avoid ChromaDB locking + log_progress("Checking for active GURU sessions with this domain...") + folder_name = f"files_{domain_name}" + _close_guru_sessions_for_domain(folder_name, log_progress) + + # Store KB metadata in MongoDB BEFORE starting the long process + # This way, even if the process is interrupted, the metadata exists + collection = get_collection() + kb_doc = { + "type": "knowledge_base", + "domain": domain_name, + "folder": folder_name, + "file_name": "rdf_graph", + "provider": provider, + "model": model, + "embedding": embedding, + "created_at": datetime.now(), + "document_count": len(documents), + 
"url_count": len(urls), + "pdf_count": pdf_count, + "status": "creating" # Mark as in-progress + } + + # Save metadata immediately + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": kb_doc}, + upsert=True + ) + log_progress("Metadata saved to database") + + if not urls: + # Update status to error + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": {"status": "error", "error_message": "No documents found to process"}} + ) + return { + "status": "error", + "message": "No documents found to process in this domain." + } + + # If background thread requested, start worker and return immediately + if use_background_thread: + log_progress("Starting background thread for KB creation...") + thread = threading.Thread( + target=_build_kb_worker, + args=(domain_name, provider, model, embedding, max_workers, enable_parallel), + daemon=True + ) + thread.start() + log_progress("Background thread started. You can safely switch tabs.") + + return { + "status": "success", + "message": f"Knowledge base creation started in background for domain '{domain_name}'. Check back later for status.", + "folder": folder_name, + "url_count": len(urls), + "pdf_count": pdf_count, + "background": True + } + + # Otherwise, run synchronously (original behavior) + # Initialize Knowledge Extractor with parallel processing options + ke = KnowledgeExtractor(provider, model, embedding, max_workers=max_workers) + + # Create folder for this domain's KB + file_name = "rdf_graph" + + # Run the knowledge extraction + try: + ke.run( + folder=folder_name, + file_name=file_name, + html_links=urls, + load_cached_docs=False, + load_cached_preprocessed_chunks=False, + enable_parallel=enable_parallel + ) + + + # Update status to completed + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": {"status": "completed", "completed_at": datetime.now()}} + ) + log_progress("Status updated to completed") + finally: + # Clean up temporary PDF files + if temp_pdf_files: + for temp_file in temp_pdf_files: + try: + if os.path.exists(temp_file): + os.remove(temp_file) + except Exception as e: + log_progress(f"Warning: Could not remove temp file {temp_file}: {str(e)}") + + return { + "status": "success", + "message": f"Knowledge base created successfully for domain '{domain_name}'", + "folder": folder_name, + "url_count": len(urls), + "pdf_count": pdf_count + } + + except Exception as e: + error_msg = f"Error creating knowledge base: {str(e)}" + log_progress(error_msg) + + # Update status to error in MongoDB + try: + collection = get_collection() + collection.update_one( + {"type": "knowledge_base", "domain": domain_name}, + {"$set": {"status": "error", "error_message": str(e), "error_at": datetime.now()}} + ) + except Exception: + pass # Don't fail if we can't update the error status + + # Clean up temporary PDF files even on error + if 'temp_pdf_files' in locals() and temp_pdf_files: + log_progress("Cleaning up temporary PDF files after error...") + for temp_file in temp_pdf_files: + try: + if os.path.exists(temp_file): + os.remove(temp_file) + except Exception: + pass # Ignore cleanup errors + + return { + "status": "error", + "message": error_msg + } + + +def get_knowledge_base_info(domain_name): + """ + Get information about a knowledge base for a domain. 
+ + Args: + domain_name (str): Name of the domain + + Returns: + dict: KB information or None if not found + """ + collection = get_collection() + kb_info = collection.find_one({"type": "knowledge_base", "domain": domain_name}) + return kb_info + + +def delete_knowledge_base(domain_name): + """ + Delete a knowledge base for a domain. + + Args: + domain_name (str): Name of the domain + + Returns: + bool: True if deleted successfully + """ + import shutil + + collection = get_collection() + kb_info = get_knowledge_base_info(domain_name) + + if kb_info: + # Delete folder + folder_path = os.path.join( + os.path.dirname(__file__), + "..", + "knowledge_base", + "kbs", + kb_info.get("folder") + ) + + if os.path.exists(folder_path): + shutil.rmtree(folder_path) + + # Delete metadata + collection.delete_one({"type": "knowledge_base", "domain": domain_name}) + return True + + return False diff --git a/src/orchestrator_manager.py b/src/orchestrator_manager.py new file mode 100644 index 0000000..35181e4 --- /dev/null +++ b/src/orchestrator_manager.py @@ -0,0 +1,34 @@ +"""Orchestrator management functions for Energenius GURU.""" + +import streamlit as st + +from orchestrator import LiveOrchestrator +from llm import get_ollama_models, get_ollama_embeddings + + +@st.cache_data(ttl=60) +def cached_get_ollama_models(): + """Cached wrapper for getting Ollama models.""" + return get_ollama_models() + + +@st.cache_data(ttl=60) +def cached_get_ollama_embeddings(): + """Cached wrapper for getting Ollama embeddings.""" + return get_ollama_embeddings() + + +def initialize_orchestrator(provider, model, embedding, language, temperature, + user_type, house_type, region, use_knowledge_base): + """Initialize the LiveOrchestrator with given parameters.""" + return LiveOrchestrator( + provider=provider, + model=model, + embedding=embedding, + language=language, + temperature=temperature, + user_type=user_type, + house_type=house_type, + region=region, + use_knowledge=use_knowledge_base, + ) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..2b0812a --- /dev/null +++ b/src/utils.py @@ -0,0 +1,8 @@ +"""Utility functions for document management.""" + +import hashlib + + +def calculate_file_hash(file_content): + """Calculate SHA256 hash of file content.""" + return hashlib.sha256(file_content).hexdigest() diff --git a/streamlit_ui.py b/streamlit_ui.py index ef6c6b8..2777a14 100644 --- a/streamlit_ui.py +++ b/streamlit_ui.py @@ -6,6 +6,7 @@ import streamlit as st from orchestrator import LiveOrchestrator +from llm import get_ollama_models, get_ollama_embeddings # give title to the page st.title("Energenius GURU") @@ -19,17 +20,48 @@ if "messages" not in st.session_state: st.session_state["messages"] = [] +# Cached wrappers for Ollama model fetching +@st.cache_data(ttl=60) +def cached_get_ollama_models(): + """Cached wrapper for getting Ollama models.""" + return get_ollama_models() + +@st.cache_data(ttl=60) +def cached_get_ollama_embeddings(): + """Cached wrapper for getting Ollama embeddings.""" + return get_ollama_embeddings() + # create sidebar to adjust parameters st.sidebar.title("Model Parameters") provider = st.sidebar.selectbox("Provider", ["openai", "ollama"], index=1) if provider == "ollama": - model = st.sidebar.selectbox("Model", ["gpt-oss:120b", "gpt-oss:20b", "llama3.2", "mistral"], index=0) - embedding = st.sidebar.selectbox( - "Embedding", ["mxbai-embed-large", "nomic-embed-text"], index=0 - ) + # Add refresh button for model list + col1, col2 = st.sidebar.columns([4, 1]) + with 
col1: + st.write("") # Spacing + with col2: + if st.button("πŸ”„", help="Refresh model list"): + st.cache_data.clear() + st.rerun() + + # Get available models dynamically + available_models = cached_get_ollama_models() + available_embeddings = cached_get_ollama_embeddings() + + # Try to find a default model, otherwise use the first one + default_model_index = 0 + + model = st.sidebar.selectbox("Model", available_models, index=default_model_index) + + # Try to find a default embedding, otherwise use the first one + default_embedding_index = 0 + if "mxbai-embed-large" in available_embeddings: + default_embedding_index = available_embeddings.index("mxbai-embed-large") + + embedding = st.sidebar.selectbox("Embedding", available_embeddings, index=default_embedding_index) elif provider == "openai": - model = st.sidebar.selectbox("Model", ["gpt-3.5-turbo", "gpt-4", "ollama"], index=1) + model = st.sidebar.selectbox("Model", ["gpt-3.5-turbo", "gpt-4"], index=1) embedding = st.sidebar.selectbox( "Embedding", ["text-embedding-3-small", "text-embedding-3-large"], index=0 )
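
As a minimal sketch (assuming a running local Ollama server and the LLM_BASE_URL
variable introduced above), the new discovery helpers can also be exercised
outside Streamlit; the model names in the comments are purely illustrative:

    import os
    from llm import get_ollama_models, get_ollama_embeddings

    # Falls back to http://localhost:11434 when LLM_BASE_URL is unset
    os.environ.setdefault("LLM_BASE_URL", "http://localhost:11434")

    chat_models = get_ollama_models()            # e.g. ["gpt-oss:20b", "llama3.2"]
    embedding_models = get_ollama_embeddings()   # e.g. ["mxbai-embed-large"]

    print("Chat models:", chat_models)
    print("Embedding models:", embedding_models)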