This repository was archived by the owner on Mar 26, 2025. It is now read-only.

Commit 7e6386a

Merge pull request #655 from openchatai/feat/neural_search
Feat/neural search
2 parents d683835 + a11d727

File tree

5 files changed: +77 additions, -26 deletions

  llm-server/routes/search/search_controller.py
  llm-server/routes/search/search_service.py
  llm-server/utils/llm_consts.py
  llm-server/workers/tasks/url_parsers.py
  llm-server/workers/tasks/web_crawl.py


llm-server/routes/search/search_controller.py

Lines changed: 31 additions & 0 deletions
@@ -3,6 +3,8 @@
 from utils.get_logger import CustomLogger
 from utils.llm_consts import VectorCollections, initialize_qdrant_client
 from qdrant_client import models  # Add this line
+from routes.search.search_service import weighted_search
+from pydantic import BaseModel

 search_workflow = Blueprint("search", __name__)

@@ -43,3 +45,32 @@ def search_vector_store(chatbot_id: str):
     results = get_all_results(chatbot_id, keyword)

     return jsonify(results), 201
+
+
+class WeightedSearchRequest(BaseModel):
+    query: str
+    title_weight: float = 0.7
+    description_weight: float = 0.3
+
+
+@search_workflow.route("/cmd_bar/<chatbot_id>", methods=["POST"])
+def get_cmdbar_data(chatbot_id: str):
+    try:
+        request_data = WeightedSearchRequest(
+            **request.get_json()
+        )  # Assuming you have a class to parse data
+        scored_points = weighted_search(
+            chatbot_id,
+            request_data.query,
+            request_data.title_weight,
+            request_data.description_weight,
+        )
+        return (
+            jsonify([sp.model_dump() for sp in scored_points]),
+            200,
+        )
+
+    except ValueError as e:  # Example of handling a potential error
+        return jsonify({"error": str(e)}), 400  # Bad request
+    except Exception as e:
+        return jsonify({"error": "Internal server error"}), 500
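
The new route accepts a JSON body matching WeightedSearchRequest and returns the serialized ScoredPoint list. A minimal client call might look like the sketch below; the host, port, and the /search URL prefix are assumptions, since the blueprint mount point is not part of this diff:

import requests

# Hypothetical base URL; host, port, and blueprint prefix are not shown in this diff.
url = "http://localhost:8002/search/cmd_bar/my-chatbot-id"

payload = {
    "query": "how do I reset my password",
    # Optional: omit these to use the defaults of 0.7 and 0.3.
    "title_weight": 0.7,
    "description_weight": 0.3,
}

response = requests.post(url, json=payload)
print(response.status_code)  # 200 on success, 400 if the body fails validation
print(response.json())       # list of serialized ScoredPoint objects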

llm-server/routes/search/search_service.py

Lines changed: 25 additions & 3 deletions
@@ -5,6 +5,7 @@
 from typing import Dict, List, Optional
 import operator
 from copy import deepcopy
+from utils.llm_consts import ENABLE_NEURAL_SEARCH

 client = initialize_qdrant_client()
 embedding = get_embeddings()
@@ -64,23 +65,44 @@ def add_cmdbar_data(items: List[Item], metadata: Dict[str, str]) -> None:

 # Function to search with weights
 def weighted_search(
-    query: str, title_weight: float = 0.7, description_weight: float = 0.3
+    chatbot_id: str,
+    query: str,
+    title_weight: float = 0.7,
+    description_weight: float = 0.3,
 ) -> List[models.ScoredPoint]:
     query_embedding = embedding.embed_query(query)

     # Search title and descriptions
     title_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="title", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=str(chatbot_id)),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )

     description_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="description", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=chatbot_id),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )

     # Build a lookup for description results
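
The hunk ends just before the score-merging code, which this commit does not touch. As a rough sketch only, a weighted merge of the two result sets could look like the snippet below; the helper name and exact tie-handling are assumptions, not the repository's implementation:

from collections import defaultdict

def merge_weighted(title_results, description_results,
                   title_weight=0.7, description_weight=0.3):
    # Assumed helper, not part of this commit: combine the per-point scores from
    # the "title" and "description" searches into a single weighted ranking.
    combined = defaultdict(float)
    points = {}
    for sp in title_results:
        combined[sp.id] += title_weight * sp.score
        points[sp.id] = sp
    for sp in description_results:
        combined[sp.id] += description_weight * sp.score
        points.setdefault(sp.id, sp)
    # Highest combined score first
    return sorted(points.values(), key=lambda sp: combined[sp.id], reverse=True)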

llm-server/utils/llm_consts.py

Lines changed: 2 additions & 0 deletions
@@ -121,3 +121,5 @@ def get_mysql_uri():
 )

 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "YOURSUPERSECRETKEY")
+
+ENABLE_NEURAL_SEARCH = os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES"
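
The comparison is case-sensitive: the feature stays disabled unless the environment variable is exactly the string "YES". A quick illustration:

import os

os.environ["ENABLE_NEURAL_SEARCH"] = "yes"   # lowercase, so still disabled
print(os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES")  # False

os.environ["ENABLE_NEURAL_SEARCH"] = "YES"
print(os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES")  # True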

llm-server/workers/tasks/url_parsers.py

Lines changed: 15 additions & 17 deletions
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
     def find_all_headings_and_highlights(
         self, content: str
     ) -> Tuple[str, List[Tuple[str, str]]]:
-        soup = BeautifulSoup(content, "lxml")
-        title_tag = soup.title
-        title = ""
-        if title_tag is not None:
-            title = title_tag.get_text(strip=True)
-
-        headings: List[Tuple[str, str]] = []
-
-        for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
-            heading_text = heading.get_text(strip=True)

-            # Check if the heading or one of its children has an 'id' attribute
-            id_tag = heading.find(attrs={"id": True})
-            if id_tag:
-                heading_id = id_tag["id"]
-                headings.append((heading_text, heading_id))
-
-        return title, headings
+        soup = BeautifulSoup(content, "lxml")
+        title = soup.title.text if soup.title else ""
+        elements_with_id = soup.find_all(id=True)
+        links = soup.find_all("a")
+        pairs = []
+        for element in elements_with_id:
+            id_ = element.get("id")
+            if id_:  # A simple check if the id exists
+                corresponding_links = [
+                    link for link in links if link.get("href") == "#" + id_
+                ]  # Removed "./#" prefix
+                if corresponding_links:
+                    for link in corresponding_links:
+                        pairs.append((element.get_text(strip=True), id_))
+        return title, pairs

     def parse_text_content(self, content) -> str:
         text = BeautifulSoup(content, "lxml").get_text()
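
The rewritten helper no longer walks heading tags; it pairs every element that carries an id with anchor links on the same page whose href is exactly "#<id>", and drops ids that nothing links to. A small self-contained illustration of that behaviour, using a made-up HTML snippet:

from bs4 import BeautifulSoup

html = """
<html><head><title>Docs</title></head><body>
  <h2 id="install">Installation</h2>
  <a href="#install">Jump to installation</a>
  <h2 id="usage">Usage</h2>  <!-- no anchor link points here -->
</body></html>
"""

soup = BeautifulSoup(html, "lxml")
title = soup.title.text if soup.title else ""
links = soup.find_all("a")
pairs = []
for element in soup.find_all(id=True):
    id_ = element.get("id")
    # Keep the element only if some <a> on the page targets "#<id>"
    if id_ and any(link.get("href") == "#" + id_ for link in links):
        pairs.append((element.get_text(strip=True), id_))

print(title)  # "Docs"
print(pairs)  # [('Installation', 'install')]; "usage" is dropped, nothing links to it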

llm-server/workers/tasks/web_crawl.py

Lines changed: 4 additions & 6 deletions
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
             for heading_text, heading_id in headings
         ]

-        add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
+        if len(items) > 0:
+            add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
         return parser.parse_text_content(content)
     except ValueError as e:
         # Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
                 chatbot_id=bot_id, url=current_url, status="SUCCESS"
             )

-            # Get links on the current page
-            links = get_links(current_url)
-
-            # Add new links to the queue
-            queue.extend(links)
+            links = get_links(current_url)
+            queue.extend(links)

         except Exception as e:
             logger.error("WEB_SCRAPE_ERROR", error=e)
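
For context, both hunks sit inside the breadth-first crawl loop of scrape_website. A stripped-down sketch of that loop shape (not the actual implementation; the callback names are placeholders) is:

from collections import deque

def crawl(start_url, max_pages, get_links, scrape):
    # Minimal BFS sketch; assumes get_links(url) -> list[str] and scrape(url)
    # does the per-page work. Placeholder for the real scrape_website logic.
    queue = deque([start_url])
    visited = set()
    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)
        try:
            scrape(current_url)
            links = get_links(current_url)
            queue.extend(links)
        except Exception:
            # the real worker logs WEB_SCRAPE_ERROR and keeps going
            continue
    return len(visited)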
