This repository was archived by the owner on Mar 26, 2025. It is now read-only.

Commit 7e6386a

Merge pull request #655 from openchatai/feat/neural_search
Feat/neural search
2 parents d683835 + a11d727

File tree

5 files changed: +77 additions, -26 deletions

  llm-server/routes/search/search_controller.py
  llm-server/routes/search/search_service.py
  llm-server/utils/llm_consts.py
  llm-server/workers/tasks/url_parsers.py
  llm-server/workers/tasks/web_crawl.py


llm-server/routes/search/search_controller.py

Lines changed: 31 additions & 0 deletions
@@ -3,6 +3,8 @@
 from utils.get_logger import CustomLogger
 from utils.llm_consts import VectorCollections, initialize_qdrant_client
 from qdrant_client import models  # Add this line
+from routes.search.search_service import weighted_search
+from pydantic import BaseModel

 search_workflow = Blueprint("search", __name__)

@@ -43,3 +45,32 @@ def search_vector_store(chatbot_id: str):
     results = get_all_results(chatbot_id, keyword)

     return jsonify(results), 201
+
+
+class WeightedSearchRequest(BaseModel):
+    query: str
+    title_weight: float = 0.7
+    description_weight: float = 0.3
+
+
+@search_workflow.route("/cmd_bar/<chatbot_id>", methods=["POST"])
+def get_cmdbar_data(chatbot_id: str):
+    try:
+        request_data = WeightedSearchRequest(
+            **request.get_json()
+        )  # Assuming you have a class to parse data
+        scored_points = weighted_search(
+            chatbot_id,
+            request_data.query,
+            request_data.title_weight,
+            request_data.description_weight,
+        )
+        return (
+            jsonify([sp.model_dump() for sp in scored_points]),
+            200,
+        )
+
+    except ValueError as e:  # Example of handling a potential error
+        return jsonify({"error": str(e)}), 400  # Bad request
+    except Exception as e:
+        return jsonify({"error": "Internal server error"}), 500
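
The new route accepts a JSON body matching WeightedSearchRequest and returns the serialized ScoredPoint list. A minimal client call might look like the sketch below; the host, port, and the /search URL prefix are assumptions, since the blueprint mount point is not part of this diff:

import requests

# Hypothetical base URL; host, port, and blueprint prefix are not shown in this diff.
url = "http://localhost:8002/search/cmd_bar/my-chatbot-id"

payload = {
    "query": "how do I reset my password",
    # Optional: omit these to use the defaults of 0.7 and 0.3.
    "title_weight": 0.7,
    "description_weight": 0.3,
}

response = requests.post(url, json=payload)
print(response.status_code)  # 200 on success, 400 if the body fails validation
print(response.json())       # list of serialized ScoredPoint objects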

llm-server/routes/search/search_service.py

Lines changed: 25 additions & 3 deletions
@@ -5,6 +5,7 @@
 from typing import Dict, List, Optional
 import operator
 from copy import deepcopy
+from utils.llm_consts import ENABLE_NEURAL_SEARCH

 client = initialize_qdrant_client()
 embedding = get_embeddings()
@@ -64,23 +65,44 @@ def add_cmdbar_data(items: List[Item], metadata: Dict[str, str]) -> None:

 # Function to search with weights
 def weighted_search(
-    query: str, title_weight: float = 0.7, description_weight: float = 0.3
+    chatbot_id: str,
+    query: str,
+    title_weight: float = 0.7,
+    description_weight: float = 0.3,
 ) -> List[models.ScoredPoint]:
     query_embedding = embedding.embed_query(query)

     # Search title and descriptions
     title_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="title", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=str(chatbot_id)),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )

     description_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="description", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=chatbot_id),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )

     # Build a lookup for description results
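
The hunk ends just before the score-merging code, which this commit does not touch. As a rough sketch only, a weighted merge of the two result sets could look like the snippet below; the helper name and exact tie-handling are assumptions, not the repository's implementation:

from collections import defaultdict

def merge_weighted(title_results, description_results,
                   title_weight=0.7, description_weight=0.3):
    # Assumed helper, not part of this commit: combine the per-point scores from
    # the "title" and "description" searches into a single weighted ranking.
    combined = defaultdict(float)
    points = {}
    for sp in title_results:
        combined[sp.id] += title_weight * sp.score
        points[sp.id] = sp
    for sp in description_results:
        combined[sp.id] += description_weight * sp.score
        points.setdefault(sp.id, sp)
    # Highest combined score first
    return sorted(points.values(), key=lambda sp: combined[sp.id], reverse=True)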

llm-server/utils/llm_consts.py

Lines changed: 2 additions & 0 deletions
@@ -121,3 +121,5 @@ def get_mysql_uri():
 )

 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "YOURSUPERSECRETKEY")
+
+ENABLE_NEURAL_SEARCH = os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES"
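
The comparison is case-sensitive: the feature stays disabled unless the environment variable is exactly the string "YES". A quick illustration:

import os

os.environ["ENABLE_NEURAL_SEARCH"] = "yes"   # lowercase, so still disabled
print(os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES")  # False

os.environ["ENABLE_NEURAL_SEARCH"] = "YES"
print(os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES")  # True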

llm-server/workers/tasks/url_parsers.py

Lines changed: 15 additions & 17 deletions
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
     def find_all_headings_and_highlights(
         self, content: str
     ) -> Tuple[str, List[Tuple[str, str]]]:
-        soup = BeautifulSoup(content, "lxml")
-        title_tag = soup.title
-        title = ""
-        if title_tag is not None:
-            title = title_tag.get_text(strip=True)
-
-        headings: List[Tuple[str, str]] = []
-
-        for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
-            heading_text = heading.get_text(strip=True)

-            # Check if the heading or one of its children has an 'id' attribute
-            id_tag = heading.find(attrs={"id": True})
-            if id_tag:
-                heading_id = id_tag["id"]
-                headings.append((heading_text, heading_id))
-
-        return title, headings
+        soup = BeautifulSoup(content, "lxml")
+        title = soup.title.text if soup.title else ""
+        elements_with_id = soup.find_all(id=True)
+        links = soup.find_all("a")
+        pairs = []
+        for element in elements_with_id:
+            id_ = element.get("id")
+            if id_:  # A simple check if the id exists
+                corresponding_links = [
+                    link for link in links if link.get("href") == "#" + id_
+                ]  # Removed "./#" prefix
+                if corresponding_links:
+                    for link in corresponding_links:
+                        pairs.append((element.get_text(strip=True), id_))
+        return title, pairs

     def parse_text_content(self, content) -> str:
         text = BeautifulSoup(content, "lxml").get_text()
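
The rewritten helper no longer walks heading tags; it pairs every element that carries an id with anchor links on the same page whose href is exactly "#<id>", and drops ids that nothing links to. A small self-contained illustration of that behaviour, using a made-up HTML snippet:

from bs4 import BeautifulSoup

html = """
<html><head><title>Docs</title></head><body>
  <h2 id="install">Installation</h2>
  <a href="#install">Jump to installation</a>
  <h2 id="usage">Usage</h2>  <!-- no anchor link points here -->
</body></html>
"""

soup = BeautifulSoup(html, "lxml")
title = soup.title.text if soup.title else ""
links = soup.find_all("a")
pairs = []
for element in soup.find_all(id=True):
    id_ = element.get("id")
    # Keep the element only if some <a> on the page targets "#<id>"
    if id_ and any(link.get("href") == "#" + id_ for link in links):
        pairs.append((element.get_text(strip=True), id_))

print(title)  # "Docs"
print(pairs)  # [('Installation', 'install')]; "usage" is dropped, nothing links to it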

llm-server/workers/tasks/web_crawl.py

Lines changed: 4 additions & 6 deletions
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
             for heading_text, heading_id in headings
         ]

-        add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
+        if len(items) > 0:
+            add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
         return parser.parse_text_content(content)
     except ValueError as e:
         # Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
                 chatbot_id=bot_id, url=current_url, status="SUCCESS"
             )

-            # Get links on the current page
-            links = get_links(current_url)
-
-            # Add new links to the queue
-            queue.extend(links)
+            links = get_links(current_url)
+            queue.extend(links)

         except Exception as e:
             logger.error("WEB_SCRAPE_ERROR", error=e)
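
For context, both hunks sit inside the breadth-first crawl loop of scrape_website. A stripped-down sketch of that loop shape (not the actual implementation; the callback names are placeholders) is:

from collections import deque

def crawl(start_url, max_pages, get_links, scrape):
    # Minimal BFS sketch; assumes get_links(url) -> list[str] and scrape(url)
    # does the per-page work. Placeholder for the real scrape_website logic.
    queue = deque([start_url])
    visited = set()
    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)
        try:
            scrape(current_url)
            links = get_links(current_url)
            queue.extend(links)
        except Exception:
            # the real worker logs WEB_SCRAPE_ERROR and keeps going
            continue
    return len(visited)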
