This repository was archived by the owner on Jan 5, 2025. It is now read-only.

Commit 3e5db01

Merge pull request #236 from lvalics/main
Create example.env
2 parents: 8e906d7 + 61f6611 · commit 3e5db01


10 files changed (+133, -21 lines)


dj_backend_server/CHANGELOG.MD

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+2.14.2024
+- Added example.env to streamline environment setup.
+- Implemented translation fixes to enhance application localization.
+- Updated docker-compose.yaml to prefix each container with oc_ for better namespace management.
+- Performed fixes in requirements.txt for improved dependency resolution.
+- Ensured existence of Vector Database (QDrant) prior to web crawling operations to address issues encountered with large websites, ensuring Vector Database creation and availability.

dj_backend_server/api/data_sources/pdf_handler.py

Lines changed: 4 additions & 2 deletions
@@ -222,15 +222,17 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool

        docs = text_splitter.split_documents(raw_docs)

-        print("external files docs -->", docs);
+        # print("external files docs -->", docs);

        if not docs:
            print("No documents were processed successfully.")
            return

        embeddings = get_embeddings()

+        print(f"Initializing vector store for namespace: {namespace} with {len(docs)} documents.")
        init_vector_store(docs, embeddings, StoreOptions(namespace=namespace))
+        print(f"Vector store initialized successfully for namespace: {namespace}.")

        print(f'Folder need or not to delete. {delete_folder_flag}')
        # Delete folder if flag is set
@@ -243,7 +245,7 @@ def txt_to_vectordb(shared_folder: str, namespace: str, delete_folder_flag: bool
        # pdf_data_source.save()
        failed_job = FailedJob(uuid=str(uuid4()), connection='default', queue='default', payload='txt_to_vectordb', exception=str(e),failed_at=timezone.now())
        failed_job.save()
-        print(e)
+        print(f"Failed to initialize vector store for namespace: {namespace}. Exception: {e}")
        traceback.print_exc()


dj_backend_server/api/utils/init_vector_store.py

Lines changed: 28 additions & 0 deletions
@@ -128,3 +128,31 @@ def delete_from_vector_store(namespace: str, filter_criteria: dict) -> None:
    else:
        raise NotImplementedError(f"Delete operation is not implemented for the store type: {store_type}")

+
+def ensure_vector_database_exists(namespace):
+    store_type = StoreType[os.environ['STORE']]
+    try:
+        if store_type == StoreType.QDRANT:
+            client = QdrantClient(url=os.environ['QDRANT_URL'])
+            for attempt in range(3):
+                existing_collections = client.get_collections().collections
+                if namespace not in existing_collections:
+                    print(f"Namespace '{namespace}' does not exist. Attempting to create.")
+                    vectors_config = models.VectorParams(
+                        size=1536,  # Using 1536-dimensional vectors, adjust as necessary
+                        distance=models.Distance.COSINE  # Using cosine distance, adjust as necessary
+                    )
+                    client.create_collection(collection_name=namespace, vectors_config=vectors_config)
+                    # Recheck if the namespace was successfully created
+                    if namespace in client.get_collections().collections:
+                        print(f"Namespace '{namespace}' successfully created.")
+                        return
+                    else:
+                        print(f"Failed to create namespace '{namespace}' on attempt {attempt + 1}.")
+                else:
+                    print(f"Namespace '{namespace}' exists.")
+                    return
+            raise Exception(f"Failed to ensure or create namespace '{namespace}' after 3 attempts.")
+    except Exception as e:
+        print(f"Failed to ensure vector database exists for namespace {namespace}: {e}")
+
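
Note: in current qdrant-client releases, client.get_collections().collections returns CollectionDescription objects rather than plain strings, so a name-based membership test usually extracts .name first. A minimal sketch of that check (illustrative only, not part of this commit; the fallback URL and the "chatbot_namespace" collection name are placeholder assumptions):

import os

from qdrant_client import QdrantClient
from qdrant_client.http import models

def qdrant_collection_exists(client: QdrantClient, namespace: str) -> bool:
    # Compare against collection names, not the descriptor objects themselves.
    return namespace in [c.name for c in client.get_collections().collections]

client = QdrantClient(url=os.environ.get("QDRANT_URL", "http://qdrant:6333"))
if not qdrant_collection_exists(client, "chatbot_namespace"):  # hypothetical namespace
    client.create_collection(
        collection_name="chatbot_namespace",
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )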

dj_backend_server/docker-compose.yaml

Lines changed: 6 additions & 4 deletions
@@ -3,7 +3,7 @@ version: '3.9'
services:
  mysql:
    restart: unless-stopped
-    platform: linux/arm64/v8
+    container_name: oc_mysql
    image: "mysql:8"
    ports:
      - "3307:3306"
@@ -20,6 +20,7 @@ services:

  qdrant:
    image: qdrant/qdrant
+    container_name: oc_qdrant
    ports:
      - 6333:6333
      - 6334:6334
@@ -32,7 +33,7 @@ services:
    build:
      context: .
      dockerfile: Dockerfile
-    container_name: web
+    container_name: oc_web
    ports:
      - "8001:8000"
    volumes:
@@ -53,6 +54,7 @@ services:

  adminer:
    image: adminer
+    container_name: oc_adminer
    ports:
      - "8080:8080"
    environment:
@@ -66,7 +68,7 @@ services:
    build:
      context: .
      dockerfile: Dockerfile
-    container_name: celery
+    container_name: oc_celery
    volumes:
      - ./website_data_sources:/app/website_data_sources
      # - ./llama-2-7b-chat.ggmlv3.q4_K_M.bin:/app/llama-2-7b-chat.ggmlv3.q4_K_M.bin:ro
@@ -80,7 +82,7 @@ services:

  redis:
    image: redis:latest
-    container_name: redis_cache
+    container_name: oc_redis_cache
    ports:
      - "6379:6379"
    volumes:

dj_backend_server/example.env

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+##########################################################
+
+# Edit values for your site.
+# your app secret key
+SECRET_KEY='ADD-YOUR-CUSTOM-KEY-HERE'
+# For openai
+OPENAI_API_KEY=YOURKEY
+# add IP what you allow like superadmin
+ALLOWED_HOSTS=localhost,0.0.0.0
+# Use * only in dev environment
+#ALLOWED_HOSTS=*
+# Your SITE URL
+APP_URL='https://YOUR-URL-HERE'
+
+##########################################################
+
+# "azure" | "openai" | llama2
+OPENAI_API_TYPE=openai
+OPENAI_API_MODEL=gpt-4-1106-preview
+OPENAI_API_TEMPERATURE=1
+
+# If using azure
+# AZURE_OPENAI_API_BASE=
+# AZURE_OPENAI_API_KEY=
+# AZURE_OPENAI_API_VERSION=2023-03-15-preview
+# AZURE_OPENAI_EMBEDDING_MODEL_NAME=
+# AZURE_OPENAI_DEPLOYMENT_NAME=
+# AZURE_OPENAI_COMPLETION_MODEL=gpt-35-turbo
+
+# "azure" | "openai" | llama2
+EMBEDDING_PROVIDER=openai
+
+# Vector Store, PINECONE|QDRANT
+STORE=QDRANT
+
+
+# if using pinecone
+# PINECONE_API_KEY=
+# PINECONE_ENV=
+# VECTOR_STORE_INDEX_NAME=
+
+
+# if using qdrant
+QDRANT_URL=http://qdrant:6333
+
+
+# optional, defaults to 15
+MAX_PAGES_CRAWL=150
+
+# --- these will change if you decide to start testing the software
+CELERY_BROKER_URL=redis://redis:6379/
+CELERY_RESULT_BACKEND=redis://redis:6379/
+DATABASE_NAME=openchat
+DATABASE_USER=dbuser
+DATABASE_PASSWORD=dbpass
+DATABASE_HOST=mysql
+DATABASE_PORT=3306
+
+# use 'external' if you want to use below services.
+PDF_LIBRARY = 'external'
+
+#PDF API - OCRWebService.com (REST API). https://www.ocrwebservice.com/api/restguide
+#Extract text from scanned images and PDF documents and convert into editable formats.
+#Please create new account with ocrwebservice.com via http://www.ocrwebservice.com/account/signup and get license code
+OCR_LICCODE = 'LICENSE-CODE'
+OCR_USERNAME = 'USERNAME'
+OCR_LANGUAGE = 'english'
+# Advantage to clean up the OCR text which can be messy and full with garbage, but will generate a cost with LLM if is paid. Use carefully.
+# Use 1 to enable, 0 to disable.
+OCR_LLM = '1'
+
+# retrieval_qa | conversation_retrieval, retrieval_qa works better with azure openai
+# if you want to use the conversation_retrieval | retrieval_qa chain
+CHAIN_TYPE=conversation_retrieval
+
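
These settings are read straight from the process environment elsewhere in this commit (for example, StoreType[os.environ['STORE']] and QdrantClient(url=os.environ['QDRANT_URL']) in init_vector_store.py above). A minimal sketch of that pattern, assuming example.env has been copied to .env and loaded into the environment; the REQUIRED list below is illustrative, not exhaustive:

import os

# Names taken from example.env above; the check itself is a hypothetical sanity step.
REQUIRED = ["SECRET_KEY", "OPENAI_API_KEY", "STORE", "QDRANT_URL", "CELERY_BROKER_URL"]
missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

store = os.environ["STORE"]            # e.g. QDRANT
qdrant_url = os.environ["QDRANT_URL"]  # e.g. http://qdrant:6333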

dj_backend_server/requirements.txt

Lines changed: 3 additions & 4 deletions
@@ -19,8 +19,7 @@ click-repl==0.3.0
cryptography==41.0.3
dataclasses-json==0.5.14
Django==4.2.3
-django-rest-swagger
-djangorestframework
+django-rest-swagger==2.2.0
dnspython==2.4.1
drf-spectacular==0.27.1
drf_spectacular.extensions==0.0.2
@@ -31,8 +30,8 @@ grpcio-tools==1.56.2
h11==0.14.0
h2==4.1.0
hpack==4.0.0
-httpcore1.0.2
-httpx=0.25.2
+httpcore==1.0.2
+httpx==0.25.2
hyperframe==6.0.1
idna==3.6
kombu==5.3.1

dj_backend_server/web/templates/onboarding/other-data-sources-website.html

Lines changed: 3 additions & 7 deletions
@@ -79,20 +79,16 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans 'Website information

<div class="flex items-center justify-between space-x-6 mb-8">
<div>
-<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on
-the same page" %}</div>
+<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %}</div>
<div class="text-xs">
-{% trans "Sometimes, we might face challenges when trying to crawl certain websites,
-especially the ones built using JavaScript (Single-Page Applications). However, we're
-currently working on adding headless browsing to our system so that we can support all
-kinds of websites." %}
+{% trans "Sometimes, we might face challenges when trying to crawl certain websites, especially the ones built using JavaScript (Single-Page Applications). However, we're currently working on adding headless browsing to our system so that we can support all kinds of websites." %}
</div>
</div>
</div>
</div>
<div class="flex items-center justify-between">
<a class="text-sm underline hover:no-underline" href="{% url 'onboarding.data-source' %}">&lt;- {% trans "Back" %}</a>
-<button type="submit" class="btn bg-indigo-500 hover:bg-indigo-600 text-white ml-auto">{% trans "Next Step" %}</button>
+<button type="submit" class="btn bg-primary text-white py-2 px-3">{% trans "Next Step" %}</button>
</div>
</form>
</div>

dj_backend_server/web/templates/onboarding/step-0.html

Lines changed: 1 addition & 2 deletions
@@ -56,8 +56,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Let's set up your f
</div>
<h3 class="text-lg font-bold text-slate-800 pl-9">{% trans "You provide the system with data" %}</h3>
</div>
-<div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to
-integrate with many more" %}</div>
+<div class="pl-9">{% trans "It could be a website, pdf files, and soon you will have the option to integrate with many more" %}</div>
</li>
<!-- List item -->
<li class="relative py-2">

dj_backend_server/web/templates/onboarding/step-2.html

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ <h1 class="text-3xl text-slate-800 font-bold mb-6">{% trans "Website information

<div class="flex items-center justify-between space-x-6 mb-8">
<div>
-<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %} 🫶
+<div class="font-medium text-slate-800 text-sm mb-1">{% trans "Just to make sure we are on the same page" %}
</div>
<div class="text-xs">
{% trans "We might not be able to crawl some websites, especially websites that are built using JS (SPA), we are working on adding headless browsing to support all sorts of websites." %}

dj_backend_server/web/workers/crawler.py

Lines changed: 6 additions & 1 deletion
@@ -5,6 +5,7 @@
from web.signals.website_data_source_crawling_was_completed import website_data_source_crawling_completed
from web.models.crawled_pages import CrawledPages
from web.models.website_data_sources import WebsiteDataSource
+from api.utils.init_vector_store import ensure_vector_database_exists
from django.core.files.storage import default_storage
from django.core.files.base import ContentFile
from django.utils.text import slugify
@@ -35,6 +36,10 @@ def start_recursive_crawler(data_source_id, chatbot_id):
        Exception: If any error occurs during the crawling process, the function will catch the exception, set the
        crawling status to "failed", and re-raise the exception.
    """
+    # Ensure vector database exists before starting the crawl
+
+    ensure_vector_database_exists(str(chatbot_id))
+    # print("Starting recursive crawler")
    data_source = WebsiteDataSource.objects.get(pk=data_source_id)
    root_url = data_source.root_url

@@ -323,4 +328,4 @@ def crawl(data_source_id, url, crawled_urls, max_pages, chatbot_id):
    except Exception as e:
        # Handle other exceptions (e.g., invalid HTML, network issues) and continue crawling
        logging.exception(f"An unexpected error occurred while crawling URL: {url}")
-        pass
+        pass
