Skip to content

Commit ea1f155

Browse files
committed
revert notebooks
1 parent 8a5a442 commit ea1f155

File tree

2 files changed

+31
-65
lines changed

2 files changed

+31
-65
lines changed

Lab 7 - Semantic Search/01-text-embedding.ipynb

Lines changed: 10 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -164,47 +164,7 @@
164164
"outputs": [],
165165
"source": [
166166
"from vertexai.language_models import TextEmbeddingModel\n",
167-
"from typing import List\n",
168-
"\n",
169-
"EMBEDDING_MODEL = TextEmbeddingModel\n",
170-
"MAX_REQ_PER_MIN = 60\n",
171-
"CHUNK_SIZE = 4000\n",
172-
"CHUNK_OVERLAP = 15\n",
173-
"\n",
174-
"def rate_limit(max_per_minute):\n",
175-
" period = 60 / max_per_minute\n",
176-
" while True:\n",
177-
" before = time.time()\n",
178-
" yield\n",
179-
" after = time.time()\n",
180-
" elapsed = after - before\n",
181-
" sleep_time = max(0, period - elapsed)\n",
182-
" if sleep_time > 0:\n",
183-
" # print(f'Sleeping {sleep_time:.1f} seconds')\n",
184-
" time.sleep(sleep_time)\n",
185-
" \n",
186-
"def embed_documents(texts: List[str]) -> List[List[float]]:\n",
187-
" \"\"\"Call Vertex LLM embedding endpoint for embedding docs\n",
188-
" Args:\n",
189-
" texts: The list of texts to embed.\n",
190-
" Returns:\n",
191-
" List of embeddings, one for each text.\n",
192-
" \"\"\"\n",
193-
" model = EMBEDDING_MODEL.from_pretrained(\"textembedding-gecko@001\")\n",
194-
"\n",
195-
" limiter = rate_limit(MAX_REQ_PER_MIN)\n",
196-
" results = []\n",
197-
" docs = list(texts)\n",
198-
"\n",
199-
" while docs:\n",
200-
" # Working in batches of 2 because the API apparently won't let\n",
201-
" # us send more than 2 documents per request to get embeddings.\n",
202-
" head, docs = docs[:2], docs[2:]\n",
203-
" # print(f'Sending embedding request for: {head!r}')\n",
204-
" chunk = model.get_embeddings(head)\n",
205-
" results.extend(chunk)\n",
206-
" next(limiter)\n",
207-
" return results"
167+
"EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained(\"textembedding-gecko@001\")"
208168
]
209169
},
210170
{
@@ -231,18 +191,19 @@
231191
"\n",
232192
"def create_text_embedding_entries(input_text:str, company_name: str, cusip: str):\n",
233193
" text_splitter = RecursiveCharacterTextSplitter(\n",
234-
" chunk_size = CHUNK_SIZE,\n",
235-
" chunk_overlap = CHUNK_OVERLAP,\n",
194+
" chunk_size = 2000,\n",
195+
" chunk_overlap = 15,\n",
236196
" length_function = len,\n",
237197
" is_separator_regex = False,\n",
238198
" )\n",
239199
" docs = text_splitter.split_text(input_text)\n",
240200
" res = []\n",
241201
" seq_id = -1\n",
242202
" for d in chunks(docs):\n",
243-
" embeddings = embed_documents(d)\n",
203+
" embeddings = EMBEDDING_MODEL.get_embeddings(d)\n",
204+
" \n",
244205
" # throttle so we don't blow through the quota.\n",
245-
" # time.sleep(1)\n",
206+
" time.sleep(1)\n",
246207
" \n",
247208
" for i in range(len(d)):\n",
248209
" seq_id += 1\n",
@@ -278,12 +239,10 @@
278239
"metadata": {},
279240
"outputs": [],
280241
"source": [
281-
"%%time\n",
282-
"\n",
283242
"import time\n",
284243
"\n",
285244
"# We're hitting the quota, so we're going to sleep for a bit to zero it out for sure, then throttle our calls\n",
286-
"# time.sleep(60)\n",
245+
"time.sleep(60)\n",
287246
"\n",
288247
"count = 0\n",
289248
"embedding_entries = []\n",
@@ -623,9 +582,9 @@
623582
"uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
624583
},
625584
"kernelspec": {
626-
"display_name": "lab (Local)",
585+
"display_name": "Python 3",
627586
"language": "python",
628-
"name": "local-lab"
587+
"name": "conda-root-py"
629588
},
630589
"language_info": {
631590
"codemirror_mode": {
@@ -637,7 +596,7 @@
637596
"name": "python",
638597
"nbconvert_exporter": "python",
639598
"pygments_lexer": "ipython3",
640-
"version": "3.8.17"
599+
"version": "3.10.12"
641600
}
642601
},
643602
"nbformat": 4,

Lab 7 - Semantic Search/02-semantic-search.ipynb

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,17 @@
236236
"id": "3479218e-c372-4296-8058-af73eb85096b",
237237
"metadata": {},
238238
"source": [
239-
"As seen above, the cross-encoder finds this passages which are more relevant to the query and ranks them accordingly"
239+
"As seen above, the cross-encoder finds this passage from Berkshire Hathaway to be more relevant to the query"
240+
]
241+
},
242+
{
243+
"cell_type": "code",
244+
"execution_count": null,
245+
"id": "c38a08dc-b443-48ab-8e8e-11eb9bc53062",
246+
"metadata": {},
247+
"outputs": [],
248+
"source": [
249+
"ranked_results['text'][0]"
240250
]
241251
},
242252
{
@@ -326,15 +336,15 @@
326336
"metadata": {},
327337
"outputs": [],
328338
"source": [
329-
"ranked_results['text'][5]"
339+
"ranked_results['text'][4]"
330340
]
331341
},
332342
{
333343
"cell_type": "markdown",
334344
"id": "541c6aad-05b4-4391-9f27-b668a82ae910",
335345
"metadata": {},
336346
"source": [
337-
"The Hybrid search brought in additional results from companies not in vector-only search but has content related to energy, oil & gas. The re-ranker helped rank the results. "
347+
"The Hybrid search brought in additional results like `Martin Marietta Material2` which also has content related to energy, oil & gas. The re-ranker helped rank the results. "
338348
]
339349
},
340350
{
@@ -445,14 +455,11 @@
445455
]
446456
},
447457
{
448-
"cell_type": "code",
449-
"execution_count": null,
450-
"id": "ccf6be00-7ae2-468f-abb9-c28242c7f9db",
458+
"cell_type": "markdown",
459+
"id": "458e5b2c-8bf6-4755-8e3c-9eaf06bc4096",
451460
"metadata": {},
452-
"outputs": [],
453461
"source": [
454-
"top_mgr = res_df['managerName'][0]\n",
455-
"top_mgr"
462+
"And we can see that our top result is HAHN CAPITAL MANAGEMENT LLC."
456463
]
457464
},
458465
{
@@ -478,7 +485,7 @@
478485
"WITH m, count(DISTINCT c) AS ownedCompaniesWithDocs\n",
479486
"MATCH (m:Manager {managerName: $managerName})-[:OWNS]->(c:Company)\n",
480487
"RETURN m.managerName AS managerName, ownedCompaniesWithDocs, count(DISTINCT c) AS totalOwnedCompanies\n",
481-
"''', params = {'managerName': top_mgr})"
488+
"''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
482489
]
483490
},
484491
{
@@ -538,7 +545,7 @@
538545
"MATCH (m0:Manager {managerName: $managerName})-[r:SIMILAR]->(m:Manager)\n",
539546
"RETURN DISTINCT m.managerName AS managerName, r.score AS score\n",
540547
"ORDER BY score DESC LIMIT 10\n",
541-
"''', params = {'managerName': top_mgr})"
548+
"''', params = {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
542549
]
543550
},
544551
{
@@ -568,9 +575,9 @@
568575
"uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
569576
},
570577
"kernelspec": {
571-
"display_name": "lab (Local)",
578+
"display_name": "Python 3",
572579
"language": "python",
573-
"name": "local-lab"
580+
"name": "conda-root-py"
574581
},
575582
"language_info": {
576583
"codemirror_mode": {
@@ -582,7 +589,7 @@
582589
"name": "python",
583590
"nbconvert_exporter": "python",
584591
"pygments_lexer": "ipython3",
585-
"version": "3.8.17"
592+
"version": "3.10.12"
586593
}
587594
},
588595
"nbformat": 4,

0 commit comments

Comments
 (0)