revert notebooks

benofben · benofben · commit ea1f155240a7 · 2023-10-04T13:04:00.000-04:00
diff --git a/Lab 7 - Semantic Search/01-text-embedding.ipynb b/Lab 7 - Semantic Search/01-text-embedding.ipynb
@@ -164,47 +164,7 @@
    "outputs": [],
    "source": [
     "from vertexai.language_models import TextEmbeddingModel\n",
-    "from typing import List\n",
-    "\n",
-    "EMBEDDING_MODEL = TextEmbeddingModel\n",
-    "MAX_REQ_PER_MIN = 60\n",
-    "CHUNK_SIZE = 4000\n",
-    "CHUNK_OVERLAP = 15\n",
-    "\n",
-    "def rate_limit(max_per_minute):\n",
-    "    period = 60 / max_per_minute\n",
-    "    while True:\n",
-    "        before = time.time()\n",
-    "        yield\n",
-    "        after = time.time()\n",
-    "        elapsed = after - before\n",
-    "        sleep_time = max(0, period - elapsed)\n",
-    "        if sleep_time > 0:\n",
-    "            # print(f'Sleeping {sleep_time:.1f} seconds')\n",
-    "            time.sleep(sleep_time)\n",
-    "                \n",
-    "def embed_documents(texts: List[str]) -> List[List[float]]:\n",
-    "    \"\"\"Call Vertex LLM embedding endpoint for embedding docs\n",
-    "    Args:\n",
-    "    texts: The list of texts to embed.\n",
-    "    Returns:\n",
-    "    List of embeddings, one for each text.\n",
-    "    \"\"\"\n",
-    "    model = EMBEDDING_MODEL.from_pretrained(\"textembedding-gecko@001\")\n",
-    "\n",
-    "    limiter = rate_limit(MAX_REQ_PER_MIN)\n",
-    "    results = []\n",
-    "    docs = list(texts)\n",
-    "\n",
-    "    while docs:\n",
-    "        # Working in batches of 2 because the API apparently won't let\n",
-    "        # us send more than 2 documents per request to get embeddings.\n",
-    "        head, docs = docs[:2], docs[2:]\n",
-    "        # print(f'Sending embedding request for: {head!r}')\n",
-    "        chunk = model.get_embeddings(head)\n",
-    "        results.extend(chunk)\n",
-    "        next(limiter)\n",
-    "    return results"
+    "EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained(\"textembedding-gecko@001\")"
    ]
   },
   {
@@ -231,18 +191,19 @@
     "\n",
     "def create_text_embedding_entries(input_text:str, company_name: str, cusip: str):\n",
     "    text_splitter = RecursiveCharacterTextSplitter(\n",
-    "        chunk_size = CHUNK_SIZE,\n",
-    "        chunk_overlap  = CHUNK_OVERLAP,\n",
+    "        chunk_size = 2000,\n",
+    "        chunk_overlap  = 15,\n",
     "        length_function = len,\n",
     "        is_separator_regex = False,\n",
     "    )\n",
     "    docs = text_splitter.split_text(input_text)\n",
     "    res = []\n",
     "    seq_id = -1\n",
     "    for d in chunks(docs):\n",
-    "        embeddings = embed_documents(d)\n",
+    "        embeddings = EMBEDDING_MODEL.get_embeddings(d)\n",
+    "        \n",
     "        # throttle so we don't blow through the quota.\n",
-    "        # time.sleep(1)\n",
+    "        time.sleep(1)\n",
     "        \n",
     "        for i in range(len(d)):\n",
     "            seq_id += 1\n",
@@ -278,12 +239,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%time\n",
-    "\n",
     "import time\n",
     "\n",
     "# We're hitting the quota, so we're going to sleep for a bit to zero it out for sure, then throttle our calls\n",
-    "# time.sleep(60)\n",
+    "time.sleep(60)\n",
     "\n",
     "count = 0\n",
     "embedding_entries = []\n",
@@ -623,9 +582,9 @@
    "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
   },
   "kernelspec": {
-   "display_name": "lab (Local)",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "local-lab"
+   "name": "conda-root-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -637,7 +596,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.17"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/Lab 7 - Semantic Search/02-semantic-search.ipynb b/Lab 7 - Semantic Search/02-semantic-search.ipynb
@@ -236,7 +236,17 @@
    "id": "3479218e-c372-4296-8058-af73eb85096b",
    "metadata": {},
    "source": [
-    "As seen above, the cross-encoder finds this passages which are more relevant to the query and ranks them accordingly"
+    "As seen above, the cross-encoder finds this passage from Berkshire Hathaway to be more relevant to the query"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c38a08dc-b443-48ab-8e8e-11eb9bc53062",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ranked_results['text'][0]"
    ]
   },
   {
@@ -326,15 +336,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ranked_results['text'][5]"
+    "ranked_results['text'][4]"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "541c6aad-05b4-4391-9f27-b668a82ae910",
    "metadata": {},
    "source": [
-    "The Hybrid search brought in additional results from companies not in vector-only search but has content related to energy, oil & gas. The re-ranker helped rank the results. "
+    "The Hybrid search brought in additional results like `Martin Marietta Materials`, which also has content related to energy, oil & gas. The re-ranker helped rank the results. "
    ]
   },
   {
@@ -445,14 +455,11 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ccf6be00-7ae2-468f-abb9-c28242c7f9db",
+   "cell_type": "markdown",
+   "id": "458e5b2c-8bf6-4755-8e3c-9eaf06bc4096",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "top_mgr = res_df['managerName'][0]\n",
-    "top_mgr"
+    "And we can see that our top result is HAHN CAPITAL MANAGEMENT LLC."
    ]
   },
   {
@@ -478,7 +485,7 @@
     "WITH m, count(DISTINCT c) AS ownedCompaniesWithDocs\n",
     "MATCH (m:Manager {managerName: $managerName})-[:OWNS]->(c:Company)\n",
     "RETURN m.managerName AS managerName, ownedCompaniesWithDocs, count(DISTINCT c) AS totalOwnedCompanies\n",
-    "''', params =  {'managerName': top_mgr})"
+    "''', params =  {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
    ]
   },
   {
@@ -538,7 +545,7 @@
     "MATCH (m0:Manager {managerName: $managerName})-[r:SIMILAR]->(m:Manager)\n",
     "RETURN DISTINCT m.managerName AS managerName, r.score AS score\n",
     "ORDER BY score DESC LIMIT 10\n",
-    "''', params =  {'managerName': top_mgr})"
+    "''', params =  {'managerName':'HAHN CAPITAL MANAGEMENT LLC'})"
    ]
   },
   {
@@ -568,9 +575,9 @@
    "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111"
   },
   "kernelspec": {
-   "display_name": "lab (Local)",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "local-lab"
+   "name": "conda-root-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -582,7 +589,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.17"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,