|
164 | 164 | "outputs": [], |
165 | 165 | "source": [ |
166 | 166 | "from vertexai.language_models import TextEmbeddingModel\n", |
167 | | - "from typing import List\n", |
168 | | - "\n", |
169 | | - "EMBEDDING_MODEL = TextEmbeddingModel\n", |
170 | | - "MAX_REQ_PER_MIN = 60\n", |
171 | | - "CHUNK_SIZE = 4000\n", |
172 | | - "CHUNK_OVERLAP = 15\n", |
173 | | - "\n", |
174 | | - "def rate_limit(max_per_minute):\n", |
175 | | - " period = 60 / max_per_minute\n", |
176 | | - " while True:\n", |
177 | | - " before = time.time()\n", |
178 | | - " yield\n", |
179 | | - " after = time.time()\n", |
180 | | - " elapsed = after - before\n", |
181 | | - " sleep_time = max(0, period - elapsed)\n", |
182 | | - " if sleep_time > 0:\n", |
183 | | - " # print(f'Sleeping {sleep_time:.1f} seconds')\n", |
184 | | - " time.sleep(sleep_time)\n", |
185 | | - " \n", |
186 | | - "def embed_documents(texts: List[str]) -> List[List[float]]:\n", |
187 | | - " \"\"\"Call Vertex LLM embedding endpoint for embedding docs\n", |
188 | | - " Args:\n", |
189 | | - " texts: The list of texts to embed.\n", |
190 | | - " Returns:\n", |
191 | | - " List of embeddings, one for each text.\n", |
192 | | - " \"\"\"\n", |
193 | | - " model = EMBEDDING_MODEL.from_pretrained(\"textembedding-gecko@001\")\n", |
194 | | - "\n", |
195 | | - " limiter = rate_limit(MAX_REQ_PER_MIN)\n", |
196 | | - " results = []\n", |
197 | | - " docs = list(texts)\n", |
198 | | - "\n", |
199 | | - " while docs:\n", |
200 | | - " # Working in batches of 2 because the API apparently won't let\n", |
201 | | - " # us send more than 2 documents per request to get embeddings.\n", |
202 | | - " head, docs = docs[:2], docs[2:]\n", |
203 | | - " # print(f'Sending embedding request for: {head!r}')\n", |
204 | | - " chunk = model.get_embeddings(head)\n", |
205 | | - " results.extend(chunk)\n", |
206 | | - " next(limiter)\n", |
207 | | - " return results" |
| 167 | + "EMBEDDING_MODEL = TextEmbeddingModel.from_pretrained(\"textembedding-gecko@001\")" |
208 | 168 | ] |
209 | 169 | }, |
210 | 170 | { |
|
231 | 191 | "\n", |
232 | 192 | "def create_text_embedding_entries(input_text:str, company_name: str, cusip: str):\n", |
233 | 193 | " text_splitter = RecursiveCharacterTextSplitter(\n", |
234 | | - " chunk_size = CHUNK_SIZE,\n", |
235 | | - " chunk_overlap = CHUNK_OVERLAP,\n", |
| 194 | + " chunk_size = 2000,\n", |
| 195 | + " chunk_overlap = 15,\n", |
236 | 196 | " length_function = len,\n", |
237 | 197 | " is_separator_regex = False,\n", |
238 | 198 | " )\n", |
239 | 199 | " docs = text_splitter.split_text(input_text)\n", |
240 | 200 | " res = []\n", |
241 | 201 | " seq_id = -1\n", |
242 | 202 | " for d in chunks(docs):\n", |
243 | | - " embeddings = embed_documents(d)\n", |
| 203 | + " embeddings = EMBEDDING_MODEL.get_embeddings(d)\n", |
| 204 | + " \n", |
244 | 205 | " # throttle so we don't blow through the quota.\n", |
245 | | - " # time.sleep(1)\n", |
| 206 | + " time.sleep(1)\n", |
246 | 207 | " \n", |
247 | 208 | " for i in range(len(d)):\n", |
248 | 209 | " seq_id += 1\n", |
|
278 | 239 | "metadata": {}, |
279 | 240 | "outputs": [], |
280 | 241 | "source": [ |
281 | | - "%%time\n", |
282 | | - "\n", |
283 | 242 | "import time\n", |
284 | 243 | "\n", |
285 | 244 | "# We're hitting the quota, so we're going to sleep for a bit to zero it out for sure, then throttle our calls\n", |
286 | | - "# time.sleep(60)\n", |
| 245 | + "time.sleep(60)\n", |
287 | 246 | "\n", |
288 | 247 | "count = 0\n", |
289 | 248 | "embedding_entries = []\n", |
|
623 | 582 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m111" |
624 | 583 | }, |
625 | 584 | "kernelspec": { |
626 | | - "display_name": "lab (Local)", |
| 585 | + "display_name": "Python 3", |
627 | 586 | "language": "python", |
628 | | - "name": "local-lab" |
| 587 | + "name": "conda-root-py" |
629 | 588 | }, |
630 | 589 | "language_info": { |
631 | 590 | "codemirror_mode": { |
|
637 | 596 | "name": "python", |
638 | 597 | "nbconvert_exporter": "python", |
639 | 598 | "pygments_lexer": "ipython3", |
640 | | - "version": "3.8.17" |
| 599 | + "version": "3.10.12" |
641 | 600 | } |
642 | 601 | }, |
643 | 602 | "nbformat": 4, |
|
0 commit comments