Skip to content

Commit 880d0f9

Browse files
committed
fix async extract kv
1 parent e072ee2 commit 880d0f9

File tree

5 files changed

+70
-60
lines changed

5 files changed

+70
-60
lines changed

any_parser/any_parser.py

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from any_parser.sync_parser import (
1414
ExtractKeyValueSyncParser,
1515
ExtractPIISyncParser,
16-
ExtractResumeKeyValueSyncParser,
1716
ExtractTablesSyncParser,
1817
ParseSyncParser,
1918
ParseProSyncParser,
@@ -112,9 +111,6 @@ def __init__(
112111
self._sync_parse_pro = ParseProSyncParser(api_key, base_url)
113112
self._sync_parse_textract = ParseTextractSyncParser(api_key, base_url)
114113
self._sync_extract_key_value = ExtractKeyValueSyncParser(api_key, base_url)
115-
self._sync_extract_resume_key_value = ExtractResumeKeyValueSyncParser(
116-
api_key, base_url
117-
)
118114
self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url)
119115
self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url)
120116
self.batches = BatchParser(api_key, batch_url)
@@ -312,13 +308,17 @@ def extract_key_value(
312308
313309
Args:
314310
file_path (str): The path to the file to be parsed.
311+
file_content (str): Base64 encoded file content.
312+
file_type (str): File format extension.
315313
extract_instruction (Dict or List): A dictionary containing the keys to be
316314
extracted, with their values as the description of those keys.
317315
Or a list of dictionaries with 'key' and 'description' fields.
318316
Returns:
319317
tuple(str, str): The extracted data and the time taken.
320318
"""
321-
# Convert extract_instruction to the correct API format
319+
# Convert extract_instruction to the correct API format
320+
if not file_type:
321+
file_type = file_path.split('.')[-1] if '.' in file_path else ""
322322
formatted_instruction = None
323323
if extract_instruction:
324324
if isinstance(extract_instruction, dict):
@@ -440,27 +440,14 @@ def async_extract_key_value(
440440
Returns:
441441
str or tuple: the job_id string on success, or (error_message, "") when input validation fails
442442
"""
443-
# Convert extract_instruction to the correct API format
444-
formatted_instruction = None
445-
if extract_instruction:
446-
if isinstance(extract_instruction, dict):
447-
# Convert dict format to list of key-description pairs
448-
formatted_instruction = [
449-
{"key": key, "description": description}
450-
for key, description in extract_instruction.items()
451-
]
452-
elif isinstance(extract_instruction, list):
453-
# Already in correct format
454-
formatted_instruction = extract_instruction
455-
else:
456-
raise ValueError("extract_instruction must be a dict or list")
457-
443+
if not file_type:
444+
file_type = file_path.split('.')[-1] if '.' in file_path else ""
458445
return self._async_parser.send_async_request(
459446
process_type=ProcessType.EXTRACT_KEY_VALUE,
460447
file_path=file_path, # type: ignore
461448
file_content=file_content, # type: ignore
462449
file_type=file_type, # type: ignore
463-
extract_args={"extract_instruction": formatted_instruction},
450+
extract_args={"extract_instruction": extract_instruction},
464451
)
465452

466453
def get_job_status(self, job_id: str):

any_parser/async_parser.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,20 @@ def send_async_request(
6161
}
6262

6363
if extract_args:
64-
if process_type == ProcessType.EXTRACT_KEY_VALUE and "extract_instruction" in extract_args:
65-
# Match the sync implementation exactly: create payload_args dict first
66-
payload["extract_input_key_description_pairs"] = extract_args["extract_instruction"]
64+
if process_type == ProcessType.EXTRACT_KEY_VALUE:
65+
input_keys = list(extract_args['extract_instruction'].keys())
66+
input_descriptions = list(extract_args['extract_instruction'].values())
67+
extract_instruction = [
68+
{
69+
"key": key,
70+
"description": description
71+
}
72+
for key, description in zip(input_keys, input_descriptions)
73+
]
74+
payload["extract_input_key_description_pairs"] = extract_instruction
6775
elif process_type == ProcessType.EXTRACT_TABLES:
6876
payload["extract_tables"] = True
6977
else:
70-
# For other process types, add extract_args directly
7178
payload.update(extract_args)
7279

7380
# Send the POST request

examples/async_extract_key_value_img.ipynb

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@
160160
{
161161
"data": {
162162
"text/plain": [
163-
"('file_type must be provided when using file_content', '')"
163+
"'b9061390-ce89-450c-9c10-94f4dac874df'"
164164
]
165165
},
166166
"execution_count": 6,
@@ -179,7 +179,7 @@
179179
" }\n",
180180
"\n",
181181
"# extract returns a tuple containing the markdown as a string and total time\n",
182-
"file_id = ap.async_extract_key_value(file_path=example_local_file, extract_instruction=extract_instruction)\n",
182+
"file_id = ap.async_extract_key_value(file_path=example_local_file, file_type=\"jpeg\", extract_instruction=extract_instruction)\n",
183183
"file_id"
184184
]
185185
},
@@ -192,26 +192,30 @@
192192
"name": "stdout",
193193
"output_type": "stream",
194194
"text": [
195-
"Waiting for response...\n",
196195
"Waiting for response...\n"
197196
]
198197
},
199198
{
200-
"ename": "KeyboardInterrupt",
201-
"evalue": "",
202-
"output_type": "error",
203-
"traceback": [
204-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
205-
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
206-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# fetch results (5s polling up to 60s)\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m result = \u001b[43map\u001b[49m\u001b[43m.\u001b[49m\u001b[43masync_fetch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msync_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m60\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msync_interval\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 3\u001b[39m display(result)\n",
207-
"\u001b[36mFile \u001b[39m\u001b[32m~/any-parser/any_parser/any_parser.py:535\u001b[39m, in \u001b[36mAnyParser.async_fetch\u001b[39m\u001b[34m(self, file_id, sync_timeout, sync_interval)\u001b[39m\n\u001b[32m 533\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m job_status.get(\u001b[33m\"\u001b[39m\u001b[33mstatus\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01min\u001b[39;00m [\u001b[33m\"\u001b[39m\u001b[33mpending\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mprocessing\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 534\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mWaiting for response...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m535\u001b[39m \u001b[43mtime\u001b[49m\u001b[43m.\u001b[49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43msync_interval\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 536\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 537\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
208-
"\u001b[31mKeyboardInterrupt\u001b[39m: "
199+
"name": "stdout",
200+
"output_type": "stream",
201+
"text": [
202+
"Waiting for response...\n",
203+
"Waiting for response...\n"
209204
]
205+
},
206+
{
207+
"data": {
208+
"text/plain": [
209+
"\"{'R.B.C.': ['5.22'], 'M.C.V.': ['86.6'], 'M.C.H.': ['28.4'], 'M.C.H.C.': ['32.7']}\""
210+
]
211+
},
212+
"metadata": {},
213+
"output_type": "display_data"
210214
}
211215
],
212216
"source": [
213-
"# fetch results (5s polling up to 60s)\n",
214-
"result = ap.async_fetch(file_id, sync_timeout=60, sync_interval=5)\n",
217+
"# fetch results (3s polling up to 60s)\n",
218+
"result = ap.async_fetch(file_id, sync_timeout=60, sync_interval=3)\n",
215219
"display(result)"
216220
]
217221
},

examples/async_extract_key_value_pdf.ipynb

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -168,22 +168,34 @@
168168
},
169169
{
170170
"cell_type": "code",
171-
"execution_count": null,
171+
"execution_count": 6,
172172
"metadata": {},
173173
"outputs": [
174174
{
175-
"ename": "Exception",
176-
"evalue": "Error 500: {\"detail\":\"Object of type ExtractKeyDescriptionPair is not JSON serializable\"}",
177-
"output_type": "error",
178-
"traceback": [
179-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
180-
"\u001b[31mException\u001b[39m Traceback (most recent call last)",
181-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 11\u001b[39m\n\u001b[32m 3\u001b[39m extract_instruction = {\n\u001b[32m 4\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mdocument_owner\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mname of the document owner\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5\u001b[39m \u001b[33m\"\u001b[39m\u001b[33minvoice_num\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33minvoice number\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 6\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mpo_num\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mpurchase order number\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 7\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mdate of the invoice\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 8\u001b[39m }\n\u001b[32m 10\u001b[39m \u001b[38;5;66;03m# extract returns a tuple containing the markdown as a string and total time\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m11\u001b[39m file_id = \u001b[43map\u001b[49m\u001b[43m.\u001b[49m\u001b[43masync_extract_key_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexample_local_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile_type\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpdf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextract_instruction\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_instruction\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[38;5;66;03m# fetch results (5s polling up to 60s)\u001b[39;00m\n\u001b[32m 14\u001b[39m result = ap.async_fetch(file_id, sync_timeout=\u001b[32m60\u001b[39m, sync_interval=\u001b[32m5\u001b[39m)\n",
182-
"\u001b[36mFile \u001b[39m\u001b[32m~/any-parser/any_parser/any_parser.py:78\u001b[39m, in \u001b[36mhandle_file_processing.<locals>.wrapper\u001b[39m\u001b[34m(self, file_path, file_content, file_type, *args, **kwargs)\u001b[39m\n\u001b[32m 74\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 75\u001b[39m \u001b[38;5;66;03m# generate a random file path for genrating presigned url\u001b[39;00m\n\u001b[32m 76\u001b[39m file_path = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m/tmp/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00muuid.uuid4()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m78\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 79\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 80\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 81\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 82\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 83\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 84\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 85\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
183-
"\u001b[36mFile \u001b[39m\u001b[32m~/any-parser/any_parser/any_parser.py:458\u001b[39m, in \u001b[36mAnyParser.async_extract_key_value\u001b[39m\u001b[34m(self, file_path, file_content, file_type, extract_instruction)\u001b[39m\n\u001b[32m 455\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 456\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mextract_instruction must be a dict or list\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m458\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_async_parser\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend_async_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 459\u001b[39m \u001b[43m \u001b[49m\u001b[43mprocess_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mProcessType\u001b[49m\u001b[43m.\u001b[49m\u001b[43mEXTRACT_KEY_VALUE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 460\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[32m 461\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_content\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[32m 462\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[32m 463\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_args\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mextract_instruction\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatted_instruction\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 
464\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
184-
"\u001b[36mFile \u001b[39m\u001b[32m~/any-parser/any_parser/async_parser.py:84\u001b[39m, in \u001b[36mAsyncParser.send_async_request\u001b[39m\u001b[34m(self, process_type, file_path, file_content, file_type, extract_args)\u001b[39m\n\u001b[32m 76\u001b[39m response = requests.post(\n\u001b[32m 77\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m._base_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mendpoint\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m,\n\u001b[32m 78\u001b[39m headers=\u001b[38;5;28mself\u001b[39m._headers,\n\u001b[32m 79\u001b[39m data=json.dumps(payload),\n\u001b[32m 80\u001b[39m timeout=TIMEOUT,\n\u001b[32m 81\u001b[39m )\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m response.status_code != \u001b[32m200\u001b[39m:\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse.status_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse.text\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 86\u001b[39m response_data = response.json()\n\u001b[32m 87\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response_data[\u001b[33m\"\u001b[39m\u001b[33mjob_id\u001b[39m\u001b[33m\"\u001b[39m]\n",
185-
"\u001b[31mException\u001b[39m: Error 500: {\"detail\":\"Object of type ExtractKeyDescriptionPair is not JSON serializable\"}"
175+
"name": "stdout",
176+
"output_type": "stream",
177+
"text": [
178+
"[{'key': 'document_owner', 'description': 'name of the document owner'}, {'key': 'invoice_num', 'description': 'invoice number'}, {'key': 'po_num', 'description': 'purchase order number'}, {'key': 'date', 'description': 'date of the invoice'}]\n"
186179
]
180+
},
181+
{
182+
"name": "stdout",
183+
"output_type": "stream",
184+
"text": [
185+
"Waiting for response...\n",
186+
"Waiting for response...\n",
187+
"Waiting for response...\n",
188+
"Waiting for response...\n"
189+
]
190+
},
191+
{
192+
"data": {
193+
"text/plain": [
194+
"\"{'document_owner': ['Cambio Corp'], 'invoice_num': ['INV11111111'], 'po_num': ['PO22222222'], 'date': ['06-SEPT-2024']}\""
195+
]
196+
},
197+
"metadata": {},
198+
"output_type": "display_data"
187199
}
188200
],
189201
"source": [
@@ -199,8 +211,8 @@
199211
"# extract returns a tuple containing the markdown as a string and total time\n",
200212
"file_id = ap.async_extract_key_value(file_path=example_local_file, file_type=\"pdf\", extract_instruction=extract_instruction)\n",
201213
"\n",
202-
"# fetch results (5s polling up to 60s)\n",
203-
"result = ap.async_fetch(file_id, sync_timeout=60, sync_interval=5)\n",
214+
"# fetch results (3s polling up to 60s)\n",
215+
"result = ap.async_fetch(file_id, sync_timeout=60, sync_interval=3)\n",
204216
"display(result)"
205217
]
206218
},

examples/extract_key_value_img.ipynb

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
},
1616
{
1717
"cell_type": "code",
18-
"execution_count": 1,
18+
"execution_count": 7,
1919
"metadata": {},
2020
"outputs": [],
2121
"source": [
@@ -33,7 +33,7 @@
3333
},
3434
{
3535
"cell_type": "code",
36-
"execution_count": 2,
36+
"execution_count": 8,
3737
"metadata": {},
3838
"outputs": [],
3939
"source": [
@@ -56,7 +56,7 @@
5656
},
5757
{
5858
"cell_type": "code",
59-
"execution_count": 3,
59+
"execution_count": 9,
6060
"metadata": {},
6161
"outputs": [],
6262
"source": [
@@ -86,7 +86,7 @@
8686
},
8787
{
8888
"cell_type": "code",
89-
"execution_count": 4,
89+
"execution_count": 10,
9090
"metadata": {},
9191
"outputs": [],
9292
"source": [
@@ -107,7 +107,7 @@
107107
},
108108
{
109109
"cell_type": "code",
110-
"execution_count": 5,
110+
"execution_count": 11,
111111
"metadata": {},
112112
"outputs": [
113113
{
@@ -149,7 +149,7 @@
149149
},
150150
{
151151
"cell_type": "code",
152-
"execution_count": 6,
152+
"execution_count": 12,
153153
"metadata": {},
154154
"outputs": [
155155
{
@@ -168,7 +168,7 @@
168168
"name": "stdout",
169169
"output_type": "stream",
170170
"text": [
171-
"Time Elapsed: 7.12 seconds\n"
171+
"Time Elapsed: 11.87 seconds\n"
172172
]
173173
}
174174
],

0 commit comments

Comments
 (0)