Commit 5916c33
Merge pull request #29 from intel/update-branch-18029995658
fix: timeout when uploading large file
2 parents: 703a49c + e49206b

File tree: 3 files changed (+179, -74 lines)


backend/main.py

Lines changed: 56 additions & 32 deletions
@@ -1,7 +1,6 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
-from fastapi import FastAPI, HTTPException, File, UploadFile
+from fastapi import BackgroundTasks, FastAPI, HTTPException, File, UploadFile
 from pydantic import BaseModel
 import fitz  # PyMuPDF
 from pathlib import Path
@@ -12,11 +11,15 @@
 from generate_image_embedding import generate_image_embedding
 from fastapi.responses import FileResponse, JSONResponse
 from generate_pptx import create_pptx
+from generate_pptx import create_pptx
 from starlette.background import BackgroundTask
 import tempfile
 import imagehash
 from PIL import Image
 import io
+import uuid
+from typing import Dict
+import json
 
 app = FastAPI()
 
@@ -26,22 +29,10 @@
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
 
-@app.post("/parse")
-async def parse_pdf(file: UploadFile = File(...)):
-    """
-    Endpoint to parse a PDF file uploaded via multipart/form-data.
-    Extracts images, generates captions and embeddings, and returns the data.
-    """
-    temp_file_path = None
+def process_pdf_to_file(job_id: str, pdf_path: str, filename: str):
     try:
-        # Create temp file with delete=False to avoid Windows file locking issues
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-            temp_file.write(await file.read())
-            temp_file_path = temp_file.name
-
-        print(f"DEBUG : Temporary PDF file created at: {temp_file_path}")
-        # Open the PDF file using PyMuPDF (now works on Windows since file is closed)
-        pdf_file = fitz.open(str(temp_file_path))
+        print(f"Processing job {job_id}")
+        pdf_file = fitz.open(str(pdf_path))
         image_data = []
         image_order = 1
         seen_hashes = set()
@@ -88,29 +79,62 @@ async def parse_pdf(file: UploadFile = File(...)):
 
         # Prepare the response data
        response_data = {
-            "name": file.filename,
+            "name": filename,
             "details": f"Extracted {len(image_data)} images from the PDF.",
             "images": image_data,
             "text": extracted_text,
         }
 
-        return JSONResponse(content=response_data)
+        temp_dir = tempfile.gettempdir()
+        result_path = os.path.join(temp_dir, f"{job_id}.json")
+        with open(result_path, "w") as f:
+            json.dump(response_data, f)
 
     except Exception as e:
-        print(f"Error processing PDF: {e}")
-        raise HTTPException(
-            status_code=500, detail=f"An error occurred while processing the PDF: {e}"
-        )
+        print(f"Error in processing pdf job_id: {job_id}: {e}")
+
     finally:
-        # Clean up temporary file on Windows
-        if temp_file_path and os.path.exists(temp_file_path):
-            try:
-                os.unlink(temp_file_path)
-                print(f"DEBUG: Cleaned up temporary file: {temp_file_path}")
-            except Exception as cleanup_error:
-                print(
-                    f"Warning: Failed to clean up temporary file {temp_file_path}: {cleanup_error}"
-                )
+        try:
+            if os.path.exists(pdf_path):
+                os.remove(pdf_path)
+        except Exception as cleanup_err:
+            print(f"Warning: Failed to remove temporary PDF {pdf_path}: {cleanup_err}")
+
+
+@app.post("/upload")
+async def upload_file(
+    file: UploadFile = File(...), background_tasks: BackgroundTasks = None
+):
+    try:
+        # Generate job ID
+        job_id = str(uuid.uuid4())
+        tmp_dir = tempfile.gettempdir()
+        tmp_path = os.path.join(tmp_dir, f"{job_id}_{file.filename}")
+
+        # Save uploaded file to /tmp
+        with open(tmp_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        # Schedule background PDF processing
+        background_tasks.add_task(process_pdf_to_file, job_id, tmp_path, file.filename)
+
+        return {"jobID": job_id}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error uploading file: {e}")
+
+
+@app.get("/result/{job_id}")
+def get_result(job_id: str):
+    temp_dir = tempfile.gettempdir()
+    result_path = os.path.join(temp_dir, f"{job_id}.json")
+    if not os.path.exists(result_path):
+        return JSONResponse(
+            status_code=202, content={"message": "PDF processing not complete yet."}
+        )
+
+    with open(result_path, "r") as f:
+        result = json.load(f)
    return result
 
 
 class PPTXRequest(BaseModel):
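
Taken together, the backend change replaces the single synchronous /parse endpoint (which held the HTTP request open for the whole parse and timed out on large files, per the commit message) with a job-based flow: POST /upload stores the PDF under a fresh UUID and returns {"jobID": ...} immediately, a FastAPI background task runs process_pdf_to_file and writes the result to {job_id}.json in the temp directory, and GET /result/{job_id} answers 202 until that file appears. A minimal Python client sketch of the new contract; the base URL and the requests dependency are assumptions for illustration, not part of the commit:

import time

import requests  # third-party HTTP client; an assumption for this sketch

BASE_URL = "http://localhost:8000"  # assumed server address, not from the diff

def parse_pdf(path: str, poll_interval: float = 3.0, max_polls: int = 100) -> dict:
    # Upload returns immediately with a job ID instead of the parsed result.
    with open(path, "rb") as f:
        resp = requests.post(f"{BASE_URL}/upload", files={"file": f})
    resp.raise_for_status()
    job_id = resp.json()["jobID"]

    # /result/{job_id} returns 202 while the background task is still running.
    for _ in range(max_polls):
        result = requests.get(f"{BASE_URL}/result/{job_id}")
        if result.status_code == 202:
            time.sleep(poll_interval)
            continue
        result.raise_for_status()
        return result.json()  # {"name", "details", "images", "text"}
    raise TimeoutError(f"Job {job_id} did not finish after {max_polls} polls")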

frontend/src/lib/embedding/generate-embedding.ts

Lines changed: 93 additions & 33 deletions
@@ -6,7 +6,7 @@ import { EmbeddingChunk } from '../types/embedding-chunk'
 import { embed } from 'ai'
 import { verifyModel } from '../model/model-manager'
 import { detokenize, effectiveTokenCount, tokenize } from '../utils'
-
+import { randomInt } from 'crypto'
 /**
  * Generates embeddings for a given text using a specified model.
  *
@@ -17,6 +17,22 @@ import { detokenize, effectiveTokenCount, tokenize } from '../utils'
  * @returns A promise that resolves to an array of embedding chunks.
  * @throws An error if the model verification or embedding generation fails.
  */
+
+function sanitizeChunk(text: string): string {
+  return (
+    text
+      // Collapse long runs of periods (..... -> .)
+      .replace(/([.])\1{2,}/g, '$1')
+      // Collapse long runs of dashes, underscores, etc. (optional)
+      .replace(/([-_*])\1{2,}/g, '$1')
+      // Remove zero-width and control characters
+      .replace(/[\u0000-\u001F\u007F-\u009F\u200B]/g, '')
+      // Collapse extra whitespace
+      .replace(/\s{2,}/g, ' ')
+      .trim()
+  )
+}
+
 export async function generateEmbeddings(
   text: string,
   chunkSizeToken: number,
@@ -73,46 +89,90 @@ export async function generateEmbeddings(
   let completedCount = 0
   const totalChunks = chunks.length
   console.log('DEBUG: generateEmbeddings totalChunks:', totalChunks)
-  const embeddingPromises = chunks.map(async (chunk, index) => {
-    try {
-      const { embedding } = await embed({
-        model: ollama.embedding(modelName),
-        value: chunk,
-      })
-      // console.log(
-      //   `Embedding generated for chunk ${index + 1}/${chunks.length}`
-      // );
-      completedCount++
-      const completionPercentage = ((completedCount / totalChunks) * 100).toFixed(2)
-      // console.log(
-      //   `Embedding generation: ${completionPercentage}% (${completedCount}/${totalChunks})`
-      // );
-      const tokens = tokenize(chunk)
-      console.log(
-        `DEBUG: generateEmbeddings: ${completionPercentage}% (${completedCount}/${totalChunks}) | ` +
-          `[${index}]: ${chunk.length} chars | ` +
-          `Adjusted token (${chunkSizeToken}): ${tokens.length}`,
-      )
+  async function embedChunk(chunk: string, index: number): Promise<EmbeddingChunk | null> {
+    const sanitized = sanitizeChunk(chunk)
+    // Log full chunk if sanitization changed it
+    if (sanitized !== chunk) {
+      const sanitizeLog = `
+Sanitized chunk ${index + 1}:
+Before: ${chunk}
+After : ${sanitized}
+Length: ${chunk.length} -> ${sanitized.length}
+-------`
+      console.log(sanitizeLog)
+    }
+    const maxRetries = 5
+    const tokens = tokenize(chunk)
+    const preview = chunk.slice(0, 500)
 
-      return {
-        order: index + 1, // 1-based order
-        chunk: chunk,
-        embedding: embedding, // assumed to be a number[]
-        sourceType: 'user' as const, // Specify sourceType for user-generated embeddings
+    for (let attempt = 1; attempt <= maxRetries; attempt++) {
+      try {
+        const { embedding } = await embed({
+          model: ollama.embedding(modelName),
+          value: sanitized,
+        })
+        completedCount++
+        const completionPercentage = ((completedCount / totalChunks) * 100).toFixed(2)
+
+        const successLog = `
+Successful embedding for chunk ${index + 1}/${totalChunks}
+Length: ${chunk.length}, Tokens: ${tokens.length}
+Preview: ${preview}
+Completion: ${completionPercentage}% (${completedCount}/${totalChunks})
+-------`
+        console.log(successLog)
+        return {
+          order: index + 1,
+          chunk: sanitized,
+          embedding,
+          sourceType: 'user' as const,
+        }
+      } catch (err: unknown) {
+        let message: string
+        if (err instanceof Error) {
+          message = err.message
+        } else if (typeof err === 'string') {
+          message = err
+        } else {
+          message = JSON.stringify(err)
+        }
+
+        const errorLog = `
+Attempt ${attempt}/${maxRetries} failed for chunk ${index + 1}/${totalChunks}
+Length: ${chunk.length}, Tokens: ${tokens.length}
+Preview: ${preview}
+Error: ${message}
+-------`
+        console.error(errorLog)
+        if (attempt < maxRetries) {
+          const jitter = randomInt(0, 100)
+          const delay = 500 * 2 ** (attempt - 1) + jitter
+          await new Promise<void>((resolve) => {
+            setTimeout(() => resolve(), delay)
+          })
+        }
       }
-    } catch (error) {
-      throw new Error(
-        `Failed to generate embedding for chunk ${index + 1}/${totalChunks}: ${error}`,
-      )
     }
-  })
 
-  const results = await Promise.all(embeddingPromises)
+    const finalErrorLog = `
+Failed permanently for chunk ${index + 1}/${totalChunks}
+Length: ${chunk.length}, Tokens: ${tokens.length}
+Preview: ${preview}
+-------`
+    console.error(finalErrorLog)
+    return null
+  }
+
+  const embeddingPromises = chunks.map((chunk, index) => embedChunk(chunk, index))
+  const settled = await Promise.all(embeddingPromises)
+
+  const results = settled.filter((r): r is EmbeddingChunk => r !== null)
+
   const endTime = Date.now()
   const totalTimeTakenMs = endTime - startTime
   const totalTimeTakenSec = (totalTimeTakenMs / 1000).toFixed(2)
   console.log(
-    `Generated ${chunks.length} embeddings in ${totalTimeTakenMs}ms (${totalTimeTakenSec}s)`,
+    `Generated ${results.length}/${chunks.length} embeddings in ${totalTimeTakenMs}ms (${totalTimeTakenSec}s)`,
   )
 
   return results
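
The retry loop above makes up to five attempts per chunk and sleeps 500 ms * 2^(attempt - 1) plus 0-100 ms of random jitter between them, so a chunk that fails every attempt spends roughly 7.5 s in backoff before being dropped (returned as null) instead of rejecting the whole Promise.all, as the old code did. The same schedule sketched standalone in Python (hypothetical helper, for illustration only, not part of the commit):

import random
import time

def with_retries(fn, max_retries: int = 5, base_ms: int = 500, jitter_ms: int = 100):
    # Call fn(); on failure, wait base_ms * 2**(attempt - 1) plus random jitter
    # and try again, mirroring the TypeScript loop above.
    for attempt in range(1, max_retries + 1):
        try:
            return fn()
        except Exception:
            if attempt == max_retries:
                raise  # the TS version logs and returns null here instead
            delay_ms = base_ms * 2 ** (attempt - 1) + random.randint(0, jitter_ms)
            time.sleep(delay_ms / 1000)

The jitter keeps chunks that fail at the same moment from retrying in lockstep against the same embedding endpoint.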

frontend/src/lib/extract-file-data.ts

Lines changed: 30 additions & 9 deletions
@@ -49,25 +49,46 @@ export async function extractFileData(file: {
     const formData = new FormData()
     formData.append('file', new Blob([new Uint8Array(data)], { type: mimetype }), file.name)
 
-    const parsefastApiUrl = new URL('/parse', process.env.FASTAPI_SERVER_URL).href
-    const fastApiResponse = await fetch(parsefastApiUrl, {
+    // Upload file and get response to confirm upload status
+
+    const url = new URL('/upload', process.env.FASTAPI_SERVER_URL)
+    const uploadResponse = await fetch(url, {
       method: 'POST',
       body: formData,
     })
+    if (!uploadResponse.ok) {
+      throw new Error('Failed to upload file to FastAPI server')
+    }
 
-    if (!fastApiResponse.ok) {
-      throw new Error('Failed to parse PDF on FastAPI server')
+    const { jobID } = await uploadResponse.json()
+
+    // Poll /result/{jobID} until done
+    const pollResult = async (): Promise<ExtractedData> => {
+      const url = new URL(`/result/${encodeURIComponent(jobID)}`, process.env.FASTAPI_SERVER_URL)
+      const pollRes = await fetch(url)
+      if (pollRes.status === 202) {
+        // Not ready yet, wait and retry
+        await new Promise<void>((resolve) => {
+          setTimeout(() => resolve(), 3000)
+        })
+        return pollResult()
+      }
+      if (!pollRes.ok) {
+        throw new Error('Failed to retrieve processed PDF result')
+      }
+      return pollRes.json()
     }
 
-    const fastApiData = await fastApiResponse.json()
-    extractedText = fastApiData.text
-    extractedImages = fastApiData.images
+    const parsedData = await pollResult()
+    extractedText = parsedData.text
+    extractedImages = parsedData.images
   } else if (mimetype.includes('text') || ext === '.txt') {
     fileType = 'txt'
-    // contentSequence.push({ type: "text", content: data.toString("utf-8") });
+
+    // contentSequence.push({ type: 'text', content: data.toString('utf-8') });
   } else if (mimetype.includes('markdown') || ext === '.md') {
     fileType = 'md'
-    // contentSequence.push({ type: "text", content: data.toString("utf-8") });
+    // contentSequence.push({ type: 'text', content: data.toString('utf-8') });
   } else {
     throw new Error('Unsupported file type')
   }