11# Copyright (C) 2025 Intel Corporation
22# SPDX-License-Identifier: Apache-2.0
3-
4- from fastapi import FastAPI , HTTPException , File , UploadFile
3+ from fastapi import BackgroundTasks , FastAPI , HTTPException , File , UploadFile
54from pydantic import BaseModel
65import fitz # PyMuPDF
76from pathlib import Path
1211from generate_image_embedding import generate_image_embedding
1312from fastapi .responses import FileResponse , JSONResponse
1413from generate_pptx import create_pptx
14+ from generate_pptx import create_pptx
1515from starlette .background import BackgroundTask
1616import tempfile
1717import imagehash
1818from PIL import Image
1919import io
20+ import uuid
21+ from typing import Dict
22+ import json
2023
2124app = FastAPI ()
2225
2629OUTPUT_DIR .mkdir (parents = True , exist_ok = True )
2730
2831
29- @app .post ("/parse" )
30- async def parse_pdf (file : UploadFile = File (...)):
31- """
32- Endpoint to parse a PDF file uploaded via multipart/form-data.
33- Extracts images, generates captions and embeddings, and returns the data.
34- """
35- temp_file_path = None
32+ def process_pdf_to_file (job_id : str , pdf_path : str , filename : str ):
3633 try :
37- # Create temp file with delete=False to avoid Windows file locking issues
38- with tempfile .NamedTemporaryFile (delete = False , suffix = ".pdf" ) as temp_file :
39- temp_file .write (await file .read ())
40- temp_file_path = temp_file .name
41-
42- print (f"DEBUG : Temporary PDF file created at: { temp_file_path } " )
43- # Open the PDF file using PyMuPDF (now works on Windows since file is closed)
44- pdf_file = fitz .open (str (temp_file_path ))
34+ print (f"Processing job { job_id } " )
35+ pdf_file = fitz .open (str (pdf_path ))
4536 image_data = []
4637 image_order = 1
4738 seen_hashes = set ()
@@ -88,33 +79,67 @@ async def parse_pdf(file: UploadFile = File(...)):
8879
8980 # Prepare the response data
9081 response_data = {
91- "name" : file . filename ,
82+ "name" : filename ,
9283 "details" : f"Extracted { len (image_data )} images from the PDF." ,
9384 "images" : image_data ,
9485 "text" : extracted_text ,
9586 }
9687
97- return JSONResponse (content = response_data )
88+ temp_dir = tempfile .gettempdir ()
89+ result_path = os .path .join (temp_dir , f"{ job_id } .json" )
90+ with open (result_path , "w" ) as f :
91+ json .dump (response_data , f )
9892
9993 except Exception as e :
100- print (f"Error processing PDF: { e } " )
101- raise HTTPException (
102- status_code = 500 , detail = f"An error occurred while processing the PDF: { e } "
103- )
94+ print (f"Error in processing pdf job_id: { job_id } : { e } " )
95+
10496 finally :
105- # Clean up temporary file on Windows
106- if temp_file_path and os .path .exists (temp_file_path ):
107- try :
108- os .unlink (temp_file_path )
109- print (f"DEBUG: Cleaned up temporary file: { temp_file_path } " )
110- except Exception as cleanup_error :
111- print (
112- f"Warning: Failed to clean up temporary file { temp_file_path } : { cleanup_error } "
113- )
97+ try :
98+ if os .path .exists (pdf_path ):
99+ os .remove (pdf_path )
100+ except Exception as cleanup_err :
101+ print (f"Warning: Failed to remove temporary PDF { pdf_path } : { cleanup_err } " )
102+
103+
@app.post("/upload")
async def upload_file(
    file: UploadFile = File(...), background_tasks: BackgroundTasks = None
):
    """Store an uploaded PDF in the temp directory and queue it for parsing.

    The upload is copied to ``<tmpdir>/<job_id>_<filename>`` and
    ``process_pdf_to_file`` is scheduled as a background task for it.

    Args:
        file: The uploaded file from the multipart/form-data request.
        background_tasks: Task queue used to schedule the background parsing.

    Returns:
        A dict ``{"jobID": <uuid4 string>}`` identifying the queued job.

    Raises:
        HTTPException: 500 if the upload cannot be saved or scheduled.
    """
    tmp_path = None
    try:
        # Generate job ID
        job_id = str(uuid.uuid4())

        # The client-supplied filename is untrusted: keep only its final
        # component (for both "/" and "\\" separators) so it can neither point
        # outside the temp directory nor reference a missing sub-directory.
        raw_name = file.filename or ""
        safe_name = os.path.basename(raw_name.replace("\\", "/")) or "upload.pdf"
        tmp_path = os.path.join(tempfile.gettempdir(), f"{job_id}_{safe_name}")

        # Save uploaded file to the temp directory
        with open(tmp_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Schedule background PDF processing; the original filename is passed
        # along unchanged because it is only used as a display name.
        background_tasks.add_task(process_pdf_to_file, job_id, tmp_path, file.filename)

        return {"jobID": job_id}
    except Exception as e:
        # Do not leave a partially written upload behind when the request fails.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as cleanup_err:
                print(
                    f"Warning: Failed to remove temporary PDF {tmp_path}: {cleanup_err}"
                )
        raise HTTPException(
            status_code=500, detail=f"Error uploading file: {e}"
        ) from e
124+
125+
@app.get("/result/{job_id}")
def get_result(job_id: str):
    """Return the parsed result for a job, or 202 while it is not available.

    Args:
        job_id: The job identifier returned by the upload endpoint.

    Returns:
        The decoded JSON result when ``<tmpdir>/<job_id>.json`` exists and is
        complete; otherwise a 202 ``JSONResponse`` telling the client to retry.

    Raises:
        HTTPException: 400 if ``job_id`` is not a valid UUID.
    """
    # job_id comes straight from the URL and is used to build a filesystem
    # path, so accept only real UUIDs and use their canonical text form.
    try:
        canonical_id = str(uuid.UUID(job_id))
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.") from None

    result_path = os.path.join(tempfile.gettempdir(), f"{canonical_id}.json")

    try:
        # Open directly instead of exists()-then-open to avoid a race with
        # the background writer.
        with open(result_path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Either the result has not been written yet, or it is still being
        # written and is not valid JSON so far; both mean "not ready".
        return JSONResponse(
            status_code=202, content={"message": "PDF processing not complete yet."}
        )
114138
115139
class PPTXRequest(BaseModel):
    """Request body for the PPTX generation endpoint."""

    # Presentation payload as a free-form dict; presumably the structure
    # normalized by validate_and_transform_content -- confirm against caller.
    content: dict

    # Optional language code for the generated deck; defaults to "en".
    language: str | None = "en"
118143
119144
120145def validate_and_transform_content (content : dict ) -> dict :
@@ -130,7 +155,7 @@ def validate_and_transform_content(content: dict) -> dict:
130155 """
131156 # Ensure required keys exist with default values if missing
132157 transformed_content = {
133- "title" : content .get ("title" , "Untitled Presentation " ),
158+ "title" : content .get ("title" , "" ),
134159 "contentType" : content .get ("contentType" , "lecture" ),
135160 "difficultyLevel" : content .get ("difficultyLevel" , "intermediate" ),
136161 "slides" : content .get ("slides" , []),
@@ -143,13 +168,13 @@ def validate_and_transform_content(content: dict) -> dict:
143168
144169 # Validate slides structure
145170 for slide in transformed_content ["slides" ]:
146- slide .setdefault ("title" , "Untitled Slide " )
171+ slide .setdefault ("title" , "" )
147172 slide .setdefault ("content" , [])
148173 slide .setdefault ("notes" , "" )
149174
150175 # Validate activities structure
151176 for activity in transformed_content ["activities" ]:
152- activity .setdefault ("title" , "Untitled Activity " )
177+ activity .setdefault ("title" , "" )
153178 activity .setdefault ("description" , "" )
154179 activity .setdefault ("type" , "Exercise" )
155180 activity .setdefault ("duration" , "20 minutes" )
@@ -168,13 +193,13 @@ def validate_and_transform_content(content: dict) -> dict:
168193
169194 # Validate key terms structure
170195 for term in transformed_content ["keyTerms" ]:
171- term .setdefault ("term" , "Untitled Term " )
172- term .setdefault ("definition" , "No definition provided. " )
196+ term .setdefault ("term" , "" )
197+ term .setdefault ("definition" , "" )
173198
174199 # Validate further readings structure
175200 for reading in transformed_content ["furtherReadings" ]:
176- reading .setdefault ("title" , "Untitled Reading " )
177- reading .setdefault ("author" , "Unknown Author " )
201+ reading .setdefault ("title" , "" )
202+ reading .setdefault ("author" , "" )
178203 reading .setdefault ("readingDescription" , "" )
179204
180205 return transformed_content
@@ -198,7 +223,10 @@ async def generate_pptx(request: PPTXRequest):
198223 print (temp_pptx_path )
199224
200225 # Generate the PPTX file
201- create_pptx (transformed_content , temp_pptx_path )
226+ lang = (request .language or "en" ).lower ()
227+ if lang not in ["en" , "id" ]:
228+ lang = "en"
229+ create_pptx (transformed_content , temp_pptx_path , lang )
202230 print (f"Temporary PPTX file created at: { temp_pptx_path } " )
203231
204232 if not os .path .exists (temp_pptx_path ):
0 commit comments