# app.py
# Extract per-page text and embedded images from a PDF into a JSON file.
import sys
import os
import json
import PyPDF2
from PIL import Image
import io
def _save_image_as_jpeg(image_bytes, image_filepath):
    """Decode *image_bytes* with Pillow and write the result to *image_filepath*.

    The caller always passes a ``.jpg`` file name, so Pillow selects its JPEG
    encoder.  That encoder refuses images whose mode carries alpha or a
    palette (``RGBA``, ``P``, ``LA``, ...), so such images are converted to
    ``RGB`` before saving instead of being dropped with an error.
    """
    with Image.open(io.BytesIO(image_bytes)) as decoded:
        if decoded.mode in ("1", "L", "RGB", "CMYK"):
            writable = decoded
        else:
            # Conversion discards transparency; that is the price of JPEG.
            writable = decoded.convert("RGB")
        writable.save(image_filepath)


def extract_images_and_structure_from_pdf(pdf_path, output_path):
    """Extract per-page text and images from a PDF into *output_path*.

    For every page a dict ``{"section": "Page N", "content": [...]}`` is
    built.  The page text (when non-empty) is stored as a ``"text"`` entry and
    every embedded image is written to *output_path* and stored as an
    ``"image"`` entry holding the image file name.  The list of page dicts is
    finally written to ``output.json`` inside *output_path*.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Directory receiving the images and ``output.json``.
            It is created when it does not exist yet.

    Returns:
        None.  This is a best-effort routine: a failure on a single image is
        printed and recorded in that image's JSON entry (``image_path`` is
        ``None`` plus an ``error`` message); any other failure is printed and
        ends the extraction without raising.
    """
    try:
        structured_data = []
        # Running image counter across the whole document (not per page).
        image_count = 0

        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Do not depend on the caller having prepared the directory.
            os.makedirs(output_path, exist_ok=True)

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                page_content = {"section": f"Page {page_num}", "content": []}

                # Extract text (basic approach, might need refinement)
                text = page.extract_text()
                if text:
                    page_content["content"].append({"type": "text", "text": text})

                # Extract the images embedded in the page.
                for img_file_index, img in enumerate(page.images):
                    image_count += 1
                    image_filename = f"image_{page_num}_{image_count}_{img_file_index}.jpg"
                    image_filepath = os.path.join(output_path, image_filename)
                    try:
                        _save_image_as_jpeg(img.data, image_filepath)
                    except Exception as e:
                        # One unreadable image must not abort the whole run;
                        # keep a placeholder entry describing the failure.
                        print(f"Error processing image: {e}")
                        page_content["content"].append(
                            {"type": "image", "image_path": None, "error": str(e)}
                        )
                    else:
                        page_content["content"].append(
                            {"type": "image", "image_path": image_filename}
                        )

                structured_data.append(page_content)

        # Save to JSON file
        json_filepath = os.path.join(output_path, "output.json")
        with open(json_filepath, 'w', encoding='utf-8') as f:
            json.dump(structured_data, f, indent=4, ensure_ascii=False)
        print(f"Extracted structured data and images to {json_filepath}")
    except Exception as e:
        # Top-level boundary: report the problem instead of raising.
        print(f"An error occurred: {str(e)}")
if __name__ == "__main__":
    # Script entry point: the PDF path and the output directory are taken
    # from the first two command-line arguments.
    if len(sys.argv) >= 3:
        pdf_file_path, output_path = sys.argv[1], sys.argv[2]
    else:
        print("Usage: python app.py <pdf_file_path> <output_path>")
        sys.exit(1)

    # Make sure the destination exists before anything is written to it.
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    extract_images_and_structure_from_pdf(pdf_file_path, output_path)