forked from adithya-s-k/marker-api
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinvoke.py
More file actions
71 lines (57 loc) · 2.63 KB
/
invoke.py
File metadata and controls
71 lines (57 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import requests
from PIL import Image
import base64
import argparse
"""
Example usage (single PDF):

    python invoke.py --server_url http://127.0.0.1:8000/convert --filename test.pdf --output output
"""
def _safe_path_component(name):
    """Return the final path component of *name*, refusing unusable values.

    Names arrive in the server response, so directory parts (``../``,
    absolute prefixes) are dropped to keep every write inside the output
    folder.

    Raises:
        ValueError: If nothing usable is left (``""``, ``"."`` or ``".."``).
    """
    component = os.path.basename(name)
    if component in ("", ".", ".."):
        raise ValueError(f"Unsafe file name in server response: {name!r}")
    return component


def save_images_and_markdown(response_data, output_folder):
    """Write the markdown and images of each converted PDF to disk.

    For every entry in *response_data* a sub-folder named after the PDF
    (file name without its extension) is created inside *output_folder*.
    The markdown is written there as ``output.md`` and each image is
    base64-decoded and written under its own name.

    Args:
        response_data: Iterable of dicts with the keys ``'filename'``,
            ``'markdown'`` and, optionally, ``'images'`` (a mapping of image
            file name to base64-encoded content).
        output_folder: Directory that receives one sub-folder per PDF. It is
            created if it does not exist.

    Raises:
        ValueError: If a PDF or image name has no usable final component.
        KeyError: If an entry lacks ``'filename'`` or ``'markdown'``.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for pdf in response_data:
        # Only the final component of the reported name is used, so a name
        # such as "../../x.pdf" cannot redirect output outside output_folder.
        pdf_stem = os.path.splitext(_safe_path_component(pdf['filename']))[0]
        pdf_output_folder = os.path.join(output_folder, pdf_stem)

        # Create a folder for each PDF
        os.makedirs(pdf_output_folder, exist_ok=True)

        # Save markdown
        markdown_path = os.path.join(pdf_output_folder, 'output.md')
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(pdf['markdown'])

        # Save images; a missing or null 'images' entry means nothing to save
        image_data = pdf.get('images') or {}
        for image_name, image_base64 in image_data.items():
            image_path = os.path.join(
                pdf_output_folder, _safe_path_component(image_name)
            )
            with open(image_path, 'wb') as f:
                f.write(base64.b64decode(image_base64))
def convert_pdf_to_markdown_and_save(pdf_file_paths, output_folder, server_url):
    """Upload PDFs to the conversion server and store what it returns.

    Every path in *pdf_file_paths* is read into memory and attached to a
    single POST request to *server_url* under the ``pdf_files`` field. On a
    200 response the JSON body is handed to ``save_images_and_markdown``;
    any other status prints the response text prefixed with ``Error:``.

    Args:
        pdf_file_paths: Iterable of paths to the PDF files to convert.
        output_folder: Directory passed on to ``save_images_and_markdown``.
        server_url: URL the files are posted to.
    """

    def _as_upload_part(path):
        # Read the whole file up front so the handle is closed before sending.
        with open(path, 'rb') as handle:
            payload = handle.read()
        return ('pdf_files', (os.path.basename(path), payload, 'application/pdf'))

    # One multipart entry per PDF, all sent in the same request
    upload_parts = [_as_upload_part(path) for path in pdf_file_paths]
    reply = requests.post(server_url, files=upload_parts)

    # Anything other than 200 is reported and nothing is written
    if reply.status_code != 200:
        print(f"Error: {reply.text}")
        return

    save_images_and_markdown(reply.json(), output_folder)
    print("Markdown and images saved successfully.")
if __name__ == "__main__":
    # Command-line entry point: three required options, then one conversion run.
    cli = argparse.ArgumentParser(description='Convert PDF to markdown and save.')
    cli.add_argument(
        '--server_url',
        type=str,
        required=True,
        help='URL of the server for PDF conversion',
    )
    # nargs='+' lets --filename take one or more paths
    cli.add_argument(
        '--filename',
        type=str,
        nargs='+',
        required=True,
        help='Paths to the PDF files',
    )
    cli.add_argument(
        '--output',
        type=str,
        required=True,
        help='Output folder for saving markdown and images',
    )
    options = cli.parse_args()

    # Send the PDFs to the server and write the results under the output folder
    convert_pdf_to_markdown_and_save(
        pdf_file_paths=options.filename,
        output_folder=options.output,
        server_url=options.server_url,
    )
"""
Example usage (multiple PDFs in one request):

    python invoke.py --server_url http://127.0.0.1:8000/convert --filename test1.pdf test2.pdf --output output
"""