88 lines
2.5 KiB
Python
Raw Permalink Normal View History

2024-04-20 11:51:57 +01:00
import tempfile
2024-04-18 11:49:41 +01:00
import fitz
2024-04-20 11:51:57 +01:00
from sanic import Sanic, json, text
from sanic.request import File, Request
2024-04-18 11:49:41 +01:00
2024-04-20 11:51:57 +01:00
# System temporary directory, as reported by the standard library.
# Not referenced by the handlers visible in this part of the file.
upload_folder = tempfile.gettempdir()
# Sanic application object; the route decorators below register handlers on it.
app = Sanic("PlumberServer")
2024-04-18 11:49:41 +01:00
2024-10-01 14:32:36 +01:00
# Span texts that mark a line as a list item when they are the first span of
# that line (see process_file).
# NOTE(review): most entries appear as empty strings here; presumably the
# original bullet glyphs were lost in rendering -- confirm against the
# repository copy before relying on this list.
bullets = ["", "", "", "§", "", "", "", "", "", ""]
2024-04-18 11:49:41 +01:00
2024-04-20 11:51:57 +01:00
@app.get("/")
def healthz(_: Request):
    """Health-check endpoint: always answer with a fixed plain-text body.

    The incoming request is ignored.
    """
    response = text("all good :)")
    return response
2024-04-18 11:49:41 +01:00
def _heading_sizes(textpages: list[dict]) -> tuple[int | None, int | None]:
    """Return the rounded font sizes used to detect ``h1`` and ``h2`` lines.

    The largest rounded span size found in *textpages* is the ``h1`` size and
    the second largest is the ``h2`` size. An element is ``None`` when the
    document does not contain enough distinct sizes (no spans at all, or every
    span in the same size), so callers never index past the end of the list.
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    sizes.add(round(span["size"]))
    ranked = sorted(sizes, reverse=True)
    h1_size = ranked[0] if ranked else None
    h2_size = ranked[1] if len(ranked) > 1 else None
    return h1_size, h2_size


def _classify_line(line: dict, h1_size: int | None, h2_size: int | None) -> dict:
    """Turn one extracted line into a ``{"type", "value", "left"}`` record.

    ``type`` is ``"li"`` when the first span's text is listed in ``bullets``,
    otherwise ``"h1"``/``"h2"`` when the line's largest rounded span size
    equals the corresponding heading size, otherwise ``"p"``.
    ``value`` is the concatenated span text (without the bullet span) and
    ``left`` is the smallest rounded x-origin of the non-bullet spans.
    """
    is_bullet = False
    largest_size = 0
    # Starting value for the running minimum; a line made up of nothing but a
    # bullet span keeps this value (and an empty "value").
    smallest_left = 10000
    parts: list[str] = []
    for span_index, span in enumerate(line["spans"]):
        span_text = span["text"]
        if span_index == 0 and span_text in bullets:
            # The bullet marker itself is dropped from the text and does not
            # contribute to the size/left measurements.
            is_bullet = True
            continue
        largest_size = max(largest_size, round(span["size"]))
        smallest_left = min(smallest_left, round(span["origin"][0]))
        parts.append(span_text)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"
    return {
        "type": line_type,
        "value": "".join(parts),
        "left": smallest_left,
    }


@app.post("/")
def process_file(request: Request):
    """Extract the text lines of an uploaded document and classify each one.

    Expects a multipart upload with a ``file`` field. The document is opened
    with PyMuPDF from the uploaded bytes, every page is converted with
    ``extractDICT()``, and each line is tagged as ``li``, ``h1``, ``h2`` or
    ``p`` (see ``_classify_line``).

    Returns:
        A JSON response shaped as pages -> blocks -> lines, where each line is
        ``{"type": ..., "value": ..., "left": ...}``; or a plain-text 400
        response when no files were uploaded or the ``file`` field is missing.
    """
    if request.files is None:
        return text("no files were uploaded", 400)

    file: File | None = request.files.get("file")
    if file is None:
        return text("file not found", 400)

    doc = fitz.open(stream=file.body)
    try:
        textpages = []
        for page in doc:
            assert isinstance(page, fitz.Page)  # narrows the type for checkers
            textpages.append(page.get_textpage().extractDICT())
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    # Documents with no text or a single font size no longer raise IndexError:
    # the missing heading size is None and simply never matches a line.
    h1_size, h2_size = _heading_sizes(textpages)

    pass1 = [
        [
            [_classify_line(line, h1_size, h2_size) for line in block["lines"]]
            for block in page["blocks"]
        ]
        for page in textpages
    ]
    return json(pass1)