plumberserver/main.py
Pal Kerecsenyi d106d181fb
All checks were successful
/ build (push) Successful in 2m12s
Add more bullet points
2024-10-01 14:32:36 +01:00

88 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import tempfile
import fitz
from sanic import Sanic, json, text
from sanic.request import File, Request
# Path of the system temporary directory.
# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
upload_folder = tempfile.gettempdir()
# Sanic application object; the route decorators below register handlers on it.
app = Sanic("PlumberServer")
# Span texts that mark a line as a list item when they are the line's first span.
# NOTE(review): most entries show up as empty strings here; they are presumably
# private-use bullet glyphs that this view cannot render — verify against the raw file.
bullets = ["", "", "", "§", "", "", "", "", "", ""]
@app.get("/")
def healthz(_: Request):
    """Answer GET / with a fixed plain-text confirmation.

    The incoming request is not inspected.
    """
    response = text("all good :)")
    return response
# Starting value for the per-line left-edge minimum. It is also what gets
# reported as "left" when a line has no span other than a leading bullet.
_LEFT_EDGE_START = 10000


def _heading_tags(textpages: list) -> dict[int, str]:
    """Map the largest rounded font sizes in the document to heading tags.

    The largest distinct size becomes "h1" and the second largest "h2".
    Documents with fewer than two distinct sizes simply yield fewer entries
    (possibly none), so callers never have to index into a too-short list.

    Args:
        textpages: One ``extractDICT()`` result per page.

    Returns:
        A dict from rounded font size to "h1" or "h2".
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    sizes.add(round(span["size"]))
    # zip stops at the shorter input, so 0, 1 or 2+ sizes are all handled.
    return dict(zip(sorted(sizes, reverse=True), ("h1", "h2")))


def _classify_line(line: dict, heading_tags: dict[int, str]) -> dict:
    """Turn one extracted text line into a ``{"type", "value", "left"}`` record.

    A line whose first span text is listed in the module-level ``bullets`` is
    an "li"; that bullet span is excluded from the text, size and left-edge
    calculations. Otherwise the line's largest rounded span size selects
    "h1"/"h2" via ``heading_tags``, falling back to "p".

    Args:
        line: A line dict with a "spans" list; each span provides "text",
            "size" and "origin".
        heading_tags: Mapping produced by ``_heading_tags``.

    Returns:
        A dict with the line type, the concatenated span text and the
        smallest rounded x-origin among the counted spans.
    """
    spans = line["spans"]
    is_bullet = bool(spans) and spans[0]["text"] in bullets
    counted = spans[1:] if is_bullet else spans

    # Seed values are included so an empty `counted` keeps the original
    # defaults (size 0, left edge _LEFT_EDGE_START).
    largest_size = max([0, *(round(span["size"]) for span in counted)])
    smallest_left = min(
        [_LEFT_EDGE_START, *(round(span["origin"][0]) for span in counted)]
    )

    if is_bullet:
        line_type = "li"
    else:
        line_type = heading_tags.get(largest_size, "p")

    return {
        "type": line_type,
        "value": "".join(span["text"] for span in counted),
        "left": smallest_left,
    }


@app.post("/")
def process_file(request: Request):
    """Extract the text lines of an uploaded document and label each one.

    The document is read from the multipart field named "file". Every line is
    labelled "li", "h1", "h2" or "p" (see ``_classify_line``).

    Args:
        request: The incoming Sanic request.

    Returns:
        A 400 text response when no files were uploaded or the "file" field
        is missing; otherwise a JSON response shaped as
        pages -> blocks -> lines, each line being
        ``{"type": ..., "value": ..., "left": ...}``.
    """
    if request.files is None:
        return text("no files were uploaded", 400)

    file: File | None = request.files.get("file")
    if file is None:
        return text("file not found", 400)

    doc = fitz.open(stream=file.body)
    try:
        textpages = []
        for page in doc:
            assert isinstance(page, fitz.Page)
            textpages.append(page.get_textpage().extractDICT())
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()

    heading_tags = _heading_tags(textpages)

    pages_out = []
    for page in textpages:
        pages_out.append([
            [_classify_line(line, heading_tags) for line in block["lines"]]
            for block in page["blocks"]
        ])
    return json(pages_out)