"""HTTP service that extracts text lines from an uploaded document.

Each line is tagged as a heading, list item, or paragraph based on font size
and leading bullet glyphs.
"""

import tempfile

import fitz
from sanic import Sanic, json, text
from sanic.request import File, Request

upload_folder = tempfile.gettempdir()
app = Sanic("PlumberServer")

# Glyphs that mark a line as a list item when they are the line's first span.
bullets = ["▶", "•", "–"]

# Upper bound used for a line's "left" value; also the value reported when a
# line has no span contributing a position (e.g. it holds only a bullet).
_DEFAULT_LEFT = 10000


def _heading_sizes(textpages):
    """Return the (h1, h2) rounded font sizes found in *textpages*.

    The largest rounded span size is treated as h1 and the second largest as
    h2. A rank that does not exist (no text at all, or only one distinct
    size) is returned as ``None`` so that no line can match it.
    """
    sizes = {
        round(span["size"])
        for page in textpages
        for block in page["blocks"]
        for line in block["lines"]
        for span in line["spans"]
    }
    ranked = sorted(sizes, reverse=True)
    h1_size = ranked[0] if ranked else None
    h2_size = ranked[1] if len(ranked) > 1 else None
    return h1_size, h2_size


def _classify_line(line, h1_size, h2_size):
    """Build the ``{"type", "value", "left"}`` record for one text line.

    A line whose first span is exactly one of ``bullets`` becomes ``"li"``;
    that bullet span is excluded from the text, size, and left computations.
    Otherwise the line's largest rounded span size decides between ``"h1"``,
    ``"h2"``, and ``"p"``.
    """
    spans = line["spans"]
    is_bullet = bool(spans) and spans[0]["text"] in bullets
    content = spans[1:] if is_bullet else spans

    # Seed values mirror the scan's starting points: size never drops below 0
    # and left never exceeds _DEFAULT_LEFT.
    largest_size = max([0, *(round(span["size"]) for span in content)])
    smallest_left = min(
        [_DEFAULT_LEFT, *(round(span["origin"][0]) for span in content)]
    )
    line_text = "".join(span["text"] for span in content)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"

    return {
        "type": line_type,
        "value": line_text,
        "left": smallest_left,
    }


@app.get("/")
def healthz(_: Request):
    """Liveness probe: always answers with a fixed plain-text body."""
    return text("all good :)")


@app.post("/")
def process_file(request: Request):
    """Parse the uploaded ``file`` form field and return its classified lines.

    Responds with JSON shaped as pages -> blocks -> lines, where each line is
    a ``{"type", "value", "left"}`` object. Responds with a 400 plain-text
    error when no file was uploaded, the ``file`` field is missing, or the
    upload cannot be opened as a document.
    """
    if request.files is None:
        return text("no files were uploaded", 400)

    file: File | None = request.files.get("file")
    if file is None:
        return text("file not found", 400)

    try:
        doc = fitz.open(stream=file.body)
    except RuntimeError:
        # PyMuPDF's file errors (empty or corrupt data) derive from
        # RuntimeError; a bad upload is a client error, not a server fault.
        return text("file could not be opened", 400)

    try:
        textpages = [page.get_textpage().extractDICT() for page in doc]
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    h1_size, h2_size = _heading_sizes(textpages)

    pass1 = [
        [
            [_classify_line(line, h1_size, h2_size) for line in block["lines"]]
            for block in page["blocks"]
        ]
        for page in textpages
    ]

    return json(pass1)