"""HTTP service that turns an uploaded document into typed lines of text.

A single POST endpoint ("/") accepts a multipart upload in the ``file``
field, extracts the text with PyMuPDF and labels every line as ``h1``,
``h2``, ``li`` or ``p`` based on font size and a leading bullet glyph.
"""

from os import getenv
from pathlib import Path
from typing import Any, Optional

import fitz
from flask import Flask, jsonify, request
from waitress import serve
from werkzeug.utils import secure_filename

app = Flask(__name__)

# Glyphs that mark a line as a list item when they are the whole text of
# the line's first span.
bullets = ["▶"]


def _heading_sizes(
    textpages: list[dict[str, Any]],
) -> tuple[Optional[int], Optional[int]]:
    """Return the largest and second-largest rounded font sizes.

    Either element is ``None`` when the document does not contain that many
    distinct sizes (for example an empty document), so callers never index
    past the end of the size list.
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    sizes.add(round(span["size"]))

    largest_first = sorted(sizes, reverse=True)
    h1_size = largest_first[0] if largest_first else None
    h2_size = largest_first[1] if len(largest_first) > 1 else None
    return h1_size, h2_size


def _classify_line(
    line: dict[str, Any],
    h1_size: Optional[int],
    h2_size: Optional[int],
) -> dict[str, str]:
    """Build the ``{"type", "value"}`` record for one extracted line.

    A line whose first span is exactly a bullet glyph is an ``li``; that span
    is dropped from the text and its size is ignored. Otherwise the largest
    rounded span size decides between ``h1``, ``h2`` and ``p``.
    """
    is_bullet = False
    largest_size = 0
    parts: list[str] = []
    for span_index, span in enumerate(line["spans"]):
        text = span["text"]
        if span_index == 0 and text in bullets:
            is_bullet = True
            continue
        largest_size = max(largest_size, round(span["size"]))
        parts.append(text)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"
    return {"type": line_type, "value": "".join(parts)}


@app.post("/")
def process_file():
    """Parse the uploaded ``file`` and return its lines as nested JSON.

    Returns:
        A JSON array shaped ``pages -> blocks -> lines`` where each line is
        ``{"type": "h1" | "h2" | "li" | "p", "value": <text>}``, or a
        ``(message, 400)`` tuple when the upload is missing, empty or cannot
        be opened as a document.
    """
    # ``.get`` returns None for a missing field; indexing would raise before
    # the check below could run.
    file = request.files.get("file")
    if file is None or not file.filename:
        return "file not found", 400

    data = file.read()
    if not data:
        return "file is empty", 400

    # Parse the uploaded bytes themselves rather than a server-side path that
    # merely shares the upload's name. The sanitized name is only used as a
    # document-type hint; PDF is assumed when it has no extension.
    filename = secure_filename(file.filename)
    filetype = Path(filename).suffix or "pdf"
    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=data, filetype=filetype) as doc:
            textpages = [page.get_textpage().extractDICT() for page in doc]
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's file-data errors derive from RuntimeError;
        # confirm against the pinned PyMuPDF version.
        return "file could not be parsed", 400

    h1_size, h2_size = _heading_sizes(textpages)

    output = [
        [
            [_classify_line(line, h1_size, h2_size) for line in block["lines"]]
            for block in page["blocks"]
        ]
        for page in textpages
    ]
    # jsonify serializes a top-level list on every Flask version, whereas
    # returning the bare list relies on newer Flask releases.
    return jsonify(output)


def _resolve_port(raw: Optional[str], default: int = 8080) -> int:
    """Return *raw* as a port number, or *default* if it is unset or invalid.

    ``isdecimal`` is used because ``isnumeric`` also accepts characters such
    as superscripts that ``int`` cannot parse.
    """
    if raw is not None and raw.isdecimal():
        return int(raw)
    return default


if __name__ == "__main__":
    port_num = _resolve_port(getenv("PORT"))
    print(f"using port {port_num}")
    serve(app, port=port_num)