plumberserver/main.py

84 lines
2.3 KiB
Python
Raw Normal View History

2024-04-18 11:49:41 +01:00
import fitz
from flask import Flask, request
from werkzeug.utils import secure_filename
from waitress import serve
from os import getenv
# Flask application object; the route handler in this module is registered on it
# and it is the object handed to waitress when the module runs as a script.
app = Flask(__name__)
# Span texts that mark a line as a list item when they are the first span of
# that line (the marker span itself is dropped from the line's text).
# NOTE(review): the only entry reads as an empty string here — confirm that is
# intended and that a bullet glyph was not lost in an encoding round-trip.
bullets = [""]
def _heading_sizes(textpages):
    """Pick the font sizes that identify "h1" and "h2" lines.

    Collects every span size (rounded to an int) across *textpages* and
    returns ``(largest, second_largest)``.  When fewer than two distinct sizes
    exist there is no size contrast to build a heading hierarchy from, so
    ``(None, None)`` is returned and no line will be typed as a heading.
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            # Blocks without a "lines" entry simply contribute no sizes.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    sizes.add(round(span["size"]))
    if len(sizes) < 2:
        return None, None
    ordered = sorted(sizes, reverse=True)
    return ordered[0], ordered[1]


def _classify_line(line, h1_size, h2_size):
    """Convert one extracted line into a ``{"type": ..., "value": ...}`` dict.

    The type is "li" when the first span's text is listed in ``bullets``
    (that marker span is excluded from the value and from the size check),
    otherwise "h1"/"h2" when the largest rounded span size equals the
    corresponding heading size, otherwise "p".
    """
    spans = line["spans"]
    is_bullet = bool(spans) and spans[0]["text"] in bullets
    kept = spans[1:] if is_bullet else spans

    largest_size = max((round(span["size"]) for span in kept), default=0)
    line_text = "".join(span["text"] for span in kept)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"
    return {"type": line_type, "value": line_text}


@app.post("/")
def process_file():
    """Extract the text of an uploaded document as typed lines.

    Expects a multipart upload under the form field "file".

    Returns:
        A list with one entry per page; each page is a list of blocks and each
        block is a list of ``{"type": "p" | "li" | "h1" | "h2", "value": str}``
        dicts, one per line.  Returns ``("file not found", 400)`` when the
        upload is missing or unnamed, and ``("could not read document", 400)``
        when the document cannot be opened.
    """
    # .get() is required here: indexing request.files raises for a missing
    # field instead of yielding None, so a None check after [] can never fire.
    file = request.files.get("file")
    if file is None or not file.filename:
        return "file not found", 400

    # The client-supplied name is only used to guess the document type; it is
    # never used as a path.  Opening it as a path would read whatever file of
    # that name exists on the server rather than the uploaded content.
    filename = secure_filename(file.filename)
    _, dot, extension = filename.rpartition(".")
    filetype = extension if dot and extension else "pdf"

    try:
        doc = fitz.open(stream=file.read(), filetype=filetype)
    except RuntimeError:
        # NOTE(review): PyMuPDF's open failures (e.g. corrupt or empty data)
        # are expected to derive from RuntimeError — confirm for the pinned
        # PyMuPDF version.
        return "could not read document", 400

    # The context manager closes the document even if extraction raises.
    with doc:
        textpages = [page.get_textpage().extractDICT() for page in doc]

    h1_size, h2_size = _heading_sizes(textpages)

    output = []
    for page in textpages:
        output_blocks = []
        for block in page["blocks"]:
            output_blocks.append(
                [
                    _classify_line(line, h1_size, h2_size)
                    for line in block.get("lines", ())
                ]
            )
        output.append(output_blocks)
    return output
def resolve_port(raw, default=8080):
    """Return the port number encoded in *raw*, or *default* if it is unusable.

    *raw* is accepted only when it consists solely of decimal digits.
    ``str.isdecimal`` is used rather than ``str.isnumeric`` because the latter
    also accepts characters such as "²" or "½" that ``int()`` cannot parse,
    which would crash startup with ``ValueError``.
    """
    if raw is not None and raw.isdecimal():
        return int(raw)
    return default


if __name__ == "__main__":
    # Listen on $PORT when it holds a plain decimal number, else on 8080.
    portNum = resolve_port(getenv("PORT"))
    print(f"using port {portNum}")
    serve(app, port=portNum)