84 lines
2.3 KiB
Python
84 lines
2.3 KiB
Python
|
import fitz
|
||
|
from flask import Flask, request
|
||
|
from werkzeug.utils import secure_filename
|
||
|
from waitress import serve
|
||
|
from os import getenv
|
||
|
|
||
|
# Flask application object: the upload route below is registered on it, and
# waitress serves it when this file is run as a script.
app = Flask(__name__)
|
||
|
|
||
|
# Span texts that mark a line as a list item ("li") when they are the exact
# text of the line's first span.
bullets = ["▶"]
|
||
|
|
||
|
def _heading_sizes(textpages):
    """Return the largest and second-largest rounded span sizes.

    Args:
        textpages: Per-page dicts as produced by ``TextPage.extractDICT()``.

    Returns:
        A ``(h1_size, h2_size)`` tuple. An entry is ``None`` when the
        document does not contain that many distinct sizes.
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            # .get(): tolerate blocks that carry no "lines" key
            # (presumably image blocks -- confirm against PyMuPDF flags).
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    sizes.add(round(span["size"]))

    ranked = sorted(sizes, reverse=True)
    h1_size = ranked[0] if ranked else None
    h2_size = ranked[1] if len(ranked) > 1 else None
    return h1_size, h2_size


def _classify_line(line, h1_size, h2_size):
    """Convert one extracted line into a ``{"type": ..., "value": ...}`` dict.

    The type is ``"li"`` when the first span is a bullet marker, ``"h1"`` or
    ``"h2"`` when the largest rounded span size equals the given heading
    size, and ``"p"`` otherwise. The value is the concatenated span text,
    without the bullet marker.
    """
    is_bullet = False
    largest_size = 0
    parts = []
    for span_index, span in enumerate(line["spans"]):
        text = span["text"]
        if span_index == 0 and text in bullets:
            is_bullet = True
            continue  # the marker is neither measured nor part of the value

        largest_size = max(largest_size, round(span["size"]))
        parts.append(text)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"

    return {"type": line_type, "value": "".join(parts)}


@app.post("/")
def process_file():
    """Extract the text of an uploaded document as classified lines.

    Expects a multipart upload in the ``file`` form field. The two largest
    rounded font sizes found anywhere in the document are treated as the
    ``h1`` and ``h2`` sizes; see ``_classify_line`` for the per-line rules.

    Returns:
        A list with one entry per page; each page is a list of blocks and
        each block is a list of ``{"type": ..., "value": ...}`` dicts.
        A ``(message, 400)`` tuple is returned instead when the upload is
        missing or cannot be opened.

    NOTE(review): returning a bare list relies on Flask converting it to a
    JSON response (Flask 2.2+) -- confirm the deployed Flask version.
    """
    # .get() instead of [...] so a missing field reaches the check below
    # rather than raising inside the lookup.
    file = request.files.get("file")
    if file is None or not file.filename:
        return "file not found", 400

    filename = secure_filename(file.filename)
    # The upload is never written to disk, so it is parsed from memory. The
    # file extension is only used as a format hint; without one assume PDF.
    filetype = filename.rsplit(".", 1)[-1].lower() if "." in filename else "pdf"

    try:
        # Context manager closes the document even if extraction fails.
        with fitz.open(stream=file.read(), filetype=filetype) as doc:
            textpages = [page.get_textpage().extractDICT() for page in doc]
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's data/empty-file errors appear to derive
        # from RuntimeError (ValueError in some versions) -- confirm against
        # the pinned PyMuPDF version.
        return "file could not be read", 400

    h1_size, h2_size = _heading_sizes(textpages)

    output = []
    for page in textpages:
        output_blocks = []
        for block in page["blocks"]:
            output_blocks.append([
                _classify_line(line, h1_size, h2_size)
                for line in block.get("lines", ())
            ])
        output.append(output_blocks)

    return output
|
||
|
|
||
|
if __name__ == "__main__":
    # Serve on $PORT when it is a plain decimal number, otherwise on 8080.
    port = getenv("PORT")
    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as "²" or "½", which int() rejects with ValueError.
    portNum = int(port) if port is not None and port.isdecimal() else 8080

    print(f"using port {portNum}")
    serve(app, port=portNum)
|