2024-04-18 11:49:41 +01:00
|
|
|
from os import getenv, path
from tempfile import TemporaryDirectory, gettempdir

import fitz
from flask import Flask, request
from flask_cors import CORS
from waitress import serve
from werkzeug.utils import secure_filename
|
2024-04-18 11:49:41 +01:00
|
|
|
|
|
|
|
# WSGI application object; the upload route is registered on it and it is
# the object handed to waitress when the module is run as a script.
app = Flask(__name__)

# Wrap the app with flask-cors using its default settings.
# NOTE(review): the flask-cors defaults are permissive (any origin) --
# confirm that is intended for this service.
CORS(app)

# Span texts that mark a line as a list item when they are the first span
# of that line.
bullets = ["▶"]
|
|
|
|
|
|
|
|
def _collect_font_sizes(textpages):
    """Return the distinct rounded span font sizes of *textpages*, ascending."""
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    sizes.add(round(span["size"]))
    return sorted(sizes)


def _classify_line(line, h1_size, h2_size):
    """Fold one extracted line into a ``{"type", "value", "left"}`` record.

    ``type`` is ``"li"`` when the first span is a bullet marker, ``"h1"`` /
    ``"h2"`` when the largest span size on the line equals the respective
    heading size, and ``"p"`` otherwise.  ``value`` is the concatenated span
    text (without the bullet marker) and ``left`` is the smallest rounded
    x-origin among the non-bullet spans.
    """
    is_bullet = False
    largest_size = 0
    # Sentinel: stays 10000 when the line has no non-bullet spans.
    smallest_left = 10000
    parts = []

    for span_index, span in enumerate(line["spans"]):
        text = span["text"]
        # A bullet marker only counts in leading position; it is dropped from
        # the text and does not take part in the size/left measurements.
        if span_index == 0 and text in bullets:
            is_bullet = True
            continue
        largest_size = max(largest_size, round(span["size"]))
        smallest_left = min(smallest_left, round(span["origin"][0]))
        parts.append(text)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"

    return {
        "type": line_type,
        "value": "".join(parts),
        "left": smallest_left,
    }


@app.post("/")
def process_file():
    """Extract the text of an uploaded document as classified lines.

    Expects a multipart upload under the form field ``file``.  The upload is
    written to a private temporary directory, opened with PyMuPDF, and every
    text line is classified as heading, list item or paragraph.

    Returns:
        A list with one entry per page; each page is a list of blocks and
        each block a list of ``{"type", "value", "left"}`` dicts.  Responds
        with a plain-text message and status 400 when the ``file`` field is
        missing or its filename is empty or unusable.
        NOTE(review): returning a bare list relies on Flask serialising list
        return values to JSON -- confirm the deployed Flask version does so.
    """
    app.logger.debug(
        "request=%r content_type=%s files=%r",
        request,
        request.content_type,
        request.files,
    )

    # .get() instead of indexing: indexing raises before a None check can
    # run, so the explicit 400 below would never be reached.
    file = request.files.get("file")
    if file is None or not file.filename:
        return "file not found", 400

    # secure_filename() can reduce a hostile name to "", which would make the
    # save target the directory itself.
    safe_name = secure_filename(file.filename)
    if not safe_name:
        return "invalid file name", 400

    # A per-request directory keeps concurrent uploads with the same name
    # apart and guarantees the saved file is removed afterwards.  The original
    # file name is kept so the opener can still see its extension.
    with TemporaryDirectory() as workdir:
        filename = path.join(workdir, safe_name)
        file.save(filename)

        textpages = []
        # Context manager closes the document even if extraction raises.
        with fitz.open(filename) as doc:
            for page in doc:
                assert isinstance(page, fitz.Page)  # type narrowing only
                textpages.append(page.get_textpage().extractDICT())

    # The largest font size is treated as h1 and the second largest as h2.
    # Documents with fewer than two distinct sizes simply lack that level
    # instead of failing with an IndexError.
    sorted_sizes = _collect_font_sizes(textpages)
    h1_size = sorted_sizes[-1] if sorted_sizes else None
    h2_size = sorted_sizes[-2] if len(sorted_sizes) > 1 else None

    pages = []
    for textpage in textpages:
        page_blocks = []
        for block in textpage["blocks"]:
            page_blocks.append(
                [_classify_line(line, h1_size, h2_size) for line in block["lines"]]
            )
        pages.append(page_blocks)

    return pages
|
2024-04-18 11:49:41 +01:00
|
|
|
|
|
|
|
def resolve_port(raw, default=8080):
    """Return the TCP port described by *raw*, or *default* if unusable.

    Args:
        raw: The port as text (typically the ``PORT`` environment variable),
            or ``None`` when unset.
        default: Port to fall back to when *raw* is ``None`` or is not made
            up solely of decimal digits.

    Returns:
        The parsed port number as an ``int``.
    """
    # isdecimal(), not isnumeric(): the latter also accepts characters such
    # as "²" or "½" that int() rejects with ValueError.
    if raw is not None and raw.isdecimal():
        return int(raw)
    return default


if __name__ == "__main__":
    # Serve the app with waitress on $PORT, falling back to 8080.
    port_num = resolve_port(getenv("PORT"))
    print(f"using port {port_num}")
    serve(app, port=port_num)
|