90 lines
2.5 KiB
Python
90 lines
2.5 KiB
Python
import tempfile
|
|
import fitz
|
|
from sanic import Sanic, json, text
|
|
from sanic.request import File, Request
|
|
|
|
|
|
# Directory for temporary files, taken from the platform default.
upload_folder: str = tempfile.gettempdir()

# The Sanic application object that the route decorators below register on.
app = Sanic("PlumberServer")

# Span texts treated as list-item markers when they are the first span of a
# line (see the bullet check in process_file).
bullets: list[str] = ["▶"]
|
|
|
|
|
|
@app.get("/")
def healthz(_: Request):
    """Answer ``GET /`` with a fixed plain-text body.

    The incoming request is ignored; the handler only signals that the
    server is up and able to respond.
    """
    status_message = "all good :)"
    return text(status_message)
|
|
|
|
|
|
def _heading_sizes(textpages: list[dict]) -> tuple[int | None, int | None]:
    """Return the largest and second-largest rounded font sizes in *textpages*.

    Either element is ``None`` when the document does not contain that many
    distinct sizes (no text at all, or a single uniform size), so callers
    never index past the end of the size list.
    """
    sizes: set[int] = set()
    for page in textpages:
        for block in page["blocks"]:
            # .get() tolerates blocks that carry no "lines" entry.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    sizes.add(round(span["size"]))

    ranked = sorted(sizes, reverse=True)
    h1_size = ranked[0] if ranked else None
    h2_size = ranked[1] if len(ranked) > 1 else None
    return h1_size, h2_size


def _classify_line(line: dict, h1_size: int | None, h2_size: int | None) -> dict:
    """Turn one extracted line into a ``{"type", "value", "left"}`` record.

    ``type`` is ``"li"`` when the first span is a bullet marker, ``"h1"`` or
    ``"h2"`` when the line's largest rounded span size equals the matching
    heading size, and ``"p"`` otherwise. ``value`` is the concatenated span
    text without the bullet marker; ``left`` is the smallest rounded x origin
    of the non-bullet spans.
    """
    is_bullet = False
    largest_size = 0
    # Sentinel start value; it is reported unchanged when the line has no
    # non-bullet spans.
    smallest_left = 10000
    parts: list[str] = []

    for span_index, span in enumerate(line["spans"]):
        span_text = span["text"]
        if span_index == 0 and span_text in bullets:
            # The marker itself contributes neither text nor geometry.
            is_bullet = True
            continue
        largest_size = max(largest_size, round(span["size"]))
        smallest_left = min(smallest_left, round(span["origin"][0]))
        parts.append(span_text)

    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"

    return {
        "type": line_type,
        "value": "".join(parts),
        "left": smallest_left,
    }


@app.post("/")
def process_file(request: Request):
    """Extract the text lines of an uploaded document and classify them.

    Expects a multipart upload under the form field ``file``. The document is
    opened with PyMuPDF from the uploaded bytes, every page is converted to
    its dict representation, and each line is labelled as list item,
    heading (``h1``/``h2``) or paragraph.

    Returns:
        A JSON response shaped ``pages -> blocks -> lines``, where every line
        is ``{"type": ..., "value": ..., "left": ...}``; or a plain-text 400
        response when no usable file was uploaded.
    """
    if request.files is None:
        return text("no files were uploaded", 400)

    file: File | None = request.files.get("file")
    if file is None:
        return text("file not found", 400)

    try:
        doc = fitz.open(stream=file.body)
    except RuntimeError:
        # NOTE(review): PyMuPDF's FileDataError / EmptyFileError are expected
        # to derive from RuntimeError - confirm for the installed version.
        # Other exception types still propagate exactly as before.
        return text("file could not be opened as a document", 400)

    # The context manager closes the document even if extraction raises.
    with doc:
        textpages = [page.get_textpage().extractDICT() for page in doc]

    h1_size, h2_size = _heading_sizes(textpages)

    pass1 = []
    for page in textpages:
        pass1_blocks = []
        for block in page["blocks"]:
            pass1_lines = [
                _classify_line(line, h1_size, h2_size)
                for line in block.get("lines", ())
            ]
            pass1_blocks.append(pass1_lines)
        pass1.append(pass1_blocks)

    return json(pass1)
|