90 lines
2.5 KiB
Python
90 lines
2.5 KiB
Python
import tempfile
|
||
import fitz
|
||
from sanic import Sanic, json, text
|
||
from sanic.request import File, Request
|
||
|
||
|
||
# System default directory for temporary files, as reported by the stdlib.
# NOTE(review): not referenced anywhere in the visible code — confirm it is
# still needed before relying on it.
upload_folder = tempfile.gettempdir()
|
||
# Sanic application instance; the route decorators further down this file
# (`@app.get`, `@app.post`) register their handlers on it.
app = Sanic("PlumberServer")
|
||
|
||
# Span texts treated as list-item markers: a line whose FIRST span is exactly
# one of these strings is classified as "li" by process_file, and the marker
# itself is left out of the line's text.
bullets = ["▶", "•", "–"]
|
||
|
||
|
||
@app.get("/")
def healthz(_: Request):
    """Health check: answer GET "/" with a fixed plain-text body.

    The request argument is accepted but never read.
    """
    response = text("all good :)")
    return response
|
||
|
||
|
||
def _collect_font_sizes(textpages: list[dict]) -> set[int]:
    """Return every distinct font size (rounded to an int) used by any span."""
    return {
        round(span["size"])
        for page in textpages
        for block in page["blocks"]
        for line in block["lines"]
        for span in line["spans"]
    }


def _classify_line(line: dict, h1_size: int | None, h2_size: int | None) -> dict:
    """Reduce one extracted line to its type, joined text and left-most x."""
    is_bullet = False
    largest_size = 0
    # Sentinel: stays at this value when the line has no non-bullet spans.
    smallest_left = 10000
    parts = []
    for span_index, span in enumerate(line["spans"]):
        span_text = span["text"]
        # Only a leading span can be a bullet marker; it is dropped from the
        # text and does not contribute to the size/left measurements.
        if span_index == 0 and span_text in bullets:
            is_bullet = True
            continue

        largest_size = max(largest_size, round(span["size"]))
        smallest_left = min(smallest_left, round(span["origin"][0]))
        parts.append(span_text)

    # A bullet marker wins over any heading size.
    if is_bullet:
        line_type = "li"
    elif largest_size == h1_size:
        line_type = "h1"
    elif largest_size == h2_size:
        line_type = "h2"
    else:
        line_type = "p"

    return {
        "type": line_type,
        "value": "".join(parts),
        "left": smallest_left,
    }


@app.post("/")
def process_file(request: Request):
    """Extract the text lines of an uploaded document and classify each one.

    The document is read from the multipart field named "file" and opened
    from memory. Every line is tagged as:

    * "li" - its first span is one of the module-level ``bullets`` markers;
    * "h1" - its largest rounded span size is the largest size in the document;
    * "h2" - its largest rounded span size is the second largest in the document;
    * "p"  - anything else.

    Heading detection relies on size contrast, so when the document contains
    fewer than two distinct rounded font sizes no line is tagged as a heading
    (previously this case raised IndexError).

    Returns:
        400 plain text "no files were uploaded" when the request has no files,
        400 plain text "file not found" when the "file" field is missing,
        otherwise a JSON array: pages -> blocks -> lines, where each line is
        ``{"type": ..., "value": ..., "left": ...}``.
    """
    if request.files is None:
        return text("no files were uploaded", 400)

    file: File | None = request.files.get("file")
    if file is None:
        return text("file not found", 400)

    doc = fitz.open(stream=file.body)
    try:
        textpages = []
        for page in doc:
            assert isinstance(page, fitz.Page)  # type narrowing only
            textpages.append(page.get_textpage().extractDICT())
    finally:
        # Release the document even if extraction of some page fails.
        doc.close()

    sorted_sizes = sorted(_collect_font_sizes(textpages))
    if len(sorted_sizes) >= 2:
        h1_size, h2_size = sorted_sizes[-1], sorted_sizes[-2]
    else:
        # Empty or single-size document: nothing to tell headings from body
        # text; None never equals a line's integer size.
        h1_size = h2_size = None

    pages = [
        [
            [_classify_line(line, h1_size, h2_size) for line in block["lines"]]
            for block in page["blocks"]
        ]
        for page in textpages
    ]
    return json(pages)
|