import tempfile
from pathlib import Path
from typing import List, Tuple

from doclayout_yolo import YOLOv10
from huggingface_hub import hf_hub_download
from PIL import Image

from ..storage.schemas import BaseBox

# NOTE: the checkpoint is fetched and the model is built at import time, so
# importing this module triggers the hub lookup and model construction.
filepath = hf_hub_download(
    repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
    filename="doclayout_yolo_docstructbench_imgsz1024.pt",
)
model = YOLOv10(filepath)


def _detect_rows(img: Image.Image, device: str) -> List[list]:
    """Run the layout model on *img* and return one plain list per detection.

    The image is handed to the model through a temporary PNG file. The file
    is always removed afterwards, including when prediction raises.
    """
    # delete=False keeps the file on disk after the handle is closed so the
    # model can open it by path. Writing through the open handle (instead of
    # re-opening temp_file.name) avoids opening the same file twice, which
    # is not permitted on Windows while the handle is still open.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
        img_path = temp_file.name
        try:
            img.save(temp_file, format="PNG")
        except BaseException:
            temp_file.close()
            Path(img_path).unlink(missing_ok=True)
            raise

    try:
        det_res = model.predict(
            img_path,
            imgsz=1024,
            conf=0.2,
            device=device,
        )
        return [row.tolist() for row in det_res[0].boxes.data]
    finally:
        # Clean up the temp file whether or not prediction succeeded.
        Path(img_path).unlink(missing_ok=True)


def parse_img(
    img: Image.Image,
    device: str = "cpu",
    box_directory: str = "src/boxes",
) -> Tuple[List[BaseBox], List[Image.Image]]:
    """Detect layout boxes in an image, crop them, and save each crop.

    Args:
        img: The image to analyse.
        device: Device string forwarded to ``model.predict``.
        box_directory: Directory where the cropped boxes are written. It is
            created (including parents) if it does not exist.

    Returns:
        A ``(boxes_result, crop_image_list)`` tuple: a list of ``BaseBox``
        objects carrying the coordinates, confidence, class id and saved
        path of each detection, and the list of cropped images in the same
        order.

    Note:
        Crops are saved as ``box_<index>.png``, where the index is the
        position of the detection in this call's results, so a later call
        using the same ``box_directory`` overwrites files with the same
        index.
    """
    out_dir = Path(box_directory)
    out_dir.mkdir(parents=True, exist_ok=True)

    boxes_result = []
    crop_image_list = []

    for i, box_data in enumerate(_detect_rows(img, device)):
        # The first four values are used as the crop rectangle
        # (x_min, y_min, x_max, y_max); the last two are read as the
        # confidence and the class id respectively.
        crop = img.crop(tuple(box_data[:4]))
        box_path = str(out_dir / f"box_{i}.png")
        crop.save(box_path)
        crop_image_list.append(crop)

        boxes_result.append(
            BaseBox(
                class_name=int(box_data[-1]),
                x_min=float(box_data[0]),
                y_min=float(box_data[1]),
                x_max=float(box_data[2]),
                y_max=float(box_data[3]),
                confidence=float(box_data[-2]),
                saved_img_path=box_path,
            )
        )

    return boxes_result, crop_image_list