import { AutoProcessor, AutoModelForVision2Seq, RawImage, TextStreamer, load_image } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers";
import { doclingToHtml } from "./docling-html-parser.js";

// DOM references
const modelLoaderOverlay = document.getElementById("model-loader-overlay");
const imageDropArea = document.getElementById("image-drop-area");
const imagePlaceholder = document.getElementById("image-placeholder");
const imagePreviewContainer = document.getElementById("image-preview-container");
const imagePreview = document.getElementById("image-preview");
const removeImageBtn = document.getElementById("remove-image-btn");
const fileInput = document.getElementById("file-input");
const exampleImages = document.querySelectorAll(".example-image");
const examplesContainer = document.getElementById("examples-container");
const examplesTitle = document.getElementById("examples-title");
const processingIndicator = document.getElementById("processing-indicator");
const welcomeMessage = document.getElementById("welcome-message");
const doclingView = document.getElementById("docling-view");
const htmlView = document.getElementById("html-view");
const doclingOutput = document.getElementById("docling-output");
const htmlIframe = document.getElementById("html-iframe");
const viewToggle = document.getElementById("view-toggle");
const hiddenCanvas = document.getElementById("hidden-canvas");
const promptInput = document.getElementById("prompt-input");
const generateBtn = document.getElementById("generate-btn");

// Application state
let model, processor;
let currentImageWidth, currentImageHeight;
let currentImage = null;

/**
 * Loads and initializes the model and processor.
 */
async function initializeModel() {
  try {
    const model_id = "onnx-community/granite-docling-258M-ONNX";
    processor = await AutoProcessor.from_pretrained(model_id);

    const progress = {};
    model = await AutoModelForVision2Seq.from_pretrained(model_id, {
      dtype: {
        embed_tokens: "fp16", // fp32 (231 MB) | fp16 (116 MB)
        vision_encoder: "fp32", // fp32 (374 MB)
        decoder_model_merged: "fp32", // fp32 (658 MB) | q4 (105 MB); q4 sometimes runs into repetition issues
      },
      device: "webgpu",
      progress_callback: (data) => {
        if (data.status === "progress" && data.file?.endsWith?.("onnx_data")) {
          progress[data.file] = data;
          // Only update the bar once all three weight files are reporting progress.
          if (Object.keys(progress).length !== 3) return;
          let sum = 0;
          let total = 0;
          for (const val of Object.values(progress)) {
            sum += val.loaded;
            total += val.total;
          }
          const overallPercent = Math.round((sum / total) * 100);
          document.getElementById("model-progress").value = overallPercent;
          document.getElementById("progress-text").textContent = overallPercent + "%";
        }
      },
    });

    modelLoaderOverlay.style.display = "none";
    console.log("Model loaded successfully.");
  } catch (error) {
    console.error("Failed to load model:", error);
    // Show a simple error message in the loader overlay (markup kept minimal here).
    modelLoaderOverlay.innerHTML = `
      <p><strong>Failed to Load Model</strong></p>
      <p>Please refresh the page to try again. Check the console for errors.</p>
    `;
  }
}

/**
 * Processes an image and generates Docling text.
 * @param {ImageBitmap|HTMLImageElement} imageObject An image object to process.
 */
async function processImage(imageObject) {
  if (!model || !processor) {
    alert("Model is not loaded yet. Please wait.");
    return;
  }

  // Reset UI
  setUiState("processing");
  clearOverlays();
  let fullText = "";
  doclingOutput.textContent = "";
  htmlIframe.srcdoc = "";

  try {
    // 1. Draw image to canvas and get RawImage
    const ctx = hiddenCanvas.getContext("2d");
    hiddenCanvas.width = imageObject.width;
    hiddenCanvas.height = imageObject.height;
    ctx.drawImage(imageObject, 0, 0);
    const image = RawImage.fromCanvas(hiddenCanvas);

    // 2. Create input messages
    const messages = [
      {
        role: "user",
        content: [{ type: "image" }, { type: "text", text: promptInput.value }],
      },
    ];

    // 3. Prepare inputs for the model
    const text = processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });
    const inputs = await processor(text, [image], {
      do_image_splitting: true,
    });

    // 4. Generate output, streaming tokens into the raw Docling view as they arrive
    await model.generate({
      ...inputs,
      max_new_tokens: 4096,
      streamer: new TextStreamer(processor.tokenizer, {
        skip_prompt: true,
        skip_special_tokens: false,
        callback_function: (streamedText) => {
          fullText += streamedText;
          doclingOutput.textContent += streamedText;
        },
      }),
    });

    // Strip <|end_of_text|> from the end
    fullText = fullText.replace(/<\|end_of_text\|>$/, "");
    doclingOutput.textContent = fullText;

    // Parse loc tags and create bounding-box overlays.
    // Each DocTags element carries four <loc_N> markers (left, top, right, bottom)
    // on a 0-500 grid normalized to the page.
    const tagRegex = /<(\w+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/g;
    const overlays = [];
    let match;
    while ((match = tagRegex.exec(fullText)) !== null) {
      const tagType = match[1];
      const locs = [parseInt(match[2]), parseInt(match[3]), parseInt(match[4]), parseInt(match[5])];
      overlays.push({ tagType, locs });
    }

    // One random color per tag type
    const colorMap = {};
    function getRandomColor() {
      return `rgb(${Math.floor(Math.random() * 256)}, ${Math.floor(Math.random() * 256)}, ${Math.floor(Math.random() * 256)})`;
    }

    // Map the 0-500 grid to displayed-image pixels, accounting for the preview's
    // offset inside its container and its on-screen scale.
    const imgRect = imagePreview.getBoundingClientRect();
    const containerRect = imagePreviewContainer.getBoundingClientRect();
    const imageOffsetLeft = imgRect.left - containerRect.left;
    const imageOffsetTop = imgRect.top - containerRect.top;
    const scaleX = imgRect.width / currentImageWidth;
    const scaleY = imgRect.height / currentImageHeight;

    overlays.forEach(({ tagType, locs }) => {
      const color = colorMap[tagType] || (colorMap[tagType] = getRandomColor());
      const [leftLoc, topLoc, rightLoc, bottomLoc] = locs;
      const left = imageOffsetLeft + (leftLoc / 500) * currentImageWidth * scaleX;
      const top = imageOffsetTop + (topLoc / 500) * currentImageHeight * scaleY;
      const width = ((rightLoc - leftLoc) / 500) * currentImageWidth * scaleX;
      const height = ((bottomLoc - topLoc) / 500) * currentImageHeight * scaleY;

      const overlay = document.createElement("div");
      overlay.className = "overlay";
      overlay.style.setProperty("--overlay-color", color);
      const rgbMatch = color.match(/rgb\((\d+),\s*(\d+),\s*(\d+)\)/);
      overlay.style.setProperty("--overlay-color-rgb", `${rgbMatch[1]},${rgbMatch[2]},${rgbMatch[3]}`);
      overlay.style.position = "absolute";
      overlay.style.left = left + "px";
      overlay.style.top = top + "px";
      overlay.style.width = width + "px";
      overlay.style.height = height + "px";
      imagePreviewContainer.appendChild(overlay);
    });

    // After generation, render the converted document in the HTML iframe
    htmlIframe.srcdoc = doclingToHtml(fullText);
  } catch (error) {
    console.error("Error during image processing:", error);
    doclingOutput.textContent = `An error occurred: ${error.message}`;
  } finally {
    setUiState("result");
  }
}
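/*
 * For reference, an illustrative sketch of the DocTags output the parser above
 * expects. This is not verbatim model output; the element names and coordinates
 * below are made up to show the <tag><loc_left><loc_top><loc_right><loc_bottom>
 * pattern that tagRegex matches:
 *
 *   <doctag>
 *     <section_header_level_1><loc_57><loc_30><loc_444><loc_52>Title</section_header_level_1>
 *     <text><loc_57><loc_64><loc_443><loc_95>First paragraph of the page…</text>
 *   </doctag>
 */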
/**
 * Handles the selection of an image file.
 * @param {File|string} source The image file or URL.
 */
function handleImageSelection(source) {
  const reader = new FileReader();
  const img = new Image();

  img.onload = () => {
    currentImageWidth = img.naturalWidth;
    currentImageHeight = img.naturalHeight;
    currentImage = img;
    imagePreview.src = img.src;
    imagePlaceholder.classList.add("hidden");
    imagePreviewContainer.classList.remove("hidden");
    examplesContainer.classList.add("hidden");
    examplesTitle.classList.add("hidden");
    processImage(img);
  };
  img.onerror = () => {
    alert("Failed to load image.");
  };

  if (typeof source === "string") {
    // It's a URL. To avoid CORS issues with the canvas, try to fetch it first.
    fetch(source)
      .then((res) => res.blob())
      .then((blob) => {
        img.src = URL.createObjectURL(blob);
      })
      .catch((e) => {
        console.error("Fetch failed (likely CORS). Falling back to direct load.", e);
        // Fallback to a direct load, which might taint the canvas
        img.crossOrigin = "anonymous";
        img.src = source;
      });
  } else {
    // It's a File object
    reader.onload = (e) => {
      img.src = e.target.result;
    };
    reader.readAsDataURL(source);
  }
}

/**
 * Manages the visibility of UI components based on the app state.
 * @param {'initial'|'processing'|'result'} state The current state.
 */
function setUiState(state) {
  welcomeMessage.style.display = "none";
  processingIndicator.classList.add("hidden");
  doclingView.classList.add("hidden");
  htmlView.classList.add("hidden");

  if (state === "initial") {
    welcomeMessage.style.display = "flex";
    generateBtn.disabled = true;
  } else if (state === "processing") {
    viewToggle.checked = false;
    processingIndicator.classList.remove("hidden");
    doclingView.classList.remove("hidden");
    generateBtn.disabled = true;
  } else if (state === "result") {
    viewToggle.checked = true;
    htmlView.classList.remove("hidden");
    generateBtn.disabled = false;
  }
}
/**
 * Clears all overlay divs from the image preview container.
 */
function clearOverlays() {
  document.querySelectorAll(".overlay").forEach((el) => el.remove());
}

// Drag and drop
imageDropArea.addEventListener("click", () => fileInput.click());
imageDropArea.addEventListener("dragover", (e) => {
  e.preventDefault();
  imageDropArea.classList.add("border-indigo-500", "bg-indigo-50");
});
imageDropArea.addEventListener("dragleave", () => {
  imageDropArea.classList.remove("border-indigo-500", "bg-indigo-50");
});
imageDropArea.addEventListener("drop", (e) => {
  e.preventDefault();
  imageDropArea.classList.remove("border-indigo-500", "bg-indigo-50");
  const files = e.dataTransfer.files;
  if (files.length > 0 && files[0].type.startsWith("image/")) {
    handleImageSelection(files[0]);
  }
});

// File input
fileInput.addEventListener("change", (e) => {
  const files = e.target.files;
  if (files.length > 0) {
    handleImageSelection(files[0]);
  }
});

// Example images
exampleImages.forEach((img) => {
  img.addEventListener("click", () => {
    promptInput.value = img.dataset.prompt;
    handleImageSelection(img.src);
  });
});

// Remove image
removeImageBtn.addEventListener("click", (e) => {
  e.stopPropagation();
  currentImage = null;
  imagePreview.src = "";
  fileInput.value = ""; // Reset file input
  imagePlaceholder.classList.remove("hidden");
  imagePreviewContainer.classList.add("hidden");
  examplesContainer.classList.remove("hidden");
  examplesTitle.classList.remove("hidden");
  setUiState("initial");
  doclingOutput.textContent = "";
  htmlIframe.srcdoc = "";
  clearOverlays();
});

// View toggle
viewToggle.addEventListener("change", () => {
  const isHtmlView = viewToggle.checked;
  htmlView.classList.toggle("hidden", !isHtmlView);
  doclingView.classList.toggle("hidden", isHtmlView);
});

// Generate button
generateBtn.addEventListener("click", () => {
  if (currentImage) {
    processImage(currentImage);
  }
});

document.addEventListener("DOMContentLoaded", () => {
  setUiState("initial"); // Set initial view correctly
  initializeModel();
});
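/*
 * For reference, the element IDs this script expects to find in the page, taken
 * from the getElementById calls above. This is only a hypothetical checklist;
 * the real markup and styling live in the accompanying HTML:
 *
 *   model-loader-overlay, model-progress, progress-text, image-drop-area,
 *   image-placeholder, image-preview-container, image-preview, remove-image-btn,
 *   file-input, examples-container, examples-title, processing-indicator,
 *   welcome-message, docling-view, html-view, docling-output, html-iframe,
 *   view-toggle, hidden-canvas, prompt-input, generate-btn,
 *   plus any number of <img class="example-image" data-prompt="..."> thumbnails.
 */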