Spaces:

Chamin09
/

BrailleMenuGenV2

Sleeping

App Files Files Community

Chamin09 commited on May 8

Commit

70d7f43

verified ·

1 Parent(s): 86d4eab

Create document_ai.py

Browse files

Files changed (1) hide show

models/document_ai.py +137 -0

models/document_ai.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+from PIL import Image
+import numpy as np
+import os
+import sys
+# Try to import pytesseract, but handle if it's not available
+try:
+    import pytesseract
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    TESSERACT_AVAILABLE = False
+# Check if tesseract is installed
+if TESSERACT_AVAILABLE:
+    try:
+        pytesseract.get_tesseract_version()
+    except Exception:
+        TESSERACT_AVAILABLE = False
+# Initialize the model and processor with caching
+processor = None
+model = None
+def get_document_ai_models():
+    """Get or initialize document AI models with proper caching."""
+    global processor, model
+    if processor is None:
+        from transformers import LayoutLMv2Processor
+        processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    if model is None:
+        from transformers import LayoutLMv2ForSequenceClassification
+        model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    return processor, model
+def extract_text_with_tesseract(image):
+    """Extract text using Tesseract OCR."""
+    if not TESSERACT_AVAILABLE:
+        raise RuntimeError("tesseract is not installed or it's not in your PATH. See README file for more information.")
+    if isinstance(image, np.ndarray):
+        pil_image = Image.fromarray(image).convert("RGB")
+    else:
+        pil_image = image.convert("RGB")
+    # Use pytesseract for OCR
+    text = pytesseract.image_to_string(pil_image)
+    # Get word boxes for structure
+    boxes = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
+    # Extract words and their positions
+    words = []
+    word_boxes = []
+    for i in range(len(boxes['text'])):
+        if boxes['text'][i].strip() != '':
+            words.append(boxes['text'][i])
+            x, y, w, h = boxes['left'][i], boxes['top'][i], boxes['width'][i], boxes['height'][i]
+            word_boxes.append([x, y, x + w, y + h])
+    return words, word_boxes
+def extract_text_with_transformers(image):
+    """Extract text using transformers models when Tesseract is not available."""
+    try:
+        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+        # Initialize the processor and model
+        processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
+        model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+        # Prepare the image
+        if isinstance(image, np.ndarray):
+            pil_image = Image.fromarray(image).convert("RGB")
+        else:
+            pil_image = image.convert("RGB")
+        # Process the image
+        pixel_values = processor(pil_image, return_tensors="pt").pixel_values
+        generated_ids = model.generate(pixel_values)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # Split into words
+        words = generated_text.split()
+        # Since we don't have bounding boxes, return empty boxes
+        word_boxes = [[0, 0, 0, 0] for _ in words]
+        return words, word_boxes
+    except Exception as e:
+        # If transformers OCR fails, return a simple error message
+        return ["Error extracting text with transformers OCR:", str(e)], [[0, 0, 0, 0], [0, 0, 0, 0]]
+def extract_text_and_layout(image):
+    """
+    Extract text and layout information using OCR.
+    Args:
+        image: PIL Image object
+    Returns:
+        Dictionary with extracted text and layout information
+    """
+    # Convert numpy array to PIL Image if needed
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image).convert("RGB")
+    try:
+        # Try Tesseract first
+        if TESSERACT_AVAILABLE:
+            words, boxes = extract_text_with_tesseract(image)
+        else:
+            # Fall back to transformers OCR
+            words, boxes = extract_text_with_transformers(image)
+    except Exception as e:
+        # If both methods fail, return the error
+        return {
+            'words': [f"Error extracting text: {str(e)}"],
+            'boxes': [[0, 0, 0, 0]],
+            'success': False
+        }
+    # If no words were found, return empty result
+    if not words:
+        return {
+            'words': [],
+            'boxes': [],
+            'success': False
+        }
+    return {
+        'words': words,
+        'boxes': boxes,
+        'success': True
+    }