Initial commit

2025-12-26 00:52:56 +00:00 · 2025-12-26 00:52:56 +00:00 · 983cee0320
commit 983cee0320
322 changed files with 57174 additions and 0 deletions
--- a/app/ocr.py
+++ b/app/ocr.py
@ -0,0 +1,173 @@
+"""
+OCR Processing Utility
+Extracts text from images and PDFs for searchability
+Security: File paths are checked for existence (and, in extract_text_from_file, for being absolute) before processing
+"""
+import os
+import tempfile
+from PIL import Image
+import pytesseract
+from pdf2image import convert_from_path
+import cv2
+import numpy as np
+
+
+def preprocess_image(image):
+    """
+    Preprocess image to improve OCR accuracy
+    - Convert to grayscale
+    - Apply adaptive thresholding
+    - Denoise
+    """
+    try:
+        # Convert PIL Image to numpy array
+        img_array = np.array(image)
+        
+        # Convert to grayscale
+        if len(img_array.shape) == 3:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img_array
+        
+        # Apply adaptive thresholding
+        thresh = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
+        )
+        
+        # Denoise
+        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)
+        
+        # Convert back to PIL Image
+        return Image.fromarray(denoised)
+    except Exception as e:
+        print(f"Error preprocessing image: {str(e)}")
+        # Return original image if preprocessing fails
+        return image
+
+
+def extract_text_from_image(image_path):
+    """
+    Extract text from an image file using OCR
+    Supports: PNG, JPG, JPEG
+    Security: Validates file exists and is readable
+    Returns: Extracted text or empty string on failure
+    """
+    try:
+        # Security: Validate file exists
+        if not os.path.exists(image_path):
+            print(f"Image file not found: {image_path}")
+            return ""
+        
+        # Open and preprocess image
+        image = Image.open(image_path)
+        preprocessed = preprocess_image(image)
+        
+        # Extract text using Tesseract with English + Romanian
+        text = pytesseract.image_to_string(
+            preprocessed,
+            lang='eng+ron',  # Support both English and Romanian
+            config='--psm 6'  # Assume uniform block of text
+        )
+        
+        return text.strip()
+    except Exception as e:
+        print(f"Error extracting text from image {image_path}: {str(e)}")
+        return ""
+
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Extract text from a PDF file using OCR
+    Converts PDF pages to images, then applies OCR
+    Security: Validates file exists and is readable
+    Returns: Extracted text or empty string on failure
+    """
+    try:
+        # Security: Validate file exists
+        if not os.path.exists(pdf_path):
+            print(f"PDF file not found: {pdf_path}")
+            return ""
+        
+        # Convert PDF to images (first 10 pages max to avoid memory issues)
+        pages = convert_from_path(pdf_path, first_page=1, last_page=10, dpi=300)
+        
+        extracted_text = []
+        for i, page in enumerate(pages):
+            # Preprocess page
+            preprocessed = preprocess_image(page)
+            
+            # Extract text
+            text = pytesseract.image_to_string(
+                preprocessed,
+                lang='eng+ron',
+                config='--psm 6'
+            )
+            
+            if text.strip():
+                extracted_text.append(f"--- Page {i+1} ---\n{text.strip()}")
+        
+        return "\n\n".join(extracted_text)
+    except Exception as e:
+        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
+        return ""
+
+
+def extract_text_from_file(file_path, file_type):
+    """
+    Extract text from any supported file type
+    Security: Validates file path and type before processing
+    
+    Args:
+        file_path: Absolute path to the file
+        file_type: File extension (pdf, png, jpg, jpeg)
+    
+    Returns:
+        Extracted text or empty string on failure
+    """
+    try:
+        # Security: Validate file path
+        if not os.path.isabs(file_path):
+            print(f"Invalid file path (not absolute): {file_path}")
+            return ""
+        
+        if not os.path.exists(file_path):
+            print(f"File not found: {file_path}")
+            return ""
+        
+        # Normalize file type
+        file_type = file_type.lower().strip('.')
+        
+        # Route to appropriate extractor
+        if file_type == 'pdf':
+            return extract_text_from_pdf(file_path)
+        elif file_type in ['png', 'jpg', 'jpeg']:
+            return extract_text_from_image(file_path)
+        else:
+            print(f"Unsupported file type for OCR: {file_type}")
+            return ""
+    except Exception as e:
+        print(f"Error in extract_text_from_file: {str(e)}")
+        return ""
+
+
+def process_ocr_async(file_path, file_type):
+    """
+    Wrapper for async OCR processing
+    Can be used with background jobs if needed
+    
+    Returns:
+        Dictionary with success status and extracted text
+    """
+    try:
+        text = extract_text_from_file(file_path, file_type)
+        return {
+            'success': True,
+            'text': text,
+            'length': len(text)
+        }
+    except Exception as e:
+        return {
+            'success': False,
+            'error': str(e),
+            'text': ''
+        }