fina/app/ocr.py

"""
OCR Processing Utility
Extracts text from images and PDFs for searchability
Security: All file paths validated before processing
"""
import os
import tempfile
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import cv2
import numpy as np


def _prepare_for_opencv(image):
    """
    Normalize a PIL image so the OpenCV steps in preprocess_image can use it

    RGB and 8-bit grayscale ("L") images are returned untouched, as are
    objects without a ``mode`` attribute (e.g. plain numpy arrays).
    Every other mode is converted to "L"; images carrying transparency are
    first composited onto a white background so transparent regions do not
    end up as a dark page behind dark text.
    """
    mode = getattr(image, "mode", None)
    if mode is None or mode in ("RGB", "L"):
        return image

    has_alpha = mode in ("RGBA", "LA", "PA") or (
        mode == "P" and "transparency" in image.info
    )
    if has_alpha:
        rgba = image.convert("RGBA")
        white = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
        image = Image.alpha_composite(white, rgba)

    return image.convert("L")


def preprocess_image(image):
    """
    Preprocess image to improve OCR accuracy
    - Normalize the colour mode (RGBA, palette, CMYK, 1-bit, ... -> grayscale)
    - Convert to grayscale
    - Apply adaptive thresholding
    - Denoise

    Args:
        image: PIL Image (anything np.array() accepts is also passed through
            the same pipeline)

    Returns:
        A new PIL Image with the processed pixels, or the original ``image``
        object unchanged if any step fails.
    """
    try:
        # Without this step a 4-channel (RGBA) or palette image would make
        # the OpenCV calls below raise, and the page would silently skip
        # preprocessing altogether.
        source = _prepare_for_opencv(image)

        # Convert PIL Image to numpy array
        img_array = np.array(source)

        # Convert to grayscale (3-D array means colour channels are present)
        if len(img_array.shape) == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Apply adaptive thresholding: output value 255, Gaussian-weighted
        # 11x11 neighbourhood, constant 2 subtracted from the local mean
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Denoise: filter strength 10, template window 7, search window 21
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        # Convert back to PIL Image
        return Image.fromarray(denoised)
    except Exception as e:
        print(f"Error preprocessing image: {str(e)}")
        # Best effort: hand back the caller's original image untouched
        return image


def extract_text_from_image(image_path):
    """
    Extract text from an image file using OCR
    Supports: PNG, JPG, JPEG
    Security: Validates file exists before opening it

    Args:
        image_path: Path to the image file

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string if the file is missing or any step fails
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            return ""

        # Open inside a context manager so the underlying file handle is
        # released even when OCR raises. OCR must run inside the block
        # because preprocess_image may hand back this same (lazy) image.
        with Image.open(image_path) as image:
            preprocessed = preprocess_image(image)

            # Extract text using Tesseract with English + Romanian
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',  # Support both English and Romanian
                config='--psm 6'  # Assume uniform block of text
            )

        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {str(e)}")
        return ""


def extract_text_from_pdf(pdf_path, *, max_pages=10, dpi=300):
    """
    Extract text from a PDF file using OCR
    Converts PDF pages to images, then applies OCR
    Security: Validates file exists before converting it

    Args:
        pdf_path: Path to the PDF file
        max_pages: Number of leading pages to OCR (default 10, kept low
            to avoid memory issues since every page is rasterized)
        dpi: Rasterization resolution passed to pdf2image (default 300)

    Returns:
        Text of all non-empty pages, each prefixed with a
        "--- Page N ---" header and separated by blank lines, or an
        empty string if the file is missing, max_pages is below 1, or
        any step fails
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(pdf_path):
            print(f"PDF file not found: {pdf_path}")
            return ""

        # Nothing to convert when the caller asks for zero pages
        if max_pages < 1:
            return ""

        # Convert only the leading pages to images
        pages = convert_from_path(
            pdf_path, first_page=1, last_page=max_pages, dpi=dpi
        )

        extracted_text = []
        for page_number, page in enumerate(pages, start=1):
            # Preprocess page
            preprocessed = preprocess_image(page)

            # Extract text
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',
                config='--psm 6'
            ).strip()

            # Blank pages are skipped; headers keep the real page number
            if text:
                extracted_text.append(f"--- Page {page_number} ---\n{text}")

        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
        return ""


def extract_text_from_file(file_path, file_type):
    """
    Route a file to the OCR extractor that matches its type

    The path must be absolute and must exist; the type is compared
    case-insensitively with leading/trailing dots removed.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg)

    Returns:
        Extracted text, or an empty string when the path is rejected,
        the type is unsupported, or an error occurs
    """
    try:
        # Guard 1: only absolute paths are accepted
        path_is_absolute = os.path.isabs(file_path)
        if not path_is_absolute:
            print(f"Invalid file path (not absolute): {file_path}")
            return ""

        # Guard 2: the path has to exist on disk
        path_exists = os.path.exists(file_path)
        if not path_exists:
            print(f"File not found: {file_path}")
            return ""

        # Bring the type into canonical form, e.g. ".PDF" -> "pdf"
        lowered = file_type.lower()
        file_type = lowered.strip('.')

        # Hand off to the matching extractor
        if file_type in ('png', 'jpg', 'jpeg'):
            return extract_text_from_image(file_path)
        if file_type == 'pdf':
            return extract_text_from_pdf(file_path)

        print(f"Unsupported file type for OCR: {file_type}")
        return ""
    except Exception as e:
        print(f"Error in extract_text_from_file: {str(e)}")
        return ""


def process_ocr_async(file_path, file_type):
    """
    Run OCR on a file and report the outcome as a dictionary
    Suitable as the callable handed to a background job

    Args:
        file_path: Path forwarded to extract_text_from_file
        file_type: File extension forwarded to extract_text_from_file

    Returns:
        On success: {'success': True, 'text': <text>, 'length': <len(text)>}
        On exception: {'success': False, 'error': <message>, 'text': ''}
    """
    try:
        ocr_text = extract_text_from_file(file_path, file_type)
        outcome = {
            'success': True,
            'text': ocr_text,
            'length': len(ocr_text)
        }
    except Exception as e:
        # Only reached when an exception escapes the extractor
        outcome = {
            'success': False,
            'error': str(e),
            'text': ''
        }
    return outcome
Initial commit 2025-12-26 00:52:56 +00:00			`"""`
			`OCR Processing Utility`
			`Extracts text from images and PDFs for searchability`
			`Security: All file paths validated before processing`
			`"""`
			`import os`
			`import tempfile`
			`from PIL import Image`
			`import pytesseract`
			`from pdf2image import convert_from_path`
			`import cv2`
			`import numpy as np`


			`def preprocess_image(image):`
			`"""`
			`Preprocess image to improve OCR accuracy`
			`- Convert to grayscale`
			`- Apply adaptive thresholding`
			`- Denoise`
			`"""`
			`try:`
			`# Convert PIL Image to numpy array`
			`img_array = np.array(image)`

			`# Convert to grayscale`
			`if len(img_array.shape) == 3:`
			`gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)`
			`else:`
			`gray = img_array`

			`# Apply adaptive thresholding`
			`thresh = cv2.adaptiveThreshold(`
			`gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2`
			`)`

			`# Denoise`
			`denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)`

			`# Convert back to PIL Image`
			`return Image.fromarray(denoised)`
			`except Exception as e:`
			`print(f"Error preprocessing image: {str(e)}")`
			`# Return original image if preprocessing fails`
			`return image`


			`def extract_text_from_image(image_path):`
			`"""`
			`Extract text from an image file using OCR`
			`Supports: PNG, JPG, JPEG`
			`Security: Validates file exists and is readable`
			`Returns: Extracted text or empty string on failure`
			`"""`
			`try:`
			`# Security: Validate file exists`
			`if not os.path.exists(image_path):`
			`print(f"Image file not found: {image_path}")`
			`return ""`

			`# Open and preprocess image`
			`image = Image.open(image_path)`
			`preprocessed = preprocess_image(image)`

			`# Extract text using Tesseract with English + Romanian`
			`text = pytesseract.image_to_string(`
			`preprocessed,`
			`lang='eng+ron', # Support both English and Romanian`
			`config='--psm 6' # Assume uniform block of text`
			`)`

			`return text.strip()`
			`except Exception as e:`
			`print(f"Error extracting text from image {image_path}: {str(e)}")`
			`return ""`


			`def extract_text_from_pdf(pdf_path):`
			`"""`
			`Extract text from a PDF file using OCR`
			`Converts PDF pages to images, then applies OCR`
			`Security: Validates file exists and is readable`
			`Returns: Extracted text or empty string on failure`
			`"""`
			`try:`
			`# Security: Validate file exists`
			`if not os.path.exists(pdf_path):`
			`print(f"PDF file not found: {pdf_path}")`
			`return ""`

			`# Convert PDF to images (first 10 pages max to avoid memory issues)`
			`pages = convert_from_path(pdf_path, first_page=1, last_page=10, dpi=300)`

			`extracted_text = []`
			`for i, page in enumerate(pages):`
			`# Preprocess page`
			`preprocessed = preprocess_image(page)`

			`# Extract text`
			`text = pytesseract.image_to_string(`
			`preprocessed,`
			`lang='eng+ron',`
			`config='--psm 6'`
			`)`

			`if text.strip():`
			`extracted_text.append(f"--- Page {i+1} ---\n{text.strip()}")`

			`return "\n\n".join(extracted_text)`
			`except Exception as e:`
			`print(f"Error extracting text from PDF {pdf_path}: {str(e)}")`
			`return ""`


			`def extract_text_from_file(file_path, file_type):`
			`"""`
			`Extract text from any supported file type`
			`Security: Validates file path and type before processing`

			`Args:`
			`file_path: Absolute path to the file`
			`file_type: File extension (pdf, png, jpg, jpeg)`

			`Returns:`
			`Extracted text or empty string on failure`
			`"""`
			`try:`
			`# Security: Validate file path`
			`if not os.path.isabs(file_path):`
			`print(f"Invalid file path (not absolute): {file_path}")`
			`return ""`

			`if not os.path.exists(file_path):`
			`print(f"File not found: {file_path}")`
			`return ""`

			`# Normalize file type`
			`file_type = file_type.lower().strip('.')`

			`# Route to appropriate extractor`
			`if file_type == 'pdf':`
			`return extract_text_from_pdf(file_path)`
			`elif file_type in ['png', 'jpg', 'jpeg']:`
			`return extract_text_from_image(file_path)`
			`else:`
			`print(f"Unsupported file type for OCR: {file_type}")`
			`return ""`
			`except Exception as e:`
			`print(f"Error in extract_text_from_file: {str(e)}")`
			`return ""`


			`def process_ocr_async(file_path, file_type):`
			`"""`
			`Wrapper for async OCR processing`
			`Can be used with background jobs if needed`

			`Returns:`
			`Dictionary with success status and extracted text`
			`"""`
			`try:`
			`text = extract_text_from_file(file_path, file_type)`
			`return {`
			`'success': True,`
			`'text': text,`
			`'length': len(text)`
			`}`
			`except Exception as e:`
			`return {`
			`'success': False,`
			`'error': str(e),`
			`'text': ''`
			`}`