# fina/app/ocr.py

"""
OCR Processing Utility
Extracts text from images and PDFs for searchability
Security: All file paths validated before processing
"""
import os
import tempfile
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import cv2
import numpy as np


def preprocess_image(image):
    """
    Preprocess image to improve OCR accuracy
    - Normalize the pixel format (anything other than RGB / L becomes RGB)
    - Convert to grayscale
    - Apply adaptive thresholding
    - Denoise

    Args:
        image: PIL Image to clean up. Objects without a ``mode`` attribute
            are handed to numpy as-is.

    Returns:
        A new PIL Image with the binarized, denoised result, or the original
        ``image`` object unchanged if any step fails.
    """
    try:
        # Modes such as palette ("P"), CMYK, 1-bit or LA do not map onto the
        # RGB / grayscale arrays the OpenCV calls below expect. A palette
        # image, for instance, converts to an array of colour *indices*,
        # which would be thresholded as if they were intensities.
        # Let PIL normalize those modes first; RGB and L pass through as-is.
        source = image
        mode = getattr(image, "mode", None)
        if mode is not None and mode not in ("RGB", "L"):
            source = image.convert("RGB")

        # Convert PIL Image to numpy array
        img_array = np.array(source)

        # Convert to grayscale (2-D arrays are already single-channel)
        if len(img_array.shape) == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Apply adaptive thresholding (Gaussian-weighted 11x11 window, C=2)
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Denoise
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        # Convert back to PIL Image
        return Image.fromarray(denoised)
    except Exception as e:
        print(f"Error preprocessing image: {str(e)}")
        # Best effort: return the caller's original image (not the converted
        # copy) so OCR can still be attempted on the raw input.
        return image


def extract_text_from_image(image_path):
    """
    Extract text from an image file using OCR
    Supports: PNG, JPG, JPEG
    Security: Validates file exists before opening it

    Args:
        image_path: Path to the image file.

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string if the file is missing or any step fails.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            return ""

        # Image.open keeps the underlying file open; the context manager
        # releases the handle even when preprocessing or OCR raises.
        with Image.open(image_path) as image:
            preprocessed = preprocess_image(image)

            # OCR stays inside the with-block: preprocess_image may hand back
            # the original image object, which must still be open here.
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',  # Support both English and Romanian
                config='--psm 6'  # Assume uniform block of text
            )

        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {str(e)}")
        return ""


def extract_text_from_pdf(pdf_path, max_pages=10, dpi=300):
    """
    Extract text from a PDF file using OCR
    Converts PDF pages to images, then applies OCR page by page
    Security: Validates file exists before processing

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Last page (1-based, inclusive) to OCR; pages after it are
            ignored. Defaults to 10 to bound processing time and memory.
        dpi: Resolution used when rendering pages to images. Defaults to 300.

    Returns:
        Text of every non-empty page, each prefixed with a
        "--- Page N ---" header and separated by blank lines, or an empty
        string if the file is missing or any step fails.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(pdf_path):
            print(f"PDF file not found: {pdf_path}")
            return ""

        extracted_text = []

        # Render pages into a scratch directory instead of returning them all
        # as in-memory images: with output_folder set, pdf2image writes the
        # page images to disk and opens them from there, so pixel data is only
        # loaded for the page currently being processed.
        with tempfile.TemporaryDirectory() as render_dir:
            pages = convert_from_path(
                pdf_path,
                first_page=1,
                last_page=max_pages,
                dpi=dpi,
                output_folder=render_dir,
            )
            try:
                for page_number, page in enumerate(pages, start=1):
                    # Preprocess page
                    preprocessed = preprocess_image(page)

                    # Extract text
                    text = pytesseract.image_to_string(
                        preprocessed,
                        lang='eng+ron',
                        config='--psm 6'
                    ).strip()

                    # Drop this page's pixel data before moving on so that
                    # memory use does not grow with the page count.
                    page.close()

                    if text:
                        extracted_text.append(
                            f"--- Page {page_number} ---\n{text}"
                        )
            finally:
                # Release every file handle (including pages never reached
                # after an error) before the scratch directory is removed.
                for page in pages:
                    page.close()

        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
        return ""


def extract_text_from_file(file_path, file_type):
    """
    Dispatch a file to the OCR routine that matches its type.

    The path must be absolute and must exist; the type is compared
    case-insensitively with any leading/trailing dots removed.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg)

    Returns:
        Extracted text, or an empty string when the path is rejected, the
        type is unsupported, or an error occurs.
    """
    try:
        # Guard: only absolute paths are accepted
        if not os.path.isabs(file_path):
            print(f"Invalid file path (not absolute): {file_path}")
            return ""

        # Guard: the file has to be present on disk
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return ""

        # ".PDF" and "pdf" are treated alike
        file_type = file_type.lower().strip('.')

        if file_type == 'pdf':
            return extract_text_from_pdf(file_path)

        if file_type in ('png', 'jpg', 'jpeg'):
            return extract_text_from_image(file_path)

        # Anything else has no OCR backend
        print(f"Unsupported file type for OCR: {file_type}")
        return ""
    except Exception as exc:
        print(f"Error in extract_text_from_file: {exc}")
        return ""


def process_ocr_async(file_path, file_type):
    """
    Wrapper for OCR processing that reports its outcome as a dictionary
    Can be used with background jobs if needed

    Note: despite the name, this function runs synchronously; it simply
    calls extract_text_from_file and packages the result.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg)

    Returns:
        Dictionary that always contains:
            'success': False only if an unexpected exception reached this
                wrapper. extract_text_from_file handles its own errors and
                returns "", so an empty 'text' with 'success' True can
                still mean OCR failed.
            'text': extracted text ('' on failure)
            'length': number of characters in 'text'
        plus 'error' (the exception message) when 'success' is False.
    """
    try:
        text = extract_text_from_file(file_path, file_type)
        return {
            'success': True,
            'text': text,
            'length': len(text)
        }
    except Exception as e:
        return {
            'success': False,
            'error': str(e),
            'text': '',
            # Keep the same keys as the success result so callers can read
            # 'length' without checking 'success' first.
            'length': 0
        }