fina/backup/first -fina app/app/ocr.py

"""
Receipt OCR Module
Extracts amount, date, and merchant information from receipt images using Tesseract OCR
"""

import pytesseract
from PIL import Image
import re
from datetime import datetime
from dateutil import parser as date_parser
import os


def extract_receipt_data(image_path):
    """
    Extract structured data from receipt image

    Args:
        image_path: Path to the receipt image file

    Returns:
        dict with extracted data: {
            'amount': float or None,
            'date': datetime or None,
            'merchant': str or None,
            'raw_text': str,
            'confidence': str ('high', 'medium', 'low')
        }
    """
    try:
        # Open and preprocess image
        image = Image.open(image_path)

        # Convert to grayscale for better OCR
        if image.mode != 'L':
            image = image.convert('L')

        # Perform OCR
        text = pytesseract.image_to_string(image, config='--psm 6')

        # Extract structured data
        amount = extract_amount(text)
        date = extract_date(text)
        merchant = extract_merchant(text)

        # Determine confidence level
        confidence = calculate_confidence(amount, date, merchant, text)

        return {
            'amount': amount,
            'date': date,
            'merchant': merchant,
            'raw_text': text,
            'confidence': confidence,
            'success': True
        }

    except Exception as e:
        return {
            'amount': None,
            'date': None,
            'merchant': None,
            'raw_text': '',
            'confidence': 'none',
            'success': False,
            'error': str(e)
        }


def extract_amount(text):
    """
    Extract monetary amount from text
    Supports multiple formats: $10.99, 10.99, 10,99, etc.
    """
    # Common patterns for amounts
    patterns = [
        r'(?:total|suma|amount|subtotal|plata)[\s:]*[\$€£]?\s*(\d{1,6}[.,]\d{2})',  # Total: $10.99
        r'[\$€£]\s*(\d{1,6}[.,]\d{2})',  # $10.99
        r'(\d{1,6}[.,]\d{2})\s*(?:RON|USD|EUR|GBP|lei)',  # 10.99 RON
        r'(?:^|\s)(\d{1,6}[.,]\d{2})(?:\s|$)',  # Standalone 10.99
    ]

    amounts = []
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
        for match in matches:
            # Normalize comma to dot
            amount_str = match.replace(',', '.')
            try:
                amount = float(amount_str)
                if 0.01 <= amount <= 999999:  # Reasonable range
                    amounts.append(amount)
            except ValueError:
                continue

    if amounts:
        # Return the largest amount (usually the total)
        return max(amounts)

    return None


def extract_date(text):
    """
    Extract date from text
    Supports multiple formats: DD/MM/YYYY, MM-DD-YYYY, DD.MM.YYYY, etc.
    """
    # Common date patterns
    date_patterns = [
        r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # DD/MM/YYYY, MM-DD-YYYY
        r'\d{1,2}\.\d{1,2}\.\d{2,4}',  # DD.MM.YYYY
        r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',  # YYYY-MM-DD
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}',  # Jan 15, 2024
        r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}',  # 15 Jan 2024
    ]

    dates = []
    for pattern in date_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            try:
                # Try to parse the date
                parsed_date = date_parser.parse(match, fuzzy=True)

                # Only accept dates within reasonable range
                if datetime(2000, 1, 1) <= parsed_date <= datetime.now():
                    dates.append(parsed_date)
            except (ValueError, date_parser.ParserError):
                continue

    if dates:
        # Return the most recent date (likely the transaction date)
        return max(dates)

    return None


def extract_merchant(text):
    """
    Extract merchant/store name from text
    Usually appears at the top of the receipt
    """
    lines = text.strip().split('\n')

    # Look at first few lines for merchant name
    for i, line in enumerate(lines[:5]):
        line = line.strip()

        # Skip very short lines
        if len(line) < 3:
            continue

        # Skip lines that look like addresses or numbers
        if re.match(r'^[\d\s\.,]+$', line):
            continue

        # Skip common keywords
        if re.match(r'^(receipt|factura|bon|total|date|time)', line, re.IGNORECASE):
            continue

        # If line has letters and reasonable length, likely merchant
        if re.search(r'[a-zA-Z]{3,}', line) and 3 <= len(line) <= 50:
            # Clean up the line
            cleaned = re.sub(r'[^\w\s-]', ' ', line)
            cleaned = ' '.join(cleaned.split())

            if cleaned:
                return cleaned

    return None


def calculate_confidence(amount, date, merchant, text):
    """
    Calculate confidence level of extraction

    Returns: 'high', 'medium', 'low', or 'none'
    """
    found_count = sum([
        amount is not None,
        date is not None,
        merchant is not None
    ])

    # Check text quality
    text_quality = len(text.strip()) > 50 and len(text.split()) > 10

    if found_count == 3 and text_quality:
        return 'high'
    elif found_count >= 2:
        return 'medium'
    elif found_count >= 1:
        return 'low'
    else:
        return 'none'


def preprocess_image_for_ocr(image_path, output_path=None):
    """
    Preprocess image to improve OCR accuracy

    Args:
        image_path: Path to original image
        output_path: Path to save preprocessed image (optional)

    Returns:
        PIL Image object
    """
    from PIL import ImageEnhance, ImageFilter

    image = Image.open(image_path)

    # Convert to grayscale
    image = image.convert('L')

    # Increase contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)

    # Sharpen image
    image = image.filter(ImageFilter.SHARPEN)

    # Apply threshold (binarization)
    threshold = 128
    image = image.point(lambda p: 255 if p > threshold else 0)

    if output_path:
        image.save(output_path)

    return image


def is_valid_receipt_image(image_path):
    """
    Validate that uploaded file is a valid image

    Security check to prevent malicious files
    """
    try:
        image = Image.open(image_path)
        image.verify()

        # Check file size (max 10MB)
        file_size = os.path.getsize(image_path)
        if file_size > 10 * 1024 * 1024:
            return False, "File too large (max 10MB)"

        # Check image dimensions (reasonable receipt size)
        image = Image.open(image_path)
        width, height = image.size
        if width < 100 or height < 100:
            return False, "Image too small"
        if width > 8000 or height > 8000:
            return False, "Image too large"

        # Check format
        if image.format not in ['JPEG', 'PNG', 'JPG']:
            return False, "Unsupported format (use JPEG or PNG)"

        return True, "Valid"

    except Exception as e:
        return False, f"Invalid image: {str(e)}"


def extract_receipt_data_batch(image_paths):
    """
    Process multiple receipt images in batch

    Args:
        image_paths: List of image file paths

    Returns:
        List of extraction results
    """
    results = []
    for path in image_paths:
        result = extract_receipt_data(path)
        result['file_path'] = path
        results.append(result)
    return results


def format_extraction_summary(data):
    """
    Format extracted data for display

    Returns: Human-readable string
    """
    lines = []

    if data.get('merchant'):
        lines.append(f"🏪 Merchant: {data['merchant']}")

    if data.get('amount'):
        lines.append(f"💰 Amount: {data['amount']:.2f}")

    if data.get('date'):
        lines.append(f"📅 Date: {data['date'].strftime('%Y-%m-%d')}")

    if data.get('confidence'):
        confidence_emoji = {
            'high': '✅',
            'medium': '⚠️',
            'low': '❌',
            'none': '❌'
        }
        emoji = confidence_emoji.get(data['confidence'], '❓')
        lines.append(f"{emoji} Confidence: {data['confidence'].title()}")

    return '\n'.join(lines) if lines else "No data extracted"