Initial commit

2025-12-26 00:52:56 +00:00 · 2025-12-26 00:52:56 +00:00 · 983cee0320
commit 983cee0320
322 changed files with 57174 additions and 0 deletions
--- a/app/app/ocr.py
+++ b/app/app/ocr.py
@ -0,0 +1,311 @@
+"""
+Receipt OCR Module
+Extracts amount, date, and merchant information from receipt images using Tesseract OCR
+"""
+
+import pytesseract
+from PIL import Image
+import re
+from datetime import datetime
+from dateutil import parser as date_parser
+import os
+
+
+def extract_receipt_data(image_path):
+    """
+    Extract structured data from receipt image
+
+    The image is opened, converted to grayscale when it is not already,
+    passed to Tesseract, and the recognised text is handed to
+    extract_amount, extract_date and extract_merchant.
+
+    Args:
+        image_path: Path to the receipt image file
+
+    Returns:
+        dict with extracted data: {
+            'amount': float or None,
+            'date': datetime or None,
+            'merchant': str or None,
+            'raw_text': str,
+            'confidence': str ('high', 'medium', 'low', 'none'),
+            'success': bool,
+            'error': str (only present when 'success' is False)
+        }
+
+    Any exception raised while opening, recognising or parsing is caught
+    and reported through 'success' / 'error' instead of propagating.
+    """
+    try:
+        # Image.open is lazy and keeps the underlying file open; the context
+        # manager releases the handle as soon as OCR has finished.
+        with Image.open(image_path) as source:
+            # Convert to grayscale for better OCR
+            grey = source if source.mode == 'L' else source.convert('L')
+
+            # Perform OCR using Tesseract page segmentation mode 6
+            text = pytesseract.image_to_string(grey, config='--psm 6')
+
+        # Extract structured data
+        amount = extract_amount(text)
+        date = extract_date(text)
+        merchant = extract_merchant(text)
+
+        # Determine confidence level
+        confidence = calculate_confidence(amount, date, merchant, text)
+
+        return {
+            'amount': amount,
+            'date': date,
+            'merchant': merchant,
+            'raw_text': text,
+            'confidence': confidence,
+            'success': True
+        }
+
+    except Exception as e:
+        # Deliberate catch-all boundary: callers get a uniform result dict
+        # with the failure reason rather than an exception.
+        return {
+            'amount': None,
+            'date': None,
+            'merchant': None,
+            'raw_text': '',
+            'confidence': 'none',
+            'success': False,
+            'error': str(e)
+        }
+
+
+# One monetary figure: either groups of three digits joined by thousands
+# separators (1,234.56 / 1.234,56) or up to six plain digits, in both cases
+# ending in a separator plus exactly two decimals. The trailing lookahead
+# refuses a match that would stop in the middle of a longer number, so
+# "10.999" and the "12.05" of "12.05.2024" are not read as amounts.
+_AMOUNT_NUMBER = r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d{1,6}[.,]\d{2})(?![.,]?\d)'
+
+_AMOUNT_PATTERNS = (
+    # Total: $10.99
+    r'(?:total|suma|amount|subtotal|plata)[\s:]*[\$€£]?\s*' + _AMOUNT_NUMBER,
+    # $10.99
+    r'[\$€£]\s*' + _AMOUNT_NUMBER,
+    # 10.99 RON (must not start in the middle of a longer digit run)
+    r'(?<!\d)' + _AMOUNT_NUMBER + r'\s*(?:RON|USD|EUR|GBP|lei)',
+    # Standalone 10.99; lookarounds instead of consumed whitespace so that
+    # two figures separated by a single space are both found.
+    r'(?<!\S)' + _AMOUNT_NUMBER + r'(?!\S)',
+)
+
+
+def _parse_amount(raw):
+    """Convert a figure matched by _AMOUNT_NUMBER (e.g. '1.234,56') to float."""
+    # The regex guarantees the last three characters are a separator followed
+    # by two decimals; every earlier separator is a thousands mark.
+    whole = raw[:-3].replace(',', '').replace('.', '')
+    return float(f"{whole}.{raw[-2:]}")
+
+
+def extract_amount(text):
+    """
+    Extract monetary amount from text
+    Supports multiple formats: $10.99, 10.99, 10,99, 1,234.56, 1.234,56, etc.
+
+    Args:
+        text: OCR text to search
+
+    Returns:
+        The largest figure between 0.01 and 999999 found by any pattern
+        (usually the total), or None when nothing matches.
+    """
+    amounts = []
+    for pattern in _AMOUNT_PATTERNS:
+        for raw in re.findall(pattern, text, re.IGNORECASE):
+            amount = _parse_amount(raw)
+            if 0.01 <= amount <= 999999:  # Reasonable range
+                amounts.append(amount)
+
+    # Return the largest amount (usually the total)
+    return max(amounts, default=None)
+
+
+def extract_date(text):
+    """
+    Extract date from text
+    Supports multiple formats: DD/MM/YYYY, MM-DD-YYYY, DD.MM.YYYY, etc.
+
+    Args:
+        text: OCR text to search
+
+    Returns:
+        The most recent parsed datetime between 2000-01-01 and now,
+        or None when no candidate parses into that range.
+    """
+    # (pattern, dayfirst) pairs. The digit lookarounds stop a pattern from
+    # matching inside a longer number; without them the first pattern also
+    # fires on the tail of an ISO date ("2023-12-25" -> "23-12-25") and adds
+    # a bogus candidate that max() may prefer.
+    date_patterns = [
+        # DD/MM/YYYY, MM-DD-YYYY: ambiguous, left at dateutil's default order
+        (r'(?<!\d)\d{1,2}[/-]\d{1,2}[/-]\d{2,4}(?!\d)', False),
+        # DD.MM.YYYY: dotted dates are day-first, so tell dateutil explicitly
+        (r'(?<!\d)\d{1,2}\.\d{1,2}\.\d{2,4}(?!\d)', True),
+        # YYYY-MM-DD
+        (r'(?<!\d)\d{4}[/-]\d{1,2}[/-]\d{1,2}(?!\d)', False),
+        # Jan 15, 2024
+        (r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}(?!\d)', False),
+        # 15 Jan 2024
+        (r'(?<!\d)\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}(?!\d)', False),
+    ]
+
+    # Only accept dates within reasonable range
+    earliest = datetime(2000, 1, 1)
+    latest = datetime.now()
+
+    dates = []
+    for pattern, dayfirst in date_patterns:
+        for match in re.findall(pattern, text, re.IGNORECASE):
+            try:
+                parsed_date = date_parser.parse(match, fuzzy=True, dayfirst=dayfirst)
+            except (ValueError, OverflowError):
+                # dateutil's ParserError derives from ValueError; catching the
+                # base class also works on releases that predate ParserError.
+                continue
+
+            if earliest <= parsed_date <= latest:
+                dates.append(parsed_date)
+
+    # Return the most recent date (likely the transaction date)
+    return max(dates, default=None)
+
+
+def extract_merchant(text):
+    """
+    Extract merchant/store name from text
+    Usually appears at the top of the receipt
+
+    Args:
+        text: OCR text to search
+
+    Returns:
+        The first of the top five lines that looks like a name, with
+        punctuation replaced by spaces and whitespace collapsed, or None.
+    """
+    lines = text.strip().split('\n')
+
+    # Look at first few lines for merchant name
+    for line in lines[:5]:
+        line = line.strip()
+
+        # Skip very short lines
+        if len(line) < 3:
+            continue
+
+        # Skip lines that consist only of digits, whitespace, dots and commas
+        if re.match(r'^[\d\s\.,]+$', line):
+            continue
+
+        # Skip common keywords. The lookahead requires the keyword to end
+        # there, so names that merely start with one ("Bonefish Grill",
+        # "Timeless Cafe") are no longer thrown away.
+        if re.match(r'^(receipt|factura|bon|total|date|time)(?![a-z])', line, re.IGNORECASE):
+            continue
+
+        # If line has letters and reasonable length, likely merchant
+        if re.search(r'[a-zA-Z]{3,}', line) and 3 <= len(line) <= 50:
+            # Replace punctuation (except hyphens) with spaces, then collapse
+            cleaned = ' '.join(re.sub(r'[^\w\s-]', ' ', line).split())
+
+            if cleaned:
+                return cleaned
+
+    return None
+
+
+def calculate_confidence(amount, date, merchant, text):
+    """
+    Rate how trustworthy an extraction is.
+
+    Counts how many of amount, date and merchant are not None and checks
+    whether the OCR text is longer than 50 characters and 10 words.
+
+    Returns: 'high', 'medium', 'low', or 'none'
+    """
+    hits = len([field for field in (amount, date, merchant) if field is not None])
+
+    # Text is considered substantial when both size thresholds are exceeded
+    stripped_length = len(text.strip())
+    word_count = len(text.split())
+    enough_text = stripped_length > 50 and word_count > 10
+
+    if hits == 0:
+        return 'none'
+    if hits == 1:
+        return 'low'
+    if hits == 3 and enough_text:
+        return 'high'
+    # Two fields found, or all three found in sparse text
+    return 'medium'
+
+
+def preprocess_image_for_ocr(image_path, output_path=None):
+    """
+    Preprocess image to improve OCR accuracy
+
+    Steps: grayscale conversion, contrast x2.0, sharpen filter, then a
+    fixed-threshold binarization (pixels above 128 become 255, others 0).
+
+    Args:
+        image_path: Path to original image
+        output_path: Path to save preprocessed image (optional)
+
+    Returns:
+        PIL Image object
+    """
+    from PIL import ImageEnhance, ImageFilter
+
+    # convert() loads the pixel data and returns a separate image, so the
+    # source file handle can be released immediately instead of leaking.
+    with Image.open(image_path) as source:
+        image = source.convert('L')
+
+    # Increase contrast
+    enhancer = ImageEnhance.Contrast(image)
+    image = enhancer.enhance(2.0)
+
+    # Sharpen image
+    image = image.filter(ImageFilter.SHARPEN)
+
+    # Apply threshold (binarization)
+    threshold = 128
+    image = image.point(lambda p: 255 if p > threshold else 0)
+
+    if output_path:
+        image.save(output_path)
+
+    return image
+
+
+def is_valid_receipt_image(image_path):
+    """
+    Validate that uploaded file is a valid image
+
+    Security check to prevent malicious files
+
+    Args:
+        image_path: Path to the uploaded file
+
+    Returns:
+        (bool, str) tuple: whether the file is acceptable, and "Valid" or
+        the reason it was rejected. Any exception (missing file, undecodable
+        data, ...) is reported as (False, "Invalid image: ...").
+    """
+    try:
+        # Check file size (max 10MB) before decoding anything: it is cheap
+        # and keeps oversized untrusted input away from the image parser.
+        file_size = os.path.getsize(image_path)
+        if file_size > 10 * 1024 * 1024:
+            return False, "File too large (max 10MB)"
+
+        # Integrity check; the context manager closes the file handle.
+        with Image.open(image_path) as image:
+            image.verify()
+
+        # An image object cannot be reused after verify(), so open it again
+        # to read the header fields.
+        with Image.open(image_path) as image:
+            width, height = image.size
+            image_format = image.format
+
+        # Check image dimensions (reasonable receipt size)
+        if width < 100 or height < 100:
+            return False, "Image too small"
+        if width > 8000 or height > 8000:
+            return False, "Image too large"
+
+        # Check format
+        if image_format not in ['JPEG', 'PNG', 'JPG']:
+            return False, "Unsupported format (use JPEG or PNG)"
+
+        return True, "Valid"
+
+    except Exception as e:
+        return False, f"Invalid image: {str(e)}"
+
+
+def extract_receipt_data_batch(image_paths):
+    """
+    Run extract_receipt_data over several receipt images.
+
+    Args:
+        image_paths: List of image file paths
+
+    Returns:
+        List of extraction result dicts, in input order, each with an
+        added 'file_path' key holding the path it was produced from
+    """
+    def _extract_tagged(image_path):
+        # Tag the result in place so it can be traced back to its source file
+        outcome = extract_receipt_data(image_path)
+        outcome['file_path'] = image_path
+        return outcome
+
+    return [_extract_tagged(image_path) for image_path in image_paths]
+
+
+def format_extraction_summary(data):
+    """
+    Render an extraction result dict as display text.
+
+    One line is produced for each of merchant, amount, date and confidence
+    whose value is truthy; missing or empty fields are left out.
+
+    Returns: Human-readable string ("No data extracted" when nothing is set)
+    """
+    confidence_emoji = {
+        'high': '✅',
+        'medium': '⚠️',
+        'low': '❌',
+        'none': '❌'
+    }
+
+    merchant = data.get('merchant')
+    amount = data.get('amount')
+    receipt_date = data.get('date')
+    level = data.get('confidence')
+
+    summary = []
+    if merchant:
+        summary.append(f"🏪 Merchant: {merchant}")
+    if amount:
+        summary.append(f"💰 Amount: {amount:.2f}")
+    if receipt_date:
+        summary.append(f"📅 Date: {receipt_date.strftime('%Y-%m-%d')}")
+    if level:
+        # Unknown confidence labels fall back to a question mark
+        emoji = confidence_emoji.get(level, '❓')
+        summary.append(f"{emoji} Confidence: {level.title()}")
+
+    if not summary:
+        return "No data extracted"
+    return '\n'.join(summary)