311 lines
8.5 KiB
Python
311 lines
8.5 KiB
Python
"""
|
|
Receipt OCR Module
|
|
Extracts amount, date, and merchant information from receipt images using Tesseract OCR
|
|
"""
|
|
|
|
import pytesseract
|
|
from PIL import Image
|
|
import re
|
|
from datetime import datetime
|
|
from dateutil import parser as date_parser
|
|
import os
|
|
|
|
|
|
def extract_receipt_data(image_path):
|
|
"""
|
|
Extract structured data from receipt image
|
|
|
|
Args:
|
|
image_path: Path to the receipt image file
|
|
|
|
Returns:
|
|
dict with extracted data: {
|
|
'amount': float or None,
|
|
'date': datetime or None,
|
|
'merchant': str or None,
|
|
'raw_text': str,
|
|
'confidence': str ('high', 'medium', 'low')
|
|
}
|
|
"""
|
|
try:
|
|
# Open and preprocess image
|
|
image = Image.open(image_path)
|
|
|
|
# Convert to grayscale for better OCR
|
|
if image.mode != 'L':
|
|
image = image.convert('L')
|
|
|
|
# Perform OCR
|
|
text = pytesseract.image_to_string(image, config='--psm 6')
|
|
|
|
# Extract structured data
|
|
amount = extract_amount(text)
|
|
date = extract_date(text)
|
|
merchant = extract_merchant(text)
|
|
|
|
# Determine confidence level
|
|
confidence = calculate_confidence(amount, date, merchant, text)
|
|
|
|
return {
|
|
'amount': amount,
|
|
'date': date,
|
|
'merchant': merchant,
|
|
'raw_text': text,
|
|
'confidence': confidence,
|
|
'success': True
|
|
}
|
|
|
|
except Exception as e:
|
|
return {
|
|
'amount': None,
|
|
'date': None,
|
|
'merchant': None,
|
|
'raw_text': '',
|
|
'confidence': 'none',
|
|
'success': False,
|
|
'error': str(e)
|
|
}
|
|
|
|
|
|
def extract_amount(text):
|
|
"""
|
|
Extract monetary amount from text
|
|
Supports multiple formats: $10.99, 10.99, 10,99, etc.
|
|
"""
|
|
# Common patterns for amounts
|
|
patterns = [
|
|
r'(?:total|suma|amount|subtotal|plata)[\s:]*[\$€£]?\s*(\d{1,6}[.,]\d{2})', # Total: $10.99
|
|
r'[\$€£]\s*(\d{1,6}[.,]\d{2})', # $10.99
|
|
r'(\d{1,6}[.,]\d{2})\s*(?:RON|USD|EUR|GBP|lei)', # 10.99 RON
|
|
r'(?:^|\s)(\d{1,6}[.,]\d{2})(?:\s|$)', # Standalone 10.99
|
|
]
|
|
|
|
amounts = []
|
|
for pattern in patterns:
|
|
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
|
|
for match in matches:
|
|
# Normalize comma to dot
|
|
amount_str = match.replace(',', '.')
|
|
try:
|
|
amount = float(amount_str)
|
|
if 0.01 <= amount <= 999999: # Reasonable range
|
|
amounts.append(amount)
|
|
except ValueError:
|
|
continue
|
|
|
|
if amounts:
|
|
# Return the largest amount (usually the total)
|
|
return max(amounts)
|
|
|
|
return None
|
|
|
|
|
|
def extract_date(text):
|
|
"""
|
|
Extract date from text
|
|
Supports multiple formats: DD/MM/YYYY, MM-DD-YYYY, DD.MM.YYYY, etc.
|
|
"""
|
|
# Common date patterns
|
|
date_patterns = [
|
|
r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', # DD/MM/YYYY, MM-DD-YYYY
|
|
r'\d{1,2}\.\d{1,2}\.\d{2,4}', # DD.MM.YYYY
|
|
r'\d{4}[/-]\d{1,2}[/-]\d{1,2}', # YYYY-MM-DD
|
|
r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}', # Jan 15, 2024
|
|
r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}', # 15 Jan 2024
|
|
]
|
|
|
|
dates = []
|
|
for pattern in date_patterns:
|
|
matches = re.findall(pattern, text, re.IGNORECASE)
|
|
for match in matches:
|
|
try:
|
|
# Try to parse the date
|
|
parsed_date = date_parser.parse(match, fuzzy=True)
|
|
|
|
# Only accept dates within reasonable range
|
|
if datetime(2000, 1, 1) <= parsed_date <= datetime.now():
|
|
dates.append(parsed_date)
|
|
except (ValueError, date_parser.ParserError):
|
|
continue
|
|
|
|
if dates:
|
|
# Return the most recent date (likely the transaction date)
|
|
return max(dates)
|
|
|
|
return None
|
|
|
|
|
|
def extract_merchant(text):
|
|
"""
|
|
Extract merchant/store name from text
|
|
Usually appears at the top of the receipt
|
|
"""
|
|
lines = text.strip().split('\n')
|
|
|
|
# Look at first few lines for merchant name
|
|
for i, line in enumerate(lines[:5]):
|
|
line = line.strip()
|
|
|
|
# Skip very short lines
|
|
if len(line) < 3:
|
|
continue
|
|
|
|
# Skip lines that look like addresses or numbers
|
|
if re.match(r'^[\d\s\.,]+$', line):
|
|
continue
|
|
|
|
# Skip common keywords
|
|
if re.match(r'^(receipt|factura|bon|total|date|time)', line, re.IGNORECASE):
|
|
continue
|
|
|
|
# If line has letters and reasonable length, likely merchant
|
|
if re.search(r'[a-zA-Z]{3,}', line) and 3 <= len(line) <= 50:
|
|
# Clean up the line
|
|
cleaned = re.sub(r'[^\w\s-]', ' ', line)
|
|
cleaned = ' '.join(cleaned.split())
|
|
|
|
if cleaned:
|
|
return cleaned
|
|
|
|
return None
|
|
|
|
|
|
def calculate_confidence(amount, date, merchant, text):
|
|
"""
|
|
Calculate confidence level of extraction
|
|
|
|
Returns: 'high', 'medium', 'low', or 'none'
|
|
"""
|
|
found_count = sum([
|
|
amount is not None,
|
|
date is not None,
|
|
merchant is not None
|
|
])
|
|
|
|
# Check text quality
|
|
text_quality = len(text.strip()) > 50 and len(text.split()) > 10
|
|
|
|
if found_count == 3 and text_quality:
|
|
return 'high'
|
|
elif found_count >= 2:
|
|
return 'medium'
|
|
elif found_count >= 1:
|
|
return 'low'
|
|
else:
|
|
return 'none'
|
|
|
|
|
|
def preprocess_image_for_ocr(image_path, output_path=None):
|
|
"""
|
|
Preprocess image to improve OCR accuracy
|
|
|
|
Args:
|
|
image_path: Path to original image
|
|
output_path: Path to save preprocessed image (optional)
|
|
|
|
Returns:
|
|
PIL Image object
|
|
"""
|
|
from PIL import ImageEnhance, ImageFilter
|
|
|
|
image = Image.open(image_path)
|
|
|
|
# Convert to grayscale
|
|
image = image.convert('L')
|
|
|
|
# Increase contrast
|
|
enhancer = ImageEnhance.Contrast(image)
|
|
image = enhancer.enhance(2.0)
|
|
|
|
# Sharpen image
|
|
image = image.filter(ImageFilter.SHARPEN)
|
|
|
|
# Apply threshold (binarization)
|
|
threshold = 128
|
|
image = image.point(lambda p: 255 if p > threshold else 0)
|
|
|
|
if output_path:
|
|
image.save(output_path)
|
|
|
|
return image
|
|
|
|
|
|
def is_valid_receipt_image(image_path):
|
|
"""
|
|
Validate that uploaded file is a valid image
|
|
|
|
Security check to prevent malicious files
|
|
"""
|
|
try:
|
|
image = Image.open(image_path)
|
|
image.verify()
|
|
|
|
# Check file size (max 10MB)
|
|
file_size = os.path.getsize(image_path)
|
|
if file_size > 10 * 1024 * 1024:
|
|
return False, "File too large (max 10MB)"
|
|
|
|
# Check image dimensions (reasonable receipt size)
|
|
image = Image.open(image_path)
|
|
width, height = image.size
|
|
if width < 100 or height < 100:
|
|
return False, "Image too small"
|
|
if width > 8000 or height > 8000:
|
|
return False, "Image too large"
|
|
|
|
# Check format
|
|
if image.format not in ['JPEG', 'PNG', 'JPG']:
|
|
return False, "Unsupported format (use JPEG or PNG)"
|
|
|
|
return True, "Valid"
|
|
|
|
except Exception as e:
|
|
return False, f"Invalid image: {str(e)}"
|
|
|
|
|
|
def extract_receipt_data_batch(image_paths):
|
|
"""
|
|
Process multiple receipt images in batch
|
|
|
|
Args:
|
|
image_paths: List of image file paths
|
|
|
|
Returns:
|
|
List of extraction results
|
|
"""
|
|
results = []
|
|
for path in image_paths:
|
|
result = extract_receipt_data(path)
|
|
result['file_path'] = path
|
|
results.append(result)
|
|
return results
|
|
|
|
|
|
def format_extraction_summary(data):
|
|
"""
|
|
Format extracted data for display
|
|
|
|
Returns: Human-readable string
|
|
"""
|
|
lines = []
|
|
|
|
if data.get('merchant'):
|
|
lines.append(f"🏪 Merchant: {data['merchant']}")
|
|
|
|
if data.get('amount'):
|
|
lines.append(f"💰 Amount: {data['amount']:.2f}")
|
|
|
|
if data.get('date'):
|
|
lines.append(f"📅 Date: {data['date'].strftime('%Y-%m-%d')}")
|
|
|
|
if data.get('confidence'):
|
|
confidence_emoji = {
|
|
'high': '✅',
|
|
'medium': '⚠️',
|
|
'low': '❌',
|
|
'none': '❌'
|
|
}
|
|
emoji = confidence_emoji.get(data['confidence'], '❓')
|
|
lines.append(f"{emoji} Confidence: {data['confidence'].title()}")
|
|
|
|
return '\n'.join(lines) if lines else "No data extracted"
|