222 lines
7.1 KiB
Python
222 lines
7.1 KiB
Python
|
|
"""
|
||
|
|
Smart Auto-Tagging Utility
|
||
|
|
Automatically generates tags for expenses based on OCR text and description
|
||
|
|
"""
|
||
|
|
import re
|
||
|
|
from typing import List, Dict
|
||
|
|
|
||
|
|
# Tag patterns for auto-detection
|
||
|
|
TAG_PATTERNS = {
|
||
|
|
# Food & Dining
|
||
|
|
'dining': {
|
||
|
|
'keywords': ['restaurant', 'cafe', 'bistro', 'diner', 'eatery', 'pizz', 'burger', 'sushi',
|
||
|
|
'food court', 'takeout', 'delivery', 'uber eats', 'doordash', 'grubhub', 'postmates',
|
||
|
|
'restaurante', 'pizzeria', 'fast food', 'kfc', 'mcdonald', 'subway', 'starbucks'],
|
||
|
|
'color': '#10b981',
|
||
|
|
'icon': 'restaurant'
|
||
|
|
},
|
||
|
|
'groceries': {
|
||
|
|
'keywords': ['supermarket', 'grocery', 'market', 'walmart', 'kroger', 'whole foods', 'trader joe',
|
||
|
|
'safeway', 'costco', 'aldi', 'lidl', 'carrefour', 'tesco', 'fresh', 'produce',
|
||
|
|
'kaufland', 'mega image', 'penny'],
|
||
|
|
'color': '#22c55e',
|
||
|
|
'icon': 'shopping_cart'
|
||
|
|
},
|
||
|
|
'coffee': {
|
||
|
|
'keywords': ['coffee', 'starbucks', 'cafe', 'caffè', 'espresso', 'latte', 'cappuccino'],
|
||
|
|
'color': '#92400e',
|
||
|
|
'icon': 'coffee'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Transportation
|
||
|
|
'gas': {
|
||
|
|
'keywords': ['gas station', 'fuel', 'petrol', 'shell', 'bp', 'exxon', 'chevron', 'mobil',
|
||
|
|
'texaco', 'station', 'benzinarie', 'combustibil', 'petrom', 'omv', 'lukoil'],
|
||
|
|
'color': '#ef4444',
|
||
|
|
'icon': 'local_gas_station'
|
||
|
|
},
|
||
|
|
'parking': {
|
||
|
|
'keywords': ['parking', 'garage', 'parcare', 'parcomat'],
|
||
|
|
'color': '#f97316',
|
||
|
|
'icon': 'local_parking'
|
||
|
|
},
|
||
|
|
'transport': {
|
||
|
|
'keywords': ['uber', 'lyft', 'taxi', 'cab', 'bus', 'metro', 'train', 'subway', 'transit',
|
||
|
|
'bolt', 'autobuz', 'metrou', 'tren', 'ratb'],
|
||
|
|
'color': '#3b82f6',
|
||
|
|
'icon': 'directions_car'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Shopping
|
||
|
|
'online-shopping': {
|
||
|
|
'keywords': ['amazon', 'ebay', 'aliexpress', 'online', 'emag', 'altex', 'flanco'],
|
||
|
|
'color': '#a855f7',
|
||
|
|
'icon': 'shopping_bag'
|
||
|
|
},
|
||
|
|
'clothing': {
|
||
|
|
'keywords': ['clothing', 'fashion', 'apparel', 'zara', 'h&m', 'nike', 'adidas',
|
||
|
|
'levi', 'gap', 'imbracaminte', 'haine'],
|
||
|
|
'color': '#ec4899',
|
||
|
|
'icon': 'checkroom'
|
||
|
|
},
|
||
|
|
'electronics': {
|
||
|
|
'keywords': ['electronics', 'apple', 'samsung', 'sony', 'best buy', 'media markt'],
|
||
|
|
'color': '#6366f1',
|
||
|
|
'icon': 'devices'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Entertainment
|
||
|
|
'entertainment': {
|
||
|
|
'keywords': ['movie', 'cinema', 'theater', 'concert', 'show', 'ticket', 'netflix', 'spotify',
|
||
|
|
'hbo', 'disney', 'entertainment', 'cinema city', 'teatru', 'concert'],
|
||
|
|
'color': '#8b5cf6',
|
||
|
|
'icon': 'movie'
|
||
|
|
},
|
||
|
|
'gym': {
|
||
|
|
'keywords': ['gym', 'fitness', 'workout', 'sport', 'sala', 'world class'],
|
||
|
|
'color': '#14b8a6',
|
||
|
|
'icon': 'fitness_center'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Bills & Utilities
|
||
|
|
'electricity': {
|
||
|
|
'keywords': ['electric', 'power', 'energie', 'enel', 'electrica'],
|
||
|
|
'color': '#fbbf24',
|
||
|
|
'icon': 'bolt'
|
||
|
|
},
|
||
|
|
'water': {
|
||
|
|
'keywords': ['water', 'apa', 'aqua'],
|
||
|
|
'color': '#06b6d4',
|
||
|
|
'icon': 'water_drop'
|
||
|
|
},
|
||
|
|
'internet': {
|
||
|
|
'keywords': ['internet', 'broadband', 'wifi', 'fiber', 'digi', 'upc', 'telekom', 'orange', 'vodafone'],
|
||
|
|
'color': '#3b82f6',
|
||
|
|
'icon': 'wifi'
|
||
|
|
},
|
||
|
|
'phone': {
|
||
|
|
'keywords': ['phone', 'mobile', 'cellular', 'telefon', 'abonament'],
|
||
|
|
'color': '#8b5cf6',
|
||
|
|
'icon': 'phone_iphone'
|
||
|
|
},
|
||
|
|
'subscription': {
|
||
|
|
'keywords': ['subscription', 'abonament', 'monthly', 'recurring'],
|
||
|
|
'color': '#f59e0b',
|
||
|
|
'icon': 'repeat'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Healthcare
|
||
|
|
'pharmacy': {
|
||
|
|
'keywords': ['pharmacy', 'farmacie', 'drug', 'cvs', 'walgreens', 'catena', 'help net', 'sensiblu'],
|
||
|
|
'color': '#ef4444',
|
||
|
|
'icon': 'local_pharmacy'
|
||
|
|
},
|
||
|
|
'medical': {
|
||
|
|
'keywords': ['doctor', 'hospital', 'clinic', 'medical', 'health', 'dental', 'spital', 'clinica'],
|
||
|
|
'color': '#dc2626',
|
||
|
|
'icon': 'medical_services'
|
||
|
|
},
|
||
|
|
|
||
|
|
# Other
|
||
|
|
'insurance': {
|
||
|
|
'keywords': ['insurance', 'asigurare', 'policy'],
|
||
|
|
'color': '#64748b',
|
||
|
|
'icon': 'shield'
|
||
|
|
},
|
||
|
|
'education': {
|
||
|
|
'keywords': ['school', 'university', 'course', 'tuition', 'book', 'educatie', 'scoala', 'universitate'],
|
||
|
|
'color': '#06b6d4',
|
||
|
|
'icon': 'school'
|
||
|
|
},
|
||
|
|
'pet': {
|
||
|
|
'keywords': ['pet', 'vet', 'veterinar', 'animal'],
|
||
|
|
'color': '#f97316',
|
||
|
|
'icon': 'pets'
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
def extract_tags_from_text(text: str, max_tags: int = 5) -> List[Dict[str, str]]:
|
||
|
|
"""
|
||
|
|
Extract relevant tags from OCR text or description
|
||
|
|
|
||
|
|
Args:
|
||
|
|
text: The text to analyze (OCR text or expense description)
|
||
|
|
max_tags: Maximum number of tags to return
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
List of tag dictionaries with name, color, and icon
|
||
|
|
"""
|
||
|
|
if not text:
|
||
|
|
return []
|
||
|
|
|
||
|
|
# Normalize text: lowercase and remove special characters
|
||
|
|
normalized_text = text.lower()
|
||
|
|
normalized_text = re.sub(r'[^\w\s]', ' ', normalized_text)
|
||
|
|
|
||
|
|
detected_tags = []
|
||
|
|
|
||
|
|
# Check each pattern
|
||
|
|
for tag_name, pattern_info in TAG_PATTERNS.items():
|
||
|
|
for keyword in pattern_info['keywords']:
|
||
|
|
# Use word boundary matching for better accuracy
|
||
|
|
if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', normalized_text):
|
||
|
|
detected_tags.append({
|
||
|
|
'name': tag_name,
|
||
|
|
'color': pattern_info['color'],
|
||
|
|
'icon': pattern_info['icon']
|
||
|
|
})
|
||
|
|
break # Don't add the same tag multiple times
|
||
|
|
|
||
|
|
# Remove duplicates and limit to max_tags
|
||
|
|
unique_tags = []
|
||
|
|
seen = set()
|
||
|
|
for tag in detected_tags:
|
||
|
|
if tag['name'] not in seen:
|
||
|
|
seen.add(tag['name'])
|
||
|
|
unique_tags.append(tag)
|
||
|
|
if len(unique_tags) >= max_tags:
|
||
|
|
break
|
||
|
|
|
||
|
|
return unique_tags
|
||
|
|
|
||
|
|
|
||
|
|
def suggest_tags_for_expense(description: str, ocr_text: str = None, category_name: str = None) -> List[Dict[str, str]]:
|
||
|
|
"""
|
||
|
|
Suggest tags for an expense based on description, OCR text, and category
|
||
|
|
|
||
|
|
Args:
|
||
|
|
description: The expense description
|
||
|
|
ocr_text: OCR text from receipt (if available)
|
||
|
|
category_name: The category name (if available)
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
List of suggested tag dictionaries
|
||
|
|
"""
|
||
|
|
all_text = description
|
||
|
|
|
||
|
|
# Combine all available text
|
||
|
|
if ocr_text:
|
||
|
|
all_text += " " + ocr_text
|
||
|
|
if category_name:
|
||
|
|
all_text += " " + category_name
|
||
|
|
|
||
|
|
return extract_tags_from_text(all_text, max_tags=3)
|
||
|
|
|
||
|
|
|
||
|
|
def get_tag_suggestions() -> Dict[str, List[str]]:
|
||
|
|
"""
|
||
|
|
Get all available tag patterns for UI display
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Dictionary of tag names to their keywords
|
||
|
|
"""
|
||
|
|
suggestions = {}
|
||
|
|
for tag_name, pattern_info in TAG_PATTERNS.items():
|
||
|
|
suggestions[tag_name] = {
|
||
|
|
'keywords': pattern_info['keywords'][:5], # Show first 5 keywords
|
||
|
|
'color': pattern_info['color'],
|
||
|
|
'icon': pattern_info['icon']
|
||
|
|
}
|
||
|
|
return suggestions
|