fina/app/auto_tagger.py

"""
Smart Auto-Tagging Utility
Automatically generates tags for expenses based on OCR text and description
"""
import re
from typing import List, Dict

# Tag patterns for auto-detection
TAG_PATTERNS = {
    # Food & Dining
    'dining': {
        'keywords': ['restaurant', 'cafe', 'bistro', 'diner', 'eatery', 'pizz', 'burger', 'sushi', 
                    'food court', 'takeout', 'delivery', 'uber eats', 'doordash', 'grubhub', 'postmates',
                    'restaurante', 'pizzeria', 'fast food', 'kfc', 'mcdonald', 'subway', 'starbucks'],
        'color': '#10b981',
        'icon': 'restaurant'
    },
    'groceries': {
        'keywords': ['supermarket', 'grocery', 'market', 'walmart', 'kroger', 'whole foods', 'trader joe',
                    'safeway', 'costco', 'aldi', 'lidl', 'carrefour', 'tesco', 'fresh', 'produce',
                    'kaufland', 'mega image', 'penny'],
        'color': '#22c55e',
        'icon': 'shopping_cart'
    },
    'coffee': {
        'keywords': ['coffee', 'starbucks', 'cafe', 'caffè', 'espresso', 'latte', 'cappuccino'],
        'color': '#92400e',
        'icon': 'coffee'
    },
    
    # Transportation
    'gas': {
        'keywords': ['gas station', 'fuel', 'petrol', 'shell', 'bp', 'exxon', 'chevron', 'mobil', 
                    'texaco', 'station', 'benzinarie', 'combustibil', 'petrom', 'omv', 'lukoil'],
        'color': '#ef4444',
        'icon': 'local_gas_station'
    },
    'parking': {
        'keywords': ['parking', 'garage', 'parcare', 'parcomat'],
        'color': '#f97316',
        'icon': 'local_parking'
    },
    'transport': {
        'keywords': ['uber', 'lyft', 'taxi', 'cab', 'bus', 'metro', 'train', 'subway', 'transit',
                    'bolt', 'autobuz', 'metrou', 'tren', 'ratb'],
        'color': '#3b82f6',
        'icon': 'directions_car'
    },
    
    # Shopping
    'online-shopping': {
        'keywords': ['amazon', 'ebay', 'aliexpress', 'online', 'emag', 'altex', 'flanco'],
        'color': '#a855f7',
        'icon': 'shopping_bag'
    },
    'clothing': {
        'keywords': ['clothing', 'fashion', 'apparel', 'zara', 'h&m', 'nike', 'adidas', 
                    'levi', 'gap', 'imbracaminte', 'haine'],
        'color': '#ec4899',
        'icon': 'checkroom'
    },
    'electronics': {
        'keywords': ['electronics', 'apple', 'samsung', 'sony', 'best buy', 'media markt'],
        'color': '#6366f1',
        'icon': 'devices'
    },
    
    # Entertainment
    'entertainment': {
        'keywords': ['movie', 'cinema', 'theater', 'concert', 'show', 'ticket', 'netflix', 'spotify',
                    'hbo', 'disney', 'entertainment', 'cinema city', 'teatru', 'concert'],
        'color': '#8b5cf6',
        'icon': 'movie'
    },
    'gym': {
        'keywords': ['gym', 'fitness', 'workout', 'sport', 'sala', 'world class'],
        'color': '#14b8a6',
        'icon': 'fitness_center'
    },
    
    # Bills & Utilities
    'electricity': {
        'keywords': ['electric', 'power', 'energie', 'enel', 'electrica'],
        'color': '#fbbf24',
        'icon': 'bolt'
    },
    'water': {
        'keywords': ['water', 'apa', 'aqua'],
        'color': '#06b6d4',
        'icon': 'water_drop'
    },
    'internet': {
        'keywords': ['internet', 'broadband', 'wifi', 'fiber', 'digi', 'upc', 'telekom', 'orange', 'vodafone'],
        'color': '#3b82f6',
        'icon': 'wifi'
    },
    'phone': {
        'keywords': ['phone', 'mobile', 'cellular', 'telefon', 'abonament'],
        'color': '#8b5cf6',
        'icon': 'phone_iphone'
    },
    'subscription': {
        'keywords': ['subscription', 'abonament', 'monthly', 'recurring'],
        'color': '#f59e0b',
        'icon': 'repeat'
    },
    
    # Healthcare
    'pharmacy': {
        'keywords': ['pharmacy', 'farmacie', 'drug', 'cvs', 'walgreens', 'catena', 'help net', 'sensiblu'],
        'color': '#ef4444',
        'icon': 'local_pharmacy'
    },
    'medical': {
        'keywords': ['doctor', 'hospital', 'clinic', 'medical', 'health', 'dental', 'spital', 'clinica'],
        'color': '#dc2626',
        'icon': 'medical_services'
    },
    
    # Other
    'insurance': {
        'keywords': ['insurance', 'asigurare', 'policy'],
        'color': '#64748b',
        'icon': 'shield'
    },
    'education': {
        'keywords': ['school', 'university', 'course', 'tuition', 'book', 'educatie', 'scoala', 'universitate'],
        'color': '#06b6d4',
        'icon': 'school'
    },
    'pet': {
        'keywords': ['pet', 'vet', 'veterinar', 'animal'],
        'color': '#f97316',
        'icon': 'pets'
    },
}


def extract_tags_from_text(text: str, max_tags: int = 5) -> List[Dict[str, str]]:
    """
    Extract relevant tags from OCR text or description
    
    Args:
        text: The text to analyze (OCR text or expense description)
        max_tags: Maximum number of tags to return
        
    Returns:
        List of tag dictionaries with name, color, and icon
    """
    if not text:
        return []
    
    # Normalize text: lowercase and remove special characters
    normalized_text = text.lower()
    normalized_text = re.sub(r'[^\w\s]', ' ', normalized_text)
    
    detected_tags = []
    
    # Check each pattern
    for tag_name, pattern_info in TAG_PATTERNS.items():
        for keyword in pattern_info['keywords']:
            # Use word boundary matching for better accuracy
            if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', normalized_text):
                detected_tags.append({
                    'name': tag_name,
                    'color': pattern_info['color'],
                    'icon': pattern_info['icon']
                })
                break  # Don't add the same tag multiple times
    
    # Remove duplicates and limit to max_tags
    unique_tags = []
    seen = set()
    for tag in detected_tags:
        if tag['name'] not in seen:
            seen.add(tag['name'])
            unique_tags.append(tag)
            if len(unique_tags) >= max_tags:
                break
    
    return unique_tags


def suggest_tags_for_expense(description: str, ocr_text: str = None, category_name: str = None) -> List[Dict[str, str]]:
    """
    Suggest tags for an expense based on description, OCR text, and category
    
    Args:
        description: The expense description
        ocr_text: OCR text from receipt (if available)
        category_name: The category name (if available)
        
    Returns:
        List of suggested tag dictionaries
    """
    all_text = description
    
    # Combine all available text
    if ocr_text:
        all_text += " " + ocr_text
    if category_name:
        all_text += " " + category_name
    
    return extract_tags_from_text(all_text, max_tags=3)


def get_tag_suggestions() -> Dict[str, List[str]]:
    """
    Get all available tag patterns for UI display
    
    Returns:
        Dictionary of tag names to their keywords
    """
    suggestions = {}
    for tag_name, pattern_info in TAG_PATTERNS.items():
        suggestions[tag_name] = {
            'keywords': pattern_info['keywords'][:5],  # Show first 5 keywords
            'color': pattern_info['color'],
            'icon': pattern_info['icon']
        }
    return suggestions
Initial commit 2025-12-26 00:52:56 +00:00			`"""`
			`Smart Auto-Tagging Utility`
			`Automatically generates tags for expenses based on OCR text and description`
			`"""`
			`import re`
			`from typing import List, Dict`

			`# Tag patterns for auto-detection`
			`TAG_PATTERNS = {`
			`# Food & Dining`
			`'dining': {`
			`'keywords': ['restaurant', 'cafe', 'bistro', 'diner', 'eatery', 'pizz', 'burger', 'sushi',`
			`'food court', 'takeout', 'delivery', 'uber eats', 'doordash', 'grubhub', 'postmates',`
			`'restaurante', 'pizzeria', 'fast food', 'kfc', 'mcdonald', 'subway', 'starbucks'],`
			`'color': '#10b981',`
			`'icon': 'restaurant'`
			`},`
			`'groceries': {`
			`'keywords': ['supermarket', 'grocery', 'market', 'walmart', 'kroger', 'whole foods', 'trader joe',`
			`'safeway', 'costco', 'aldi', 'lidl', 'carrefour', 'tesco', 'fresh', 'produce',`
			`'kaufland', 'mega image', 'penny'],`
			`'color': '#22c55e',`
			`'icon': 'shopping_cart'`
			`},`
			`'coffee': {`
			`'keywords': ['coffee', 'starbucks', 'cafe', 'caffè', 'espresso', 'latte', 'cappuccino'],`
			`'color': '#92400e',`
			`'icon': 'coffee'`
			`},`

			`# Transportation`
			`'gas': {`
			`'keywords': ['gas station', 'fuel', 'petrol', 'shell', 'bp', 'exxon', 'chevron', 'mobil',`
			`'texaco', 'station', 'benzinarie', 'combustibil', 'petrom', 'omv', 'lukoil'],`
			`'color': '#ef4444',`
			`'icon': 'local_gas_station'`
			`},`
			`'parking': {`
			`'keywords': ['parking', 'garage', 'parcare', 'parcomat'],`
			`'color': '#f97316',`
			`'icon': 'local_parking'`
			`},`
			`'transport': {`
			`'keywords': ['uber', 'lyft', 'taxi', 'cab', 'bus', 'metro', 'train', 'subway', 'transit',`
			`'bolt', 'autobuz', 'metrou', 'tren', 'ratb'],`
			`'color': '#3b82f6',`
			`'icon': 'directions_car'`
			`},`

			`# Shopping`
			`'online-shopping': {`
			`'keywords': ['amazon', 'ebay', 'aliexpress', 'online', 'emag', 'altex', 'flanco'],`
			`'color': '#a855f7',`
			`'icon': 'shopping_bag'`
			`},`
			`'clothing': {`
			`'keywords': ['clothing', 'fashion', 'apparel', 'zara', 'h&m', 'nike', 'adidas',`
			`'levi', 'gap', 'imbracaminte', 'haine'],`
			`'color': '#ec4899',`
			`'icon': 'checkroom'`
			`},`
			`'electronics': {`
			`'keywords': ['electronics', 'apple', 'samsung', 'sony', 'best buy', 'media markt'],`
			`'color': '#6366f1',`
			`'icon': 'devices'`
			`},`

			`# Entertainment`
			`'entertainment': {`
			`'keywords': ['movie', 'cinema', 'theater', 'concert', 'show', 'ticket', 'netflix', 'spotify',`
			`'hbo', 'disney', 'entertainment', 'cinema city', 'teatru', 'concert'],`
			`'color': '#8b5cf6',`
			`'icon': 'movie'`
			`},`
			`'gym': {`
			`'keywords': ['gym', 'fitness', 'workout', 'sport', 'sala', 'world class'],`
			`'color': '#14b8a6',`
			`'icon': 'fitness_center'`
			`},`

			`# Bills & Utilities`
			`'electricity': {`
			`'keywords': ['electric', 'power', 'energie', 'enel', 'electrica'],`
			`'color': '#fbbf24',`
			`'icon': 'bolt'`
			`},`
			`'water': {`
			`'keywords': ['water', 'apa', 'aqua'],`
			`'color': '#06b6d4',`
			`'icon': 'water_drop'`
			`},`
			`'internet': {`
			`'keywords': ['internet', 'broadband', 'wifi', 'fiber', 'digi', 'upc', 'telekom', 'orange', 'vodafone'],`
			`'color': '#3b82f6',`
			`'icon': 'wifi'`
			`},`
			`'phone': {`
			`'keywords': ['phone', 'mobile', 'cellular', 'telefon', 'abonament'],`
			`'color': '#8b5cf6',`
			`'icon': 'phone_iphone'`
			`},`
			`'subscription': {`
			`'keywords': ['subscription', 'abonament', 'monthly', 'recurring'],`
			`'color': '#f59e0b',`
			`'icon': 'repeat'`
			`},`

			`# Healthcare`
			`'pharmacy': {`
			`'keywords': ['pharmacy', 'farmacie', 'drug', 'cvs', 'walgreens', 'catena', 'help net', 'sensiblu'],`
			`'color': '#ef4444',`
			`'icon': 'local_pharmacy'`
			`},`
			`'medical': {`
			`'keywords': ['doctor', 'hospital', 'clinic', 'medical', 'health', 'dental', 'spital', 'clinica'],`
			`'color': '#dc2626',`
			`'icon': 'medical_services'`
			`},`

			`# Other`
			`'insurance': {`
			`'keywords': ['insurance', 'asigurare', 'policy'],`
			`'color': '#64748b',`
			`'icon': 'shield'`
			`},`
			`'education': {`
			`'keywords': ['school', 'university', 'course', 'tuition', 'book', 'educatie', 'scoala', 'universitate'],`
			`'color': '#06b6d4',`
			`'icon': 'school'`
			`},`
			`'pet': {`
			`'keywords': ['pet', 'vet', 'veterinar', 'animal'],`
			`'color': '#f97316',`
			`'icon': 'pets'`
			`},`
			`}`


			`def extract_tags_from_text(text: str, max_tags: int = 5) -> List[Dict[str, str]]:`
			`"""`
			`Extract relevant tags from OCR text or description`

			`Args:`
			`text: The text to analyze (OCR text or expense description)`
			`max_tags: Maximum number of tags to return`

			`Returns:`
			`List of tag dictionaries with name, color, and icon`
			`"""`
			`if not text:`
			`return []`

			`# Normalize text: lowercase and remove special characters`
			`normalized_text = text.lower()`
			`normalized_text = re.sub(r'[^\w\s]', ' ', normalized_text)`

			`detected_tags = []`

			`# Check each pattern`
			`for tag_name, pattern_info in TAG_PATTERNS.items():`
			`for keyword in pattern_info['keywords']:`
			`# Use word boundary matching for better accuracy`
			`if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', normalized_text):`
			`detected_tags.append({`
			`'name': tag_name,`
			`'color': pattern_info['color'],`
			`'icon': pattern_info['icon']`
			`})`
			`break # Don't add the same tag multiple times`

			`# Remove duplicates and limit to max_tags`
			`unique_tags = []`
			`seen = set()`
			`for tag in detected_tags:`
			`if tag['name'] not in seen:`
			`seen.add(tag['name'])`
			`unique_tags.append(tag)`
			`if len(unique_tags) >= max_tags:`
			`break`

			`return unique_tags`


			`def suggest_tags_for_expense(description: str, ocr_text: str = None, category_name: str = None) -> List[Dict[str, str]]:`
			`"""`
			`Suggest tags for an expense based on description, OCR text, and category`

			`Args:`
			`description: The expense description`
			`ocr_text: OCR text from receipt (if available)`
			`category_name: The category name (if available)`

			`Returns:`
			`List of suggested tag dictionaries`
			`"""`
			`all_text = description`

			`# Combine all available text`
			`if ocr_text:`
			`all_text += " " + ocr_text`
			`if category_name:`
			`all_text += " " + category_name`

			`return extract_tags_from_text(all_text, max_tags=3)`


			`def get_tag_suggestions() -> Dict[str, List[str]]:`
			`"""`
			`Get all available tag patterns for UI display`

			`Returns:`
			`Dictionary of tag names to their keywords`
			`"""`
			`suggestions = {}`
			`for tag_name, pattern_info in TAG_PATTERNS.items():`
			`suggestions[tag_name] = {`
			`'keywords': pattern_info['keywords'][:5], # Show first 5 keywords`
			`'color': pattern_info['color'],`
			`'icon': pattern_info['icon']`
			`}`
			`return suggestions`