fina/app/routes/csv_import.py
2025-12-26 00:52:56 +00:00

609 lines
22 KiB
Python

"""
CSV/Bank Statement Import Routes for FINA
Handles file upload, parsing, duplicate detection, and category mapping
"""
import csv
import hashlib
import io
import json
import re
from datetime import datetime, timedelta
from decimal import Decimal

from flask import Blueprint, jsonify, request
from flask_login import current_user, login_required
from sqlalchemy import and_, or_
from werkzeug.utils import secure_filename

from app import db
from app.models import Category, Expense
bp = Blueprint('csv_import', __name__, url_prefix='/api/import')
class CSVParser:
    """Parse CSV bank-statement files with auto-detection of encoding,
    delimiter and column layout.

    Row-level problems are accumulated in ``self.errors`` instead of
    aborting the whole import; a single parser instance is intended to
    be used for one file.
    """

    def __init__(self):
        # Human-readable per-row error messages collected by parse_csv().
        self.errors = []

    def detect_delimiter(self, sample):
        """Return the most frequent candidate delimiter found in *sample*.

        Falls back to ',' when no candidate occurs (max() then picks the
        first candidate, whose count is 0).
        """
        delimiters = [',', ';', '\t', '|']
        counts = {d: sample.count(d) for d in delimiters}
        return max(counts, key=counts.get)

    def detect_encoding(self, file_bytes):
        """Return the first candidate encoding that decodes *file_bytes*.

        Defaults to 'utf-8' when every candidate fails (the later decode
        will then surface the error).
        """
        encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                file_bytes.decode(encoding)
                return encoding
            except UnicodeDecodeError:
                continue
        return 'utf-8'

    @staticmethod
    def _header_matches(name, keywords):
        """Return True if header *name* matches any of *keywords*.

        Fix: very short keywords ('in', 'out') must match a whole word so
        that headers like 'posting date' are not mistaken for a credit
        column ('postING'); longer keywords keep substring semantics.
        """
        words = name.split()
        for keyword in keywords:
            if len(keyword) <= 3 and ' ' not in keyword:
                if keyword in words:
                    return True
            elif keyword in name:
                return True
        return False

    def detect_columns(self, headers):
        """Map semantic roles to column indices.

        Returns a dict with keys date/description/amount/debit/credit/
        category; a value of None means the role was not found.
        """
        headers_lower = [h.lower().strip() if h else '' for h in headers]
        mapping = {
            'date': None,
            'description': None,
            'amount': None,
            'debit': None,
            'credit': None,
            'category': None
        }
        # Date column keywords
        date_keywords = ['date', 'data', 'fecha', 'datum', 'transaction date', 'trans date', 'posting date']
        for idx, name in enumerate(headers_lower):
            if self._header_matches(name, date_keywords):
                mapping['date'] = idx
                break
        # Description column keywords - prioritize "name" for merchant/payee names
        # First try to find "name" column (commonly used for merchant/payee)
        for idx, name in enumerate(headers_lower):
            if name == 'name' or 'payee' in name or 'merchant name' in name:
                mapping['description'] = idx
                break
        # If no "name" column, look for other description columns
        if mapping['description'] is None:
            desc_keywords = ['description', 'descriere', 'descripción', 'details', 'detalii', 'merchant',
                             'comerciant', 'narrative', 'memo', 'particulars', 'transaction details']
            for idx, name in enumerate(headers_lower):
                if any(keyword in name for keyword in desc_keywords):
                    mapping['description'] = idx
                    break
        # Category column keywords (optional) - avoid generic "type" column that contains payment types
        # Only use "category" explicitly, not "type" which often contains payment methods
        for idx, name in enumerate(headers_lower):
            if name == 'category' or 'categorie' in name or 'categoría' in name:
                mapping['category'] = idx
                break
        # Amount columns; debit/credit take precedence over a generic amount
        amount_keywords = ['amount', 'suma', 'monto', 'valoare', 'value']
        debit_keywords = ['debit', 'withdrawal', 'retragere', 'spent', 'expense', 'cheltuială', 'out']
        credit_keywords = ['credit', 'deposit', 'depunere', 'income', 'venit', 'in']
        for idx, name in enumerate(headers_lower):
            if self._header_matches(name, debit_keywords):
                mapping['debit'] = idx
            elif self._header_matches(name, credit_keywords):
                mapping['credit'] = idx
            elif self._header_matches(name, amount_keywords) and mapping['amount'] is None:
                mapping['amount'] = idx
        return mapping

    def parse_date(self, date_str):
        """Parse a date string in various formats; return a date or None.

        Day-first formats are tried before month-first ('%d/%m/%Y' wins
        over '%m/%d/%Y' for ambiguous inputs).
        """
        if not date_str or not isinstance(date_str, str):
            return None
        date_str = date_str.strip()
        if not date_str:
            return None
        # Common date formats, most specific/likely first
        formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%Y-%m-%d', '%Y/%m/%d',
            '%d.%m.%Y', '%m/%d/%Y', '%d %b %Y', '%d %B %Y',
            '%Y%m%d', '%d-%b-%Y', '%d-%B-%Y', '%b %d, %Y',
            '%B %d, %Y', '%Y-%m-%d %H:%M:%S', '%d/%m/%Y %H:%M:%S'
        ]
        for fmt in formats:
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        return None

    def parse_amount(self, amount_str):
        """Parse an amount string to a non-negative float (magnitude only).

        Handles currency symbols, European (1.234,56) and US (1,234.56)
        thousands/decimal conventions. Callers that need the sign must
        inspect the raw string (see extract_transaction).
        """
        if not amount_str:
            return 0.0
        if isinstance(amount_str, (int, float)):
            return float(amount_str)
        # Remove currency symbols and spaces
        amount_str = str(amount_str).strip()
        amount_str = re.sub(r'[^\d.,\-+]', '', amount_str)
        if not amount_str or amount_str == '-':
            return 0.0
        try:
            # Handle European format (1.234,56)
            if ',' in amount_str and '.' in amount_str:
                if amount_str.rfind(',') > amount_str.rfind('.'):
                    # European format: 1.234,56
                    amount_str = amount_str.replace('.', '').replace(',', '.')
                else:
                    # US format: 1,234.56
                    amount_str = amount_str.replace(',', '')
            elif ',' in amount_str:
                # Could be European (1,56) or US thousands (1,234)
                parts = amount_str.split(',')
                if len(parts[-1]) == 2:  # Likely European decimal
                    amount_str = amount_str.replace(',', '.')
                else:  # Likely US thousands
                    amount_str = amount_str.replace(',', '')
            return abs(float(amount_str))
        except (ValueError, AttributeError):
            return 0.0

    def parse_csv(self, file_bytes):
        """Parse a CSV file and extract transactions.

        Returns {'success': True, 'transactions': [...], 'total_found': n,
        'column_mapping': {...}, 'errors': [...]} or
        {'success': False, 'error': msg} on a file-level failure.
        """
        try:
            # Detect encoding
            encoding = self.detect_encoding(file_bytes)
            content = file_bytes.decode(encoding)
            # Detect delimiter from the header line only
            first_line = content.split('\n')[0]
            delimiter = self.detect_delimiter(first_line)
            # Parse CSV
            stream = io.StringIO(content)
            reader = csv.reader(stream, delimiter=delimiter)
            # Read headers
            headers = next(reader, None)
            if not headers:
                return {'success': False, 'error': 'CSV file is empty'}
            # Detect column mapping; date is the only hard requirement
            column_map = self.detect_columns(headers)
            if column_map['date'] is None:
                return {'success': False, 'error': 'Could not detect date column. Please ensure your CSV has a date column.'}
            if column_map['description'] is None:
                # Fall back to the second column (or the first for 1-column files)
                column_map['description'] = 1 if len(headers) > 1 else 0
            # Parse transactions; bad rows are recorded, not fatal
            transactions = []
            row_num = 0
            for row in reader:
                row_num += 1
                if not row or len(row) == 0:
                    continue
                try:
                    transaction = self.extract_transaction(row, column_map)
                    if transaction:
                        transactions.append(transaction)
                except Exception as e:
                    self.errors.append(f"Row {row_num}: {str(e)}")
            return {
                'success': True,
                'transactions': transactions,
                'total_found': len(transactions),
                'column_mapping': {k: headers[v] if v is not None else None for k, v in column_map.items()},
                'errors': self.errors
            }
        except Exception as e:
            return {'success': False, 'error': f'Failed to parse CSV: {str(e)}'}

    def extract_transaction(self, row, column_map):
        """Extract one transaction dict from a CSV row, or None to skip.

        Skips rows with no parseable date or a zero amount.
        """
        # Fix: only the date column is mandatory. The previous check
        # (len(row) <= max mapped index) dropped rows that were merely
        # missing an optional trailing column such as category.
        date_idx = column_map['date']
        if date_idx is None or date_idx >= len(row):
            return None
        # Parse date
        trans_date = self.parse_date(row[date_idx])
        if not trans_date:
            return None
        # Parse description
        desc_idx = column_map['description']
        description = row[desc_idx].strip() if desc_idx is not None and desc_idx < len(row) else 'Transaction'
        if not description:
            description = 'Transaction'
        # Parse amount (handle debit/credit or single amount column)
        amount = 0.0
        trans_type = 'expense'
        if column_map['debit'] is not None and column_map['credit'] is not None:
            debit_val = self.parse_amount(row[column_map['debit']] if column_map['debit'] < len(row) else '0')
            credit_val = self.parse_amount(row[column_map['credit']] if column_map['credit'] < len(row) else '0')
            if debit_val > 0:
                amount = debit_val
                trans_type = 'expense'
            elif credit_val > 0:
                amount = credit_val
                trans_type = 'income'
        elif column_map['amount'] is not None:
            raw_amount = row[column_map['amount']] if column_map['amount'] < len(row) else '0'
            amount_val = self.parse_amount(raw_amount)
            amount = amount_val
            # Fix: parse_amount returns the magnitude, so the sign must be
            # read from the raw cell. Previously amount_val < 0 was never
            # true and every single-column transaction became 'income'.
            is_negative = '-' in str(raw_amount)
            trans_type = 'expense' if is_negative or amount_val == 0 else 'income'
        if amount == 0:
            return None
        # Get bank category if available
        bank_category = None
        if column_map['category'] is not None and column_map['category'] < len(row):
            bank_category = row[column_map['category']].strip()
        return {
            'date': trans_date.isoformat(),
            'description': description[:200],  # Limit description length
            'amount': round(amount, 2),
            'type': trans_type,
            'bank_category': bank_category
        }
@bp.route('/parse-csv', methods=['POST'])
@login_required
def parse_csv():
    """
    Parse an uploaded CSV file and return its transactions for review.

    Security: requires authentication; rejects non-CSV filenames and
    uploads larger than 10MB before any parsing happens.
    """
    # Guard clauses: reject missing or empty uploads with distinct messages.
    if 'file' not in request.files:
        return jsonify({'success': False, 'error': 'No file uploaded'}), 400
    upload = request.files['file']
    if not upload or not upload.filename:
        return jsonify({'success': False, 'error': 'No file selected'}), 400
    # Security: sanitize the name and allow only a .csv extension.
    safe_name = secure_filename(upload.filename)
    if not safe_name.lower().endswith('.csv'):
        return jsonify({'success': False, 'error': 'Only CSV files are supported'}), 400
    # Security: cap the payload size (10MB) before parsing.
    raw_bytes = upload.read()
    max_size = 10 * 1024 * 1024
    if len(raw_bytes) > max_size:
        return jsonify({'success': False, 'error': 'File too large. Maximum size is 10MB'}), 400
    # Delegate the heavy lifting to CSVParser.
    result = CSVParser().parse_csv(raw_bytes)
    status = 200 if result['success'] else 400
    return jsonify(result), status
@bp.route('/detect-duplicates', methods=['POST'])
@login_required
def detect_duplicates():
    """
    Check incoming transactions for likely duplicates already stored.

    A duplicate is an existing expense of the same user with the exact
    amount, dated within +/-2 days, whose description shares at least
    half of the incoming description's words.
    Security: only the current user's expenses are searched.
    """
    payload = request.get_json()
    candidates = payload.get('transactions', [])
    if not candidates:
        return jsonify({'success': False, 'error': 'No transactions provided'}), 400
    matches = []
    for trans in candidates:
        try:
            when = datetime.fromisoformat(trans['date']).date()
            value = float(trans['amount'])
            text = trans['description']
            # Candidate window: +/-2 days around the transaction date.
            window_start = when - timedelta(days=2)
            window_end = when + timedelta(days=2)
            # Security: restrict the lookup to the current user's data.
            nearby = Expense.query.filter(
                Expense.user_id == current_user.id,
                Expense.date >= window_start,
                Expense.date <= window_end,
                Expense.amount == value
            ).all()
            new_words = set(text.lower().split())
            if not new_words:
                continue
            for exp in nearby:
                stored_words = set(exp.description.lower().split())
                # Word-overlap ratio relative to the incoming description.
                overlap = len(new_words & stored_words) / len(new_words)
                if overlap >= 0.5:
                    matches.append({
                        'transaction': trans,
                        'existing': {
                            'id': exp.id,
                            'date': exp.date.isoformat(),
                            'description': exp.description,
                            'amount': float(exp.amount),
                            'category': exp.category.name if exp.category else None
                        },
                        'similarity': round(overlap * 100, 0)
                    })
                    break
        except Exception:
            # Best-effort: a malformed entry must not abort the whole scan.
            continue
    return jsonify({
        'success': True,
        'duplicates': matches,
        'duplicate_count': len(matches)
    })
@bp.route('/import', methods=['POST'])
@login_required
def import_transactions():
    """
    Import selected transactions into the database
    Security: Only imports to current user's account, validates all data

    Request JSON: transactions (list of dicts with date/amount/description/
    bank_category), category_mapping (bank-category name or transaction
    index -> category id), skip_duplicates (bool).
    Commits all valid rows in one transaction; per-row problems are
    reported in 'errors' without aborting the whole import.
    """
    data = request.get_json()
    transactions = data.get('transactions', [])
    # Keys may be bank category names or stringified transaction indices.
    category_mapping = data.get('category_mapping', {})
    skip_duplicates = data.get('skip_duplicates', False)
    if not transactions:
        return jsonify({'success': False, 'error': 'No transactions to import'}), 400
    imported = []
    skipped = []
    errors = []
    # Security: Get user's categories (id -> Category) for ownership checks
    user_categories = {cat.id: cat for cat in Category.query.filter_by(user_id=current_user.id).all()}
    if not user_categories:
        return jsonify({'success': False, 'error': 'No categories found. Please create categories first.'}), 400
    # Get default category (first of the user's categories, insertion order)
    default_category_id = list(user_categories.keys())[0]
    for idx, trans in enumerate(transactions):
        try:
            # Skip if marked as duplicate (client sets 'is_duplicate')
            if skip_duplicates and trans.get('is_duplicate'):
                skipped.append({'transaction': trans, 'reason': 'Duplicate'})
                continue
            # Parse and validate data; each failure records an error and
            # moves on to the next transaction
            try:
                trans_date = datetime.fromisoformat(trans['date']).date()
            except (ValueError, KeyError) as e:
                errors.append({'transaction': trans, 'error': f'Invalid date: {trans.get("date", "missing")}'})
                continue
            try:
                amount = float(trans['amount'])
            except (ValueError, KeyError, TypeError) as e:
                errors.append({'transaction': trans, 'error': f'Invalid amount: {trans.get("amount", "missing")}'})
                continue
            description = trans.get('description', 'Transaction')
            # Validate amount (must be strictly positive)
            if amount <= 0:
                errors.append({'transaction': trans, 'error': f'Invalid amount: {amount}'})
                continue
            # Get category ID from mapping or bank category.
            # Precedence: bank-category mapping > per-index mapping > default.
            category_id = None
            bank_category = trans.get('bank_category')
            # Try to get from explicit mapping
            if bank_category and bank_category in category_mapping:
                category_id = int(category_mapping[bank_category])
            elif str(idx) in category_mapping:
                category_id = int(category_mapping[str(idx)])
            else:
                category_id = default_category_id
            # Security: Verify category belongs to user
            if category_id not in user_categories:
                errors.append({'transaction': trans, 'error': f'Invalid category ID: {category_id}'})
                continue
            # Prepare tags with bank category if available (kept for audit)
            tags = []
            if bank_category:
                tags.append(f'Import: {bank_category}')
            # Create expense (not yet committed; batched below)
            expense = Expense(
                user_id=current_user.id,
                category_id=category_id,
                amount=amount,
                description=description,
                date=trans_date,
                currency=current_user.currency,
                tags=json.dumps(tags)
            )
            db.session.add(expense)
            imported.append({
                'date': trans_date.isoformat(),
                'description': description,
                'amount': amount,
                'category': user_categories[category_id].name
            })
        except Exception as e:
            # Catch-all so one bad row cannot abort the batch
            errors.append({'transaction': trans, 'error': str(e)})
    # Commit all imports atomically; roll back everything on failure
    try:
        db.session.commit()
        return jsonify({
            'success': True,
            'imported_count': len(imported),
            'skipped_count': len(skipped),
            'error_count': len(errors),
            'imported': imported,
            'skipped': skipped,
            'errors': errors
        })
    except Exception as e:
        db.session.rollback()
        return jsonify({'success': False, 'error': f'Database error: {str(e)}'}), 500
@bp.route('/create-categories', methods=['POST'])
@login_required
def create_categories():
    """
    Create missing categories from CSV bank categories
    Security: Only creates for current user

    Request JSON: {"bank_categories": [name, ...]}. Names are matched
    case-insensitively against the user's existing categories; missing
    ones are created. Returns the created list and a name -> id mapping.
    """
    data = request.get_json()
    bank_categories = data.get('bank_categories', [])
    if not bank_categories:
        return jsonify({'success': False, 'error': 'No categories provided'}), 400
    # Case-insensitive index of the user's existing categories
    existing_cats = {cat.name.lower(): cat for cat in Category.query.filter_by(user_id=current_user.id).all()}
    created = []
    mapping = {}
    for bank_cat in bank_categories:
        if not bank_cat or not bank_cat.strip():
            continue
        bank_cat_clean = bank_cat.strip()
        bank_cat_lower = bank_cat_clean.lower()
        # Check if category already exists
        if bank_cat_lower in existing_cats:
            mapping[bank_cat] = existing_cats[bank_cat_lower].id
        else:
            # Create new category, appended after the current last one
            max_order = db.session.query(db.func.max(Category.display_order)).filter_by(user_id=current_user.id).scalar() or 0
            new_cat = Category(
                user_id=current_user.id,
                name=bank_cat_clean,
                icon='category',
                # Fix: derive the color from a *stable* digest of the name.
                # The previous hash()-based value changed on every process
                # restart because str hashing is randomized (PYTHONHASHSEED),
                # defeating the "color from name" intent.
                color='#' + hashlib.md5(bank_cat_clean.encode('utf-8')).hexdigest()[:6],
                display_order=max_order + 1
            )
            db.session.add(new_cat)
            db.session.flush()  # Get ID without committing
            created.append({
                'name': bank_cat_clean,
                'id': new_cat.id
            })
            mapping[bank_cat] = new_cat.id
            existing_cats[bank_cat_lower] = new_cat
    try:
        db.session.commit()
        return jsonify({
            'success': True,
            'created': created,
            'mapping': mapping,
            'message': f'Created {len(created)} new categories'
        })
    except Exception as e:
        db.session.rollback()
        return jsonify({'success': False, 'error': f'Failed to create categories: {str(e)}'}), 500
@bp.route('/suggest-category', methods=['POST'])
@login_required
def suggest_category():
    """
    Suggest category mapping based on description and existing expenses
    Uses simple keyword matching against the user's 100 most recent
    expenses; the category whose past descriptions share the most words
    with the new description wins.

    Returns suggested_category_id/name and a 0-100 confidence score.
    """
    data = request.get_json()
    description = data.get('description', '').lower()
    # NOTE(review): bank_category is read but not used in the scoring yet;
    # kept so the request contract is stable.
    bank_category = data.get('bank_category', '').lower()
    if not description:
        return jsonify({'success': False, 'error': 'No description provided'}), 400
    # Security: Get only user's categories
    user_categories = Category.query.filter_by(user_id=current_user.id).all()
    if not user_categories:
        # Fix: previously crashed here (max() of an empty dict, then an
        # IndexError on user_categories[0]) when the user had no categories.
        return jsonify({'success': False, 'error': 'No categories found. Please create categories first.'}), 400
    # Look for similar expenses in user's history (most recent first)
    similar_expenses = Expense.query.filter(
        Expense.user_id == current_user.id
    ).order_by(Expense.date.desc()).limit(100).all()
    # Score categories by shared-word counts with past descriptions
    category_scores = {cat.id: 0 for cat in user_categories}
    desc_words = set(description.split())  # invariant, hoisted out of the loop
    for expense in similar_expenses:
        # Fix: skip expenses whose category is not in the score map
        # (e.g. a null/stale category_id) instead of raising KeyError.
        if expense.category_id not in category_scores:
            continue
        exp_words = set(expense.description.lower().split())
        overlap = len(desc_words.intersection(exp_words))
        if overlap > 0:
            category_scores[expense.category_id] += overlap
    # Get best match
    if max(category_scores.values()) > 0:
        best_category_id = max(category_scores, key=category_scores.get)
        best_category = next(cat for cat in user_categories if cat.id == best_category_id)
        return jsonify({
            'success': True,
            'suggested_category_id': best_category.id,
            'suggested_category_name': best_category.name,
            'confidence': min(100, category_scores[best_category_id] * 20)
        })
    # No match found, return first category with zero confidence
    return jsonify({
        'success': True,
        'suggested_category_id': user_categories[0].id,
        'suggested_category_name': user_categories[0].name,
        'confidence': 0
    })