Initial commit
commit 983cee0320
322 changed files with 57174 additions and 0 deletions
backup/first -fina app/app/bank_import.py (480 lines, normal file)
@@ -0,0 +1,480 @@
"""
|
||||
Bank Statement Import Module for FINA Finance Tracker
|
||||
Parses PDF and CSV bank statements and extracts transactions
|
||||
"""
|
||||
import re
|
||||
import csv
|
||||
import io
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
import PyPDF2
|
||||
|
||||
|
||||
class BankStatementParser:
|
||||
"""Base parser class for bank statements"""
|
||||
|
||||
def __init__(self):
|
||||
self.transactions = []
|
||||
self.detected_format = None
|
||||
self.total_transactions = 0
|
||||
self.parse_errors = []
|
||||
|
||||
    def parse(self, file_content, file_type):
        """
        Main parse method - detects format and extracts transactions

        Args:
            file_content: File content (bytes for PDF, string for CSV)
            file_type: 'pdf' or 'csv'

        Returns:
            dict with transactions and metadata
        """
        if file_type == 'pdf':
            return self.parse_pdf(file_content)
        elif file_type == 'csv':
            return self.parse_csv(file_content)
        else:
            return {'success': False, 'error': 'Unsupported file type'}

    def parse_pdf(self, pdf_bytes):
        """
        Parse PDF bank statement
        Extracts transactions using pattern matching
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            text = ""

            # Extract text from all pages
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"

            # Detect bank format
            bank_format = self.detect_bank_format(text)

            # Parse transactions based on detected format
            if bank_format == 'generic':
                transactions = self.parse_generic_pdf(text)
            else:
                transactions = self.parse_generic_pdf(text)  # Fallback to generic

            return {
                'success': True,
                'transactions': transactions,
                'total_found': len(transactions),
                'bank_format': bank_format,
                'parse_errors': self.parse_errors
            }

        except Exception as e:
            return {
                'success': False,
                'error': f'PDF parsing failed: {str(e)}',
                'transactions': []
            }

    def parse_csv(self, csv_string):
        """
        Parse CSV bank statement
        Auto-detects column mapping
        """
        try:
            # Try different delimiters
            delimiter = self.detect_csv_delimiter(csv_string)

            stream = io.StringIO(csv_string)
            csv_reader = csv.DictReader(stream, delimiter=delimiter)

            # Auto-detect column names
            fieldnames = csv_reader.fieldnames
            column_map = self.detect_csv_columns(fieldnames)

            transactions = []
            row_num = 0

            for row in csv_reader:
                row_num += 1
                try:
                    transaction = self.extract_transaction_from_csv_row(row, column_map)
                    if transaction:
                        transactions.append(transaction)
                except Exception as e:
                    self.parse_errors.append(f"Row {row_num}: {str(e)}")

            return {
                'success': True,
                'transactions': transactions,
                'total_found': len(transactions),
                'column_mapping': column_map,
                'parse_errors': self.parse_errors
            }

        except Exception as e:
            return {
                'success': False,
                'error': f'CSV parsing failed: {str(e)}',
                'transactions': []
            }

    def detect_bank_format(self, text):
        """Detect which bank format the PDF uses"""
        text_lower = text.lower()

        # Add patterns for specific banks
        if 'revolut' in text_lower:
            return 'revolut'
        # Match 'ing' as a whole word so words like 'closing' or 'pending'
        # do not trigger a false positive
        elif re.search(r'\bing\b', text_lower) or 'ing bank' in text_lower:
            return 'ing'
        elif 'bcr' in text_lower or 'banca comercială' in text_lower:
            return 'bcr'
        elif 'brd' in text_lower:
            return 'brd'
        else:
            return 'generic'

    def parse_generic_pdf(self, text):
        """
        Parse PDF using generic patterns
        Looks for common transaction patterns across banks
        """
        transactions = []
        lines = text.split('\n')

        # Common patterns for transactions
        # Date patterns: DD/MM/YYYY, DD-MM-YYYY, YYYY-MM-DD
        date_patterns = [
            r'(\d{2}[/-]\d{2}[/-]\d{4})',  # DD/MM/YYYY or DD-MM-YYYY
            r'(\d{4}[/-]\d{2}[/-]\d{2})',  # YYYY-MM-DD
        ]

        # Amount patterns: -123.45, 123.45, 123,45, -123,45
        amount_patterns = [
            r'[-]?\d{1,10}[.,]\d{2}',  # With 2 decimals
            r'[-]?\d{1,10}\s*(?:RON|EUR|USD|GBP|LEI)',  # With currency
        ]

        for i, line in enumerate(lines):
            # Skip header lines
            if any(word in line.lower() for word in ['sold', 'balance', 'iban', 'account', 'statement']):
                continue

            # Look for date in line
            date_match = None
            for pattern in date_patterns:
                match = re.search(pattern, line)
                if match:
                    date_match = match.group(1)
                    break

            if not date_match:
                continue

            # Parse date
            trans_date = self.parse_date(date_match)
            if not trans_date:
                continue

            # Look for amount in this line and nearby lines
            amount = None
            description = line

            # Check current line and next 2 lines for amount
            for j in range(i, min(i + 3, len(lines))):
                amounts_found = re.findall(r'[-]?\d{1,10}[.,]\d{2}', lines[j])
                if amounts_found:
                    # Take the last amount (usually the transaction amount)
                    amount_str = amounts_found[-1]
                    amount = self.parse_amount(amount_str)
                    break

            if not amount or amount == 0:
                continue

            # Clean description
            description = self.clean_description(line, date_match, str(amount))

            if description:
                transactions.append({
                    'date': trans_date,
                    'description': description,
                    'amount': abs(amount),  # Always positive, type determined by sign
                    'type': 'expense' if amount < 0 else 'income',
                    'original_amount': amount
                })

        # Deduplicate based on date + amount + description similarity
        transactions = self.deduplicate_transactions(transactions)

        return transactions

    def detect_csv_delimiter(self, csv_string):
        """Detect CSV delimiter (comma, semicolon, tab)"""
        first_line = csv_string.split('\n')[0]

        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        tab_count = first_line.count('\t')

        if semicolon_count > comma_count and semicolon_count > tab_count:
            return ';'
        elif tab_count > comma_count:
            return '\t'
        else:
            return ','

    def detect_csv_columns(self, fieldnames):
        """
        Auto-detect which columns contain date, description, amount
        Returns a mapping of the detected column names
        """
        fieldnames_lower = [f.lower() if f else '' for f in fieldnames]

        column_map = {
            'date': None,
            'description': None,
            'amount': None,
            'debit': None,
            'credit': None
        }

        # Date column keywords
        date_keywords = ['date', 'data', 'fecha', 'datum', 'transaction date']
        for idx, name in enumerate(fieldnames_lower):
            if any(keyword in name for keyword in date_keywords):
                column_map['date'] = fieldnames[idx]
                break

        # Description column keywords
        desc_keywords = ['description', 'descriere', 'descripción', 'details', 'detalii', 'merchant', 'comerciant']
        for idx, name in enumerate(fieldnames_lower):
            if any(keyword in name for keyword in desc_keywords):
                column_map['description'] = fieldnames[idx]
                break

        # Amount columns
        amount_keywords = ['amount', 'suma', 'monto', 'valoare']
        debit_keywords = ['debit', 'withdrawal', 'retragere', 'retiro', 'spent']
        credit_keywords = ['credit', 'deposit', 'depunere', 'ingreso', 'income']

        for idx, name in enumerate(fieldnames_lower):
            if any(keyword in name for keyword in amount_keywords):
                column_map['amount'] = fieldnames[idx]
            elif any(keyword in name for keyword in debit_keywords):
                column_map['debit'] = fieldnames[idx]
            elif any(keyword in name for keyword in credit_keywords):
                column_map['credit'] = fieldnames[idx]

        return column_map

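    # Illustrative note (not part of the original commit): for a header row such as
    # "Date,Description,Amount,Currency", detect_csv_columns would return
    # {'date': 'Date', 'description': 'Description', 'amount': 'Amount',
    #  'debit': None, 'credit': None}; separate Debit/Credit columns populate
    # the 'debit'/'credit' keys instead of 'amount'.
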
    def extract_transaction_from_csv_row(self, row, column_map):
        """Extract transaction data from CSV row using column mapping"""
        # Get date
        date_col = column_map.get('date')
        if not date_col or date_col not in row:
            return None

        trans_date = self.parse_date(row[date_col])
        if not trans_date:
            return None

        # Get description
        desc_col = column_map.get('description')
        description = row.get(desc_col, 'Transaction') if desc_col else 'Transaction'

        # Get amount
        amount = 0
        trans_type = 'expense'

        # Check if we have separate debit/credit columns
        if column_map.get('debit') and column_map.get('credit'):
            debit_val = self.parse_amount(row.get(column_map['debit'], '0'))
            credit_val = self.parse_amount(row.get(column_map['credit'], '0'))

            if debit_val > 0:
                amount = debit_val
                trans_type = 'expense'
            elif credit_val > 0:
                amount = credit_val
                trans_type = 'income'
        elif column_map.get('amount'):
            amount_val = self.parse_amount(row.get(column_map['amount'], '0'))
            amount = abs(amount_val)
            trans_type = 'expense' if amount_val < 0 else 'income'

        if amount == 0:
            return None

        return {
            'date': trans_date,
            'description': description.strip(),
            'amount': amount,
            'type': trans_type
        }

    def parse_date(self, date_str):
        """Parse date string in various formats"""
        date_str = date_str.strip()

        # Try different date formats
        formats = [
            '%d/%m/%Y',
            '%d-%m-%Y',
            '%Y-%m-%d',
            '%Y/%m/%d',
            '%d.%m.%Y',
            '%m/%d/%Y',
            '%d %b %Y',
            '%d %B %Y'
        ]

        for fmt in formats:
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue

        return None

    def parse_amount(self, amount_str):
        """Parse amount string to float"""
        if not amount_str:
            return 0.0

        # Remove currency symbols and whitespace
        amount_str = str(amount_str).strip()
        amount_str = re.sub(r'[^\d.,-]', '', amount_str)

        if not amount_str:
            return 0.0

        # Handle comma as decimal separator (European format)
        if ',' in amount_str and '.' in amount_str:
            # Format: 1.234,56 -> remove dots, replace comma with dot
            amount_str = amount_str.replace('.', '').replace(',', '.')
        elif ',' in amount_str:
            # Format: 1234,56 -> replace comma with dot
            amount_str = amount_str.replace(',', '.')

        try:
            return float(amount_str)
        except ValueError:
            return 0.0

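    # Illustrative behaviour (not part of the original commit), traced against the
    # logic above: '1.234,56' -> 1234.56, '-1.234,56 RON' -> -1234.56,
    # '1234,56' -> 1234.56, '$123.45' -> 123.45. A comma used only as a thousands
    # separator (e.g. '1,234') is still read as a decimal comma and yields 1.234.
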
    def clean_description(self, text, date_str, amount_str):
        """Clean transaction description by removing date and amount"""
        # Remove date
        text = text.replace(date_str, '')
        # Remove amount
        text = text.replace(amount_str, '')
        # Remove extra whitespace
        text = ' '.join(text.split())
        # Remove common keywords
        remove_words = ['transaction', 'payment', 'transfer', 'tranzactie', 'plata']
        for word in remove_words:
            text = re.sub(word, '', text, flags=re.IGNORECASE)

        text = text.strip()

        # If too short, return generic
        if len(text) < 3:
            return 'Bank Transaction'

        return text[:200]  # Limit length

    def deduplicate_transactions(self, transactions):
        """Remove duplicate transactions"""
        seen = set()
        unique = []

        for trans in transactions:
            # Create signature: date + amount + first 20 chars of description
            signature = (
                trans['date'].isoformat(),
                round(trans['amount'], 2),
                trans['description'][:20].lower()
            )

            if signature not in seen:
                seen.add(signature)
                unique.append(trans)

        return unique

    def validate_file(self, file_content, file_type, max_size_mb=10):
        """
        Validate uploaded file

        Args:
            file_content: File content bytes
            file_type: 'pdf' or 'csv'
            max_size_mb: Maximum file size in MB

        Returns:
            (is_valid, error_message)
        """
        # Check file size
        size_mb = len(file_content) / (1024 * 1024)
        if size_mb > max_size_mb:
            return False, f'File too large. Maximum size is {max_size_mb}MB'

        # Check file type
        if file_type == 'pdf':
            # Check PDF header
            if not file_content.startswith(b'%PDF'):
                return False, 'Invalid PDF file'
        elif file_type == 'csv':
            # Try to decode as text
            try:
                file_content.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    file_content.decode('latin-1')
                except UnicodeDecodeError:
                    return False, 'Invalid CSV file encoding'
        else:
            return False, 'Unsupported file type. Use PDF or CSV'

        return True, None


def parse_bank_statement(file_content, filename):
    """
    Main entry point for bank statement parsing

    Args:
        file_content: File content as bytes
        filename: Original filename

    Returns:
        Parse results dictionary
    """
    parser = BankStatementParser()

    # Determine file type
    file_ext = filename.lower().split('.')[-1]
    if file_ext == 'pdf':
        file_type = 'pdf'
        content = file_content
    elif file_ext == 'csv':
        file_type = 'csv'
        # Try to decode
        try:
            content = file_content.decode('utf-8')
        except UnicodeDecodeError:
            content = file_content.decode('latin-1', errors='ignore')
    else:
        return {
            'success': False,
            'error': 'Unsupported file type. Please upload PDF or CSV files.'
        }

    # Validate file
    is_valid, error_msg = parser.validate_file(file_content, file_type)
    if not is_valid:
        return {'success': False, 'error': error_msg}

    # Parse file
    result = parser.parse(content, file_type)

    return result
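A minimal usage sketch, not part of the committed module: it assumes the package is importable as app.bank_import (the import path is an assumption based on the file location) and that the caller reads the uploaded statement as raw bytes. parse_bank_statement returns the result dictionary described in its docstring.

# Hypothetical caller, for illustration only
from app.bank_import import parse_bank_statement  # import path is an assumption

with open('statement.csv', 'rb') as f:
    raw = f.read()

result = parse_bank_statement(raw, 'statement.csv')
if result['success']:
    for t in result['transactions']:
        print(t['date'], t['type'], t['amount'], t['description'])
else:
    print('Import failed:', result['error'])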