"""
Backfill OCR text for existing documents and receipts.

This will process all uploaded files that don't have OCR text yet.
"""
import sys
import os

# Make the application package importable when run as a standalone script.
sys.path.insert(0, '/app')

from app import create_app, db
from app.models import Document, Expense
from app.ocr import extract_text_from_file

app = create_app()

# File extensions handed to the OCR extractor; anything else is skipped.
SUPPORTED_OCR_TYPES = ('pdf', 'png', 'jpg', 'jpeg')


def process_documents():
    """Process all documents without OCR text.

    Queries every ``Document`` whose ``ocr_text`` is NULL or empty, runs
    ``extract_text_from_file`` on each supported file and commits the result
    per document. Documents that yield no text are stored with an empty
    string. Progress and a final summary are printed to stdout.
    """
    with app.app_context():
        # Find documents without OCR text (``== None`` is rendered as
        # ``IS NULL`` by SQLAlchemy, so it must not be rewritten to ``is``).
        documents = Document.query.filter(
            (Document.ocr_text == None) | (Document.ocr_text == '')  # noqa: E711
        ).all()

        print(f"\nFound {len(documents)} documents to process")

        processed = 0
        no_text = 0
        skipped = 0
        errors = 0

        for doc in documents:
            try:
                # Check if file type supports OCR; a missing type is treated
                # as unsupported instead of raising on ``None.lower()``.
                file_type = (doc.file_type or '').lower()
                if file_type not in SUPPORTED_OCR_TYPES:
                    print(f"⊘ Skipping {doc.original_filename} - {doc.file_type} not supported for OCR")
                    skipped += 1
                    continue

                # Get absolute file path
                file_path = os.path.abspath(doc.file_path)

                if not os.path.exists(file_path):
                    print(f"✗ File not found: {doc.original_filename}")
                    errors += 1
                    continue

                print(f"Processing: {doc.original_filename}...", end=' ')

                # Extract OCR text
                ocr_text = extract_text_from_file(file_path, doc.file_type)

                if ocr_text:
                    doc.ocr_text = ocr_text
                    db.session.commit()
                    print(f"✓ Extracted {len(ocr_text)} characters")
                    processed += 1
                else:
                    print("⊘ No text found")
                    # Still update to empty string to mark as processed
                    doc.ocr_text = ""
                    db.session.commit()
                    no_text += 1

            except Exception as e:
                # Discard the failed transaction so one bad record does not
                # leave the session unusable for the remaining documents.
                db.session.rollback()
                print(f"✗ Error: {str(e)}")
                errors += 1

        print(f"\n✓ Documents processed: {processed}")
        print(f"⊘ Documents with no text: {no_text}")
        print(f"⊘ Documents skipped (unsupported type): {skipped}")
        print(f"✗ Errors: {errors}")


def process_receipts():
    """Process all expense receipts without OCR text.

    Queries every ``Expense`` that has a ``receipt_path`` but whose
    ``receipt_ocr_text`` is NULL or empty, resolves the receipt under
    ``/app/uploads``, runs ``extract_text_from_file`` on supported files and
    commits the result per expense. Receipts that yield no text are stored
    with an empty string. Progress and a final summary are printed to stdout.
    """
    with app.app_context():
        # Find expenses with receipts but no OCR text
        expenses = Expense.query.filter(
            Expense.receipt_path != None,  # noqa: E711
            (Expense.receipt_ocr_text == None) | (Expense.receipt_ocr_text == '')  # noqa: E711
        ).all()

        print(f"\nFound {len(expenses)} receipts to process")

        processed = 0
        no_text = 0
        skipped = 0
        errors = 0

        for expense in expenses:
            try:
                # Build absolute path: stored paths carry an '/uploads/'
                # URL prefix that is stripped before joining onto the
                # on-disk uploads directory.
                receipt_path = expense.receipt_path.replace('/uploads/', '').lstrip('/')
                file_path = os.path.abspath(os.path.join('/app', 'uploads', receipt_path))

                if not os.path.exists(file_path):
                    print(f"✗ Receipt not found for: {expense.description}")
                    errors += 1
                    continue

                # Get file extension
                file_ext = file_path.rsplit('.', 1)[1].lower() if '.' in file_path else ''

                if file_ext not in SUPPORTED_OCR_TYPES:
                    print(f"⊘ Skipping receipt for {expense.description} - {file_ext} not supported")
                    skipped += 1
                    continue

                print(f"Processing receipt for: {expense.description}...", end=' ')

                # Extract OCR text
                ocr_text = extract_text_from_file(file_path, file_ext)

                if ocr_text:
                    expense.receipt_ocr_text = ocr_text
                    db.session.commit()
                    print(f"✓ Extracted {len(ocr_text)} characters")
                    processed += 1
                else:
                    print("⊘ No text found")
                    # Store an empty string so the summary can distinguish
                    # "tried, nothing found" from "never attempted" (NULL).
                    expense.receipt_ocr_text = ""
                    db.session.commit()
                    no_text += 1

            except Exception as e:
                # Discard the failed transaction so one bad record does not
                # leave the session unusable for the remaining receipts.
                db.session.rollback()
                print(f"✗ Error: {str(e)}")
                errors += 1

        print(f"\n✓ Receipts processed: {processed}")
        print(f"⊘ Receipts with no text: {no_text}")
        print(f"⊘ Receipts skipped (unsupported type): {skipped}")
        print(f"✗ Errors: {errors}")


if __name__ == '__main__':
    print("=" * 60)
    print("OCR BACKFILL - Processing existing files")
    print("=" * 60)

    process_documents()
    process_receipts()

    print("\n" + "=" * 60)
    print("✓ OCR backfill completed!")
    print("=" * 60)