fina/app/ocr.py

174 lines
5 KiB
Python
Raw Normal View History

2025-12-26 00:52:56 +00:00
"""
OCR Processing Utility
Extracts text from images and PDFs for searchability
Security: All file paths validated before processing
"""
import os
import tempfile
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import cv2
import numpy as np
def preprocess_image(image):
    """
    Preprocess an image to improve OCR accuracy.

    Steps:
    - Normalize the PIL mode to RGB (unless it is already RGB or L)
    - Convert to grayscale
    - Apply adaptive thresholding
    - Denoise

    Args:
        image: PIL Image to clean up. Objects without a ``mode`` attribute
            skip the mode normalization and are handed to ``np.array``
            directly.

    Returns:
        A new binarized, denoised PIL Image, or the original ``image``
        object unchanged if any preprocessing step fails.
    """
    try:
        source = image

        # Palette ("P") images expose palette indices rather than
        # intensities, and modes such as "1", "LA" or "CMYK" produce arrays
        # that OpenCV either rejects or misreads as RGB(A). Normalizing to
        # RGB first keeps the grayscale conversion below meaningful.
        mode = getattr(source, "mode", None)
        if mode is not None and mode not in ("RGB", "L"):
            source = source.convert("RGB")

        # Convert PIL Image to numpy array
        img_array = np.array(source)

        # Convert to grayscale (2-D arrays are treated as already gray)
        if img_array.ndim == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Adaptive thresholding: Gaussian-weighted 11x11 neighbourhood,
        # constant 2 subtracted from the local mean
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Denoise: filter strength 10, template window 7, search window 21
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        # Convert back to PIL Image
        return Image.fromarray(denoised)
    except Exception as e:
        print(f"Error preprocessing image: {str(e)}")
        # Best effort: hand back the caller's original, untouched image so
        # OCR can still be attempted on it
        return image
def extract_text_from_image(image_path):
    """
    Extract text from an image file using OCR.

    Supports: PNG, JPG, JPEG
    Security: Only checks that the path exists; it performs no other
    validation of the path.

    Args:
        image_path: Path to the image file to read.

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string if the file is missing or any step raises.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            return ""

        # Use the image as a context manager so the underlying file handle
        # is released even if preprocessing or OCR raises. OCR must run
        # inside the block: when preprocessing fails, preprocess_image
        # returns this very image object.
        with Image.open(image_path) as image:
            preprocessed = preprocess_image(image)

            # Extract text using Tesseract with English + Romanian
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',  # Support both English and Romanian
                config='--psm 6',  # Assume uniform block of text
            )

        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {str(e)}")
        return ""
def extract_text_from_pdf(pdf_path, *, max_pages=10, dpi=300):
    """
    Extract text from a PDF file using OCR.

    Converts PDF pages to images, then applies OCR to each page.
    Security: Only checks that the path exists; it performs no other
    validation of the path.

    Args:
        pdf_path: Path to the PDF file to read.
        max_pages: Number of the last page to convert, counting from 1
            (default 10, to limit memory use). Passed to pdf2image as
            ``last_page``; ``None`` is expected to mean "through the final
            page" per pdf2image's documented default.
        dpi: Rendering resolution used when rasterizing pages (default 300).

    Returns:
        Text of every page that produced non-empty OCR output, each prefixed
        with a ``--- Page N ---`` header and separated by blank lines, or an
        empty string if the file is missing or any step raises.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(pdf_path):
            print(f"PDF file not found: {pdf_path}")
            return ""

        # Convert PDF to images (capped at max_pages to avoid memory issues)
        pages = convert_from_path(
            pdf_path, first_page=1, last_page=max_pages, dpi=dpi
        )

        extracted_text = []
        for page_number, page in enumerate(pages, start=1):
            # Preprocess page
            preprocessed = preprocess_image(page)

            # Extract text (English + Romanian, uniform block of text)
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',
                config='--psm 6',
            ).strip()

            # Pages with no recognized text are omitted entirely
            if text:
                extracted_text.append(f"--- Page {page_number} ---\n{text}")

        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
        return ""
def extract_text_from_file(file_path, file_type):
    """
    Dispatch OCR text extraction based on the declared file type.

    Security: The path must be absolute and must exist before any extractor
    is invoked.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg); case-insensitive,
            leading/trailing dots are ignored

    Returns:
        Extracted text, or an empty string when the path is rejected, the
        type is unsupported, or anything raises along the way
    """
    try:
        # Security: work out why (if at all) the path must be rejected
        rejection = None
        if not os.path.isabs(file_path):
            rejection = f"Invalid file path (not absolute): {file_path}"
        elif not os.path.exists(file_path):
            rejection = f"File not found: {file_path}"

        if rejection is not None:
            print(rejection)
            return ""

        # Normalize file type, e.g. ".PDF" -> "pdf"
        kind = file_type.lower().strip('.')

        # Route to appropriate extractor
        if kind == 'pdf':
            return extract_text_from_pdf(file_path)
        if kind in ('png', 'jpg', 'jpeg'):
            return extract_text_from_image(file_path)

        # Anything else is not something we can OCR
        print(f"Unsupported file type for OCR: {kind}")
        return ""
    except Exception as e:
        print(f"Error in extract_text_from_file: {str(e)}")
        return ""
def process_ocr_async(file_path, file_type):
    """
    Wrapper that packages OCR output into a result dictionary.

    Despite the name, this function runs synchronously; it is intended to be
    called from a background job if needed.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg)

    Returns:
        Dictionary with keys:
        - 'success': True when extraction returned, False when it raised
        - 'text': extracted text ('' on failure)
        - 'length': number of characters in 'text' (0 on failure)
        - 'error': exception message (present only on failure)

        Note: extract_text_from_file reports its own errors by returning an
        empty string, so 'success' is True even when no text was extracted.
    """
    try:
        text = extract_text_from_file(file_path, file_type)
        return {
            'success': True,
            'text': text,
            'length': len(text),
        }
    except Exception as e:
        # Keep the same keys as the success result so callers can read
        # 'length' without checking 'success' first
        return {
            'success': False,
            'error': str(e),
            'text': '',
            'length': 0,
        }