173 lines
5 KiB
Python
173 lines
5 KiB
Python
"""
|
|
OCR Processing Utility
|
|
Extracts text from images and PDFs for searchability
|
|
Security: All file paths validated before processing
|
|
"""
|
|
import os
|
|
import tempfile
|
|
from PIL import Image
|
|
import pytesseract
|
|
from pdf2image import convert_from_path
|
|
import cv2
|
|
import numpy as np
|
|
|
|
|
|
def preprocess_image(image):
    """
    Preprocess an image to improve OCR accuracy.

    Steps:
    - Normalize the colour mode (anything other than "L"/"RGB" -> "RGB")
    - Convert to grayscale
    - Apply adaptive thresholding
    - Denoise

    Args:
        image: A PIL Image, or any object ``np.array`` can turn into an
            image array. Inputs without a ``mode`` attribute skip the
            mode normalization step.

    Returns:
        A new PIL Image holding the cleaned-up result, or the original
        ``image`` object unchanged if any step fails.
    """
    try:
        # Only plain "L" and "RGB" images map onto arrays that the steps
        # below interpret correctly. A palette ("P") image would otherwise
        # expose palette indices instead of intensities, and modes such as
        # "1" or "LA" yield arrays the OpenCV calls are expected to reject.
        source = image
        mode = getattr(image, "mode", None)
        if mode is not None and mode not in ("L", "RGB"):
            source = image.convert("RGB")

        # Convert PIL Image to numpy array
        img_array = np.array(source)

        # Convert to grayscale (2-D arrays are already single-channel)
        if img_array.ndim == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Adaptive thresholding: Gaussian-weighted 11x11 neighbourhood,
        # constant 2 subtracted from the local mean, output in {0, 255}.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Denoise: filter strength 10, template window 7, search window 21
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        # Convert back to PIL Image
        return Image.fromarray(denoised)
    except Exception as e:
        print(f"Error preprocessing image: {str(e)}")
        # Best effort: hand back the untouched input so OCR can still run
        return image
|
|
|
|
|
|
def extract_text_from_image(image_path):
    """
    Extract text from an image file using OCR.

    Supports: PNG, JPG, JPEG

    Security: Checks that the path exists before opening it.

    Args:
        image_path: Path to the image file.

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string if the file is missing or any step fails.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            return ""

        # Open the image in a context manager so the underlying file handle
        # is released even when preprocessing or OCR raises. OCR runs inside
        # the block because preprocess_image may return the original image.
        with Image.open(image_path) as image:
            preprocessed = preprocess_image(image)

            # Extract text using Tesseract with English + Romanian
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',  # Support both English and Romanian
                config='--psm 6',  # Assume uniform block of text
            )

        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {str(e)}")
        return ""
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path, max_pages=10, dpi=300):
    """
    Extract text from a PDF file using OCR.

    Converts PDF pages to images, then applies OCR to each page.

    Security: Checks that the path exists before converting it.

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Maximum number of leading pages to OCR. Defaults to 10
            to keep memory use bounded; values below 1 yield "".
        dpi: Rendering resolution passed to the PDF-to-image conversion.
            Defaults to 300.

    Returns:
        Text of all non-empty pages, each prefixed with a
        "--- Page N ---" header and separated by blank lines, or an empty
        string if the file is missing or any step fails.
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(pdf_path):
            print(f"PDF file not found: {pdf_path}")
            return ""

        # Nothing to render when the caller asks for no pages
        if max_pages < 1:
            return ""

        # Convert PDF to images (capped at max_pages to avoid memory issues)
        pages = convert_from_path(
            pdf_path, first_page=1, last_page=max_pages, dpi=dpi
        )

        extracted_text = []
        for page_number, page in enumerate(pages, start=1):
            # Preprocess page
            preprocessed = preprocess_image(page)

            # Extract text
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',
                config='--psm 6',
            ).strip()

            # Skip pages where OCR found nothing
            if text:
                extracted_text.append(f"--- Page {page_number} ---\n{text}")

        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
        return ""
|
|
|
|
|
|
def extract_text_from_file(file_path, file_type):
    """
    Dispatch a file to the matching OCR extractor.

    Security: Rejects non-absolute or missing paths before any processing.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg); case-insensitive,
            leading/trailing dots are ignored

    Returns:
        Extracted text or empty string on failure
    """
    try:
        # Security: only absolute paths are accepted
        path_is_absolute = os.path.isabs(file_path)
        if not path_is_absolute:
            print(f"Invalid file path (not absolute): {file_path}")
            return ""

        # Security: the path must point at something that exists
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return ""

        # Normalize the extension, e.g. ".PDF" -> "pdf"
        kind = file_type.lower().strip('.')

        # Pick the extractor for this kind of file
        if kind == 'pdf':
            extractor = extract_text_from_pdf
        elif kind in ('png', 'jpg', 'jpeg'):
            extractor = extract_text_from_image
        else:
            print(f"Unsupported file type for OCR: {kind}")
            return ""

        return extractor(file_path)
    except Exception as e:
        print(f"Error in extract_text_from_file: {str(e)}")
        return ""
|
|
|
|
|
|
def process_ocr_async(file_path, file_type):
    """
    Run OCR on a file and report the outcome as a dictionary.

    Intended as a job-friendly wrapper (e.g. for background workers); the
    call itself is synchronous.

    Returns:
        On success: {'success': True, 'text': <str>, 'length': <int>}
        On error:   {'success': False, 'error': <str>, 'text': ''}

        NOTE(review): extract_text_from_file catches its own errors and
        returns "", so 'success' is normally True even when OCR failed;
        an empty 'text' is the practical failure signal.
    """
    try:
        ocr_text = extract_text_from_file(file_path, file_type)
        outcome = {'success': True}
        outcome['text'] = ocr_text
        outcome['length'] = len(ocr_text)
        return outcome
    except Exception as e:
        failure = {'success': False}
        failure['error'] = str(e)
        failure['text'] = ''
        return failure
|