Initial commit
This commit is contained in:
commit
983cee0320
322 changed files with 57174 additions and 0 deletions
173
app/ocr.py
Normal file
173
app/ocr.py
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
"""
|
||||
OCR Processing Utility
|
||||
Extracts text from images and PDFs for searchability
|
||||
Security: All file paths validated before processing
|
||||
"""
|
||||
import os
|
||||
import tempfile
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
def preprocess_image(image):
    """
    Preprocess an image to improve OCR accuracy.

    Steps:
    - Normalise unusual PIL colour modes to RGB
    - Convert to grayscale
    - Apply adaptive thresholding
    - Denoise

    Args:
        image: A PIL image (or any object ``np.array`` can turn into a
            2-D grayscale / 3-D colour pixel array).

    Returns:
        A new PIL image holding the binarised, denoised result. If any
        step fails, the error is printed and the original ``image`` object
        is returned unchanged so the caller can still attempt OCR on it.
    """
    try:
        # Palette ("P"), bilevel ("1"), CMYK, "LA", ... images do not map
        # onto the gray / RGB array layouts the steps below assume: a
        # palette image would otherwise be thresholded on its palette
        # *indices*, and a CMYK image would be read as if it were RGBA.
        # L, RGB and RGBA inputs (and non-PIL inputs without a ``mode``
        # attribute) take exactly the same path as before.
        mode = getattr(image, "mode", None)
        if mode is not None and mode not in ("L", "RGB", "RGBA"):
            source = image.convert("RGB")
        else:
            source = image

        # Convert PIL Image to numpy array
        img_array = np.array(source)

        # Convert to grayscale (a 2-D array is already single-channel)
        if img_array.ndim == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Adaptive threshold: Gaussian-weighted 11x11 neighbourhood,
        # constant 2 subtracted from the local mean, output values 0/255.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Non-local-means denoise: filter strength h=10, 7px template
        # window, 21px search window.
        denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

        # Convert back to PIL Image
        return Image.fromarray(denoised)
    except Exception as e:
        # Best-effort step: preprocessing problems must never block OCR.
        print(f"Error preprocessing image: {str(e)}")
        # Return original image if preprocessing fails
        return image
|
||||
|
||||
|
||||
def extract_text_from_image(image_path):
    """
    Extract text from an image file using OCR.

    Supports: PNG, JPG, JPEG

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string if the file does not exist or any step fails (the
        reason is printed).
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            return ""

        # Open the image in a context manager so the underlying file
        # handle is always released. OCR runs inside the block because
        # preprocess_image() may hand back the original (lazily loaded)
        # image when preprocessing fails.
        with Image.open(image_path) as image:
            preprocessed = preprocess_image(image)

            # Extract text using Tesseract with English + Romanian
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',   # Support both English and Romanian
                config='--psm 6'  # Assume uniform block of text
            )

        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {str(e)}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_path, *, max_pages=10, dpi=300):
    """
    Extract text from a PDF file using OCR.

    Renders PDF pages to images, then applies OCR to each page.

    Args:
        pdf_path: Path to the PDF file on disk.
        max_pages: Highest page number to process (pages 1..max_pages).
            Pages beyond this limit are silently ignored; the default of
            10 keeps memory usage bounded.
        dpi: Resolution used when rendering pages to images.

    Returns:
        The text of every page that produced any, each prefixed with a
        "--- Page N ---" header and joined by blank lines. Returns an
        empty string if the file does not exist, no page was requested,
        no text was found, or any step fails (the reason is printed).
    """
    try:
        # Security: Validate file exists
        if not os.path.exists(pdf_path):
            print(f"PDF file not found: {pdf_path}")
            return ""

        # Nothing to render when no page is requested.
        if max_pages < 1:
            return ""

        # Render only the requested leading pages; all rendered pages are
        # held in memory at once, hence the cap.
        pages = convert_from_path(
            pdf_path, first_page=1, last_page=max_pages, dpi=dpi
        )

        extracted_text = []
        for page_number, page in enumerate(pages, start=1):
            # Preprocess page
            preprocessed = preprocess_image(page)

            # Extract text (English + Romanian, uniform block of text)
            text = pytesseract.image_to_string(
                preprocessed,
                lang='eng+ron',
                config='--psm 6'
            ).strip()

            # Pages without any recognised text are left out entirely.
            if text:
                extracted_text.append(f"--- Page {page_number} ---\n{text}")

        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_path}: {str(e)}")
        return ""
|
||||
|
||||
|
||||
def extract_text_from_file(file_path, file_type):
    """
    Run OCR on a file, choosing the extractor from its declared type.

    The path must be absolute and must exist; the type is compared
    case-insensitively with any leading/trailing dots removed.

    Args:
        file_path: Absolute path to the file
        file_type: File extension (pdf, png, jpg, jpeg)

    Returns:
        Extracted text, or an empty string when the path is rejected, the
        type is unsupported, or an error occurs (the reason is printed).
    """
    try:
        # Security: only absolute paths are accepted
        path_is_absolute = os.path.isabs(file_path)
        if not path_is_absolute:
            print(f"Invalid file path (not absolute): {file_path}")
            return ""

        path_is_present = os.path.exists(file_path)
        if not path_is_present:
            print(f"File not found: {file_path}")
            return ""

        # Normalise e.g. ".PDF" -> "pdf"
        extension = file_type.lower().strip('.')

        # Hand off to the extractor that matches the extension
        if extension == 'pdf':
            result = extract_text_from_pdf(file_path)
        elif extension in ('png', 'jpg', 'jpeg'):
            result = extract_text_from_image(file_path)
        else:
            print(f"Unsupported file type for OCR: {extension}")
            result = ""
        return result
    except Exception as exc:
        print(f"Error in extract_text_from_file: {exc}")
        return ""
|
||||
|
||||
|
||||
def process_ocr_async(file_path, file_type):
    """
    Run OCR on a file and package the outcome as a dictionary.

    Intended as the entry point for background jobs. The function itself
    is an ordinary synchronous call.

    Args:
        file_path: Path handed to extract_text_from_file
        file_type: File extension handed to extract_text_from_file

    Returns:
        On normal completion:
            {'success': True, 'text': <text>, 'length': <len(text)>}
        If an exception escapes the extraction call:
            {'success': False, 'error': <message>, 'text': ''}

    Note: extract_text_from_file reports its own failures by returning an
    empty string, so an empty 'text' with 'success': True can also mean
    that extraction failed.
    """
    try:
        extracted = extract_text_from_file(file_path, file_type)
        outcome = {
            'success': True,
            'text': extracted,
            'length': len(extracted),
        }
    except Exception as exc:
        outcome = {
            'success': False,
            'error': str(exc),
            'text': '',
        }
    return outcome
|
||||
Loading…
Add table
Add a link
Reference in a new issue