This guide provides practical instructions for uploading, processing, and extracting data from documents using ObjDocument - a comprehensive document management system.
ObjDocument provides powerful document processing capabilities:
sudo apt install poppler-utils # For PDF processing (pdftoppm; also required by pdf2image)
sudo apt install tesseract-ocr # For OCR
sudo apt install tesseract-ocr-deu tesseract-ocr-fra # Language data for the German/French OCR examples
# pdf2image and reportlab are used by the scanned-PDF and PDF-generation examples
pip install pytesseract PyPDF2 Pillow pdf2image reportlab
import PyPDF2
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every page concatenated in page order, with no
        separator between pages. A page that yields no text contributes
        an empty string.
    """
    with open(file_path, 'rb') as file:
        # PdfReader / .pages / extract_text() replace PdfFileReader /
        # numPages / getPage() / extractText(), which PyPDF2 3.0 removed.
        pdf_reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page.
        text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    return text
# Example: extract the text of document.pdf and print it
pdf_text = extract_text_from_pdf('document.pdf')
print(pdf_text)
import pytesseract
from PIL import Image
def ocr_image_to_text(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image to open with PIL.

    Returns:
        Whatever pytesseract.image_to_string produces for the image.
    """
    return pytesseract.image_to_string(Image.open(image_path))
# Example: OCR the image receipt.jpg and print the recognized text
text = ocr_image_to_text('receipt.jpg')
print(text)
from ObjDocument import ObjDocument
# Initialize
# NOTE(review): the meaning of the 0 argument is not shown here — confirm
doc = ObjDocument(0)
# Read document details for the identifier "DOC001"
doc.Read("DOC001")
# Build file names based on type
# NOTE(review): presumably this is what populates Filename/Iconname below — verify
doc.build_name()
# Access document properties
print(f"Filename: {doc.Filename}")
print(f"Icon: {doc.Iconname}")
ObjDocument automatically detects and handles:
def get_document_type(filename):
    """Detect a document's category from its filename extension.

    Args:
        filename: File name or path. Only the text after the last '.'
            is inspected, case-insensitively.

    Returns:
        One of 'image', 'document', 'spreadsheet', 'data', 'audio' or
        'web'; 'unknown' when the name contains no '.' or the extension
        is not in the table.
    """
    type_map = {
        'JPG': 'image',
        'JPEG': 'image',
        'GIF': 'image',
        'PNG': 'image',
        'PDF': 'document',
        'DOCX': 'document',
        'XLSX': 'spreadsheet',
        'CSV': 'data',
        'WAV': 'audio',
        'MP3': 'audio',
        'HTML': 'web',
    }
    # rpartition reports whether a '.' was actually found. rsplit hands back
    # the whole name when there is none, so an extension-less file literally
    # named "pdf" used to be classified as a document.
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return 'unknown'
    return type_map.get(extension.upper(), 'unknown')
# Example: the PDF extension maps to the 'document' category
doc_type = get_document_type('invoice.pdf') # Returns: 'document'
import PyPDF2
def extract_pdf_metadata(pdf_path):
    """Extract metadata from a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys 'title', 'author', 'subject', 'creator',
        'producer' and 'creation_date' (each 'Unknown' when the PDF does
        not provide it) plus 'page_count'.
    """
    # Result key -> entry name in the PDF document-information dictionary.
    info_keys = {
        'title': '/Title',
        'author': '/Author',
        'subject': '/Subject',
        'creator': '/Creator',
        'producer': '/Producer',
        'creation_date': '/CreationDate',
    }
    with open(pdf_path, 'rb') as file:
        # PdfReader and len(.pages) replace PdfFileReader and numPages,
        # which PyPDF2 3.0 removed.
        pdf_reader = PyPDF2.PdfReader(file)
        # .metadata is None when the file has no information dictionary;
        # an empty mapping makes every field fall back to 'Unknown'
        # instead of raising AttributeError.
        info = pdf_reader.metadata or {}
        metadata = {name: info.get(key, 'Unknown') for name, key in info_keys.items()}
        metadata['page_count'] = len(pdf_reader.pages)
    return metadata
# Example: read the metadata of report.pdf and print two of the returned fields
metadata = extract_pdf_metadata('report.pdf')
print(f"Title: {metadata['title']}")
print(f"Pages: {metadata['page_count']}")
def extract_pdf_page(pdf_path, page_number):
    """Extract text from a specific PDF page.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page to extract.

    Returns:
        The text of the requested page, or None when page_number is
        negative or not smaller than the number of pages.
    """
    with open(pdf_path, 'rb') as file:
        # PdfReader / .pages / extract_text() replace the PdfFileReader
        # API that PyPDF2 3.0 removed.
        pdf_reader = PyPDF2.PdfReader(file)
        # Check the lower bound too: a negative index would pass the
        # upper-bound test and be resolved from the end of the document.
        if 0 <= page_number < len(pdf_reader.pages):
            return pdf_reader.pages[page_number].extract_text()
    return None
# Example: page numbers are zero-based, so 0 requests the first page
first_page = extract_pdf_page('contract.pdf', 0)
print(first_page)
# Use poppler-utils pdftoppm
pdftoppm -jpeg document.pdf output
# Creates: output-1.jpg, output-2.jpg, etc.
import pytesseract
from PIL import Image
def ocr_with_confidence(image_path, min_confidence=60):
    """OCR an image and keep only the words recognized with high confidence.

    Args:
        image_path: Path of the image to open with PIL.
        min_confidence: Words are kept only when their confidence is
            strictly greater than this value (default 60).

    Returns:
        The retained words joined by single spaces; an empty string when
        no word qualifies.
    """
    image = Image.open(image_path)
    # Get detailed per-entry data (parallel 'text' and 'conf' lists)
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    high_confidence_text = []
    for word, conf in zip(data['text'], data['conf']):
        # float() accepts integer and fractional confidences alike
        # ("96", "96.5", 96); int() raises ValueError on a string like "96.5".
        if float(conf) <= min_confidence:
            continue
        # Skip blank entries so the joined result has no runs of spaces.
        if not str(word).strip():
            continue
        high_confidence_text.append(word)
    return ' '.join(high_confidence_text)
# Example: keep only the words of scan.jpg recognized with confidence above 60
text = ocr_with_confidence('scan.jpg')
def ocr_multilanguage(image_path, languages='eng+deu'):
    """Run OCR on an image with an explicit language selection.

    Args:
        image_path: Path of the image to open with PIL.
        languages: Value passed to pytesseract as ``lang``; the default
            'eng+deu' combines two language codes with '+'.

    Returns:
        The text returned by pytesseract.image_to_string.
    """
    source_image = Image.open(image_path)
    return pytesseract.image_to_string(source_image, lang=languages)
# Example: language codes are combined with '+' in the second argument
# English + German
text_de = ocr_multilanguage('document.jpg', 'eng+deu')
# English + French
text_fr = ocr_multilanguage('document.jpg', 'eng+fra')
from PIL import Image, ImageEnhance, ImageFilter
def preprocess_for_ocr(image_path, min_width=1000, contrast=2.0):
    """Preprocess an image to improve OCR accuracy.

    The image is converted to grayscale, contrast-enhanced, sharpened and,
    when narrower than ``min_width``, upscaled to that width.

    Args:
        image_path: Path of the image to open with PIL.
        min_width: Images narrower than this many pixels are enlarged to
            exactly this width, keeping the aspect ratio (default 1000).
        contrast: Factor passed to ImageEnhance.Contrast.enhance
            (default 2.0).

    Returns:
        The processed PIL image (mode 'L').
    """
    # Convert to grayscale
    image = Image.open(image_path).convert('L')
    # Enhance contrast
    image = ImageEnhance.Contrast(image).enhance(contrast)
    # Sharpen image
    image = image.filter(ImageFilter.SHARPEN)
    # Resize if too small
    width, height = image.size
    if width < min_width:
        # Use the target width directly and round the height:
        # int(width * (min_width / width)) can truncate to min_width - 1
        # through floating-point error.
        new_height = max(1, round(height * min_width / width))
        image = image.resize((min_width, new_height), Image.LANCZOS)
    return image
# Example: preprocess a poor scan first, then OCR the returned image object
processed_image = preprocess_for_ocr('low_quality_scan.jpg')
text = pytesseract.image_to_string(processed_image)
def _sql_literal(value):
    """Render value as a single-quoted SQL string literal, or NULL for None."""
    if value is None:
        return 'NULL'
    # Doubling embedded single quotes is the standard SQL escape.
    return "'" + str(value).replace("'", "''") + "'"


def process_invoice(pdf_path):
    """Extract invoice fields from a PDF and store them in the database.

    Args:
        pdf_path: Path of the invoice PDF.

    Returns:
        A dict with the keys 'invoice_number', 'total_amount' and 'date'.
        Each value is the matched text, or None when the pattern was not
        found. A row is inserted into ``invoices`` only when an invoice
        number was found.
    """
    import re
    from ObjData import ObjData

    # Step 1: Extract text
    text = extract_text_from_pdf(pdf_path)

    # Step 2: Parse invoice fields
    patterns = {
        'invoice_number': r'Invoice #:?\s*(\d+)',
        'total_amount': r'Total:?\s*\$?([\d,]+\.?\d*)',
        'date': r'Date:?\s*(\d{1,2}/\d{1,2}/\d{4})',
    }
    invoice_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        invoice_data[key] = match.group(1) if match else None

    # Step 3: Store in database
    obj = ObjData(0)
    if invoice_data['invoice_number']:
        # The captured amount may contain thousands separators, which are
        # not valid in a SQL numeric literal ("1,500.00" would be read as
        # two values). After stripping, only digits and at most one '.'
        # remain, so the text is safe to embed unquoted.
        amount_sql = (invoice_data['total_amount'] or '').replace(',', '').rstrip('.') or '0'
        # NOTE(review): values are escaped by hand because the ObjData API
        # is not visible here; prefer bound parameters if it supports them.
        obj.sql_execute(f"""
            INSERT INTO invoices (
                invoice_number,
                total_amount,
                invoice_date,
                pdf_path,
                processed_at
            ) VALUES (
                {_sql_literal(invoice_data['invoice_number'])},
                {amount_sql},
                {_sql_literal(invoice_data['date'])},
                {_sql_literal(pdf_path)},
                NOW()
            )
        """)
    return invoice_data
# Example: process an invoice PDF and print the extracted number and total
# (either value is None when its pattern was not found in the text)
invoice_info = process_invoice('invoice_12345.pdf')
print(f"Invoice #{invoice_info['invoice_number']}: ${invoice_info['total_amount']}")
def process_receipt(image_path):
    """OCR a receipt image and extract its key information.

    Relies on the helpers extract_merchant, extract_date, extract_total and
    extract_line_items. NOTE(review): extract_date and extract_line_items
    are not shown in this part of the guide and must be provided.

    Args:
        image_path: Path of the receipt image.

    Returns:
        A dict with the keys 'merchant', 'date', 'total' and 'items',
        each holding the result of the corresponding helper.
    """
    # Preprocess image, then extract text
    text = pytesseract.image_to_string(preprocess_for_ocr(image_path))
    # Parse receipt data (the unused local "import re" was dropped; the
    # helpers do their own parsing)
    return {
        'merchant': extract_merchant(text),
        'date': extract_date(text),
        'total': extract_total(text),
        'items': extract_line_items(text),
    }
def extract_merchant(text):
    """Return the merchant name, taken as the first non-blank line.

    Args:
        text: Receipt text, with lines separated by '\\n'.

    Returns:
        The first line that is not empty after stripping whitespace
        (returned stripped), or 'Unknown' when there is no such line.
    """
    stripped_lines = (raw.strip() for raw in text.split('\n'))
    return next((candidate for candidate in stripped_lines if candidate), 'Unknown')
def extract_total(text):
    """Extract the total amount from receipt text.

    Looks for the whole word "Total" or "Amount" (case-insensitive)
    followed on the same line by a number that ends in two digits.

    Args:
        text: Receipt text to search.

    Returns:
        The matched amount as a string (commas preserved, no currency
        sign), or None when no amount is found.
    """
    import re
    # The \b anchors keep "Total" from matching inside "Subtotal", which
    # would return the subtotal because it appears first. IGNORECASE
    # already covers the upper-case spelling.
    match = re.search(
        r'\b(?:Total|Amount)\b.*?\$?\s*([\d,]+\.?\d{2})',
        text,
        re.IGNORECASE,
    )
    return match.group(1) if match else None
# Example: OCR receipt.jpg and print two of the extracted fields
receipt_info = process_receipt('receipt.jpg')
print(f"Merchant: {receipt_info['merchant']}")
print(f"Total: ${receipt_info['total']}")
def _sql_literal(value):
    """Render value as a single-quoted SQL string literal, or NULL for None."""
    if value is None:
        return 'NULL'
    # Doubling embedded single quotes is the standard SQL escape.
    return "'" + str(value).replace("'", "''") + "'"


def process_contract(pdf_path):
    """Extract key information from a contract PDF and store it.

    Relies on the helpers extract_parties, extract_effective_date,
    extract_term and extract_clause. NOTE(review): only extract_parties is
    shown in this part of the guide; the others must be provided.

    Args:
        pdf_path: Path of the contract PDF.

    Returns:
        A dict with the keys 'parties', 'effective_date', 'term_length',
        'termination_clause' and 'payment_terms'.
    """
    from ObjData import ObjData

    # Extract full text
    contract_text = extract_text_from_pdf(pdf_path)
    # Extract key clauses
    contract_data = {
        'parties': extract_parties(contract_text),
        'effective_date': extract_effective_date(contract_text),
        'term_length': extract_term(contract_text),
        'termination_clause': extract_clause(contract_text, 'termination'),
        'payment_terms': extract_clause(contract_text, 'payment'),
    }
    # Store contract. Every value is escaped: contract text routinely
    # contains apostrophes, which would otherwise end the string literal
    # and break (or alter) the statement. Missing values become NULL
    # rather than the text 'None'.
    # NOTE(review): values are escaped by hand because the ObjData API is
    # not visible here; prefer bound parameters if it supports them.
    obj = ObjData(0)
    obj.sql_execute(f"""
        INSERT INTO contracts (
            pdf_path,
            parties,
            effective_date,
            term_length,
            extracted_at,
            full_text
        ) VALUES (
            {_sql_literal(pdf_path)},
            {_sql_literal(contract_data["parties"])},
            {_sql_literal(contract_data["effective_date"])},
            {_sql_literal(contract_data["term_length"])},
            NOW(),
            {_sql_literal(contract_text)}
        )
    """)
    return contract_data
def extract_parties(text):
    """Extract contract parties.

    Placeholder: the implementation depends on the contract format and is
    left to the reader. As written it ignores ``text`` and returns None.
    """
    return None
from ObjData import ObjData
import os
import hashlib
def _sql_literal(value):
    """Render value as a single-quoted SQL string literal, or NULL for None."""
    if value is None:
        return 'NULL'
    # Doubling embedded single quotes is the standard SQL escape.
    return "'" + str(value).replace("'", "''") + "'"


def store_document(file_path, doc_type, metadata=None):
    """Store a document with its metadata in the database and on disk.

    Inserts a row into ``documents`` and then copies the file to
    ``local.documents/<doc_type>/<filename>``.

    Args:
        file_path: Path of the file to store.
        doc_type: Category name; also used as the storage sub-directory.
        metadata: Optional JSON-serializable value saved in the
            ``metadata_json`` column (None is stored as JSON ``null``).

    Returns:
        The path of the stored copy.

    Raises:
        TypeError: If ``metadata`` cannot be serialized to JSON.
    """
    import json

    obj = ObjData(0)
    # Read file
    with open(file_path, 'rb') as f:
        file_content = f.read()
    # Calculate hash
    file_hash = hashlib.sha256(file_content).hexdigest()
    # Take the size from the bytes actually read, so that size, hash and
    # the stored copy all describe the same content.
    file_size = len(file_content)
    filename = os.path.basename(file_path)
    # Serialize to real JSON; interpolating the object directly would store
    # its Python repr (single quotes, None), which is not JSON and breaks
    # the SQL quoting.
    metadata_json = json.dumps(metadata)
    # Store document
    # NOTE(review): values are escaped by hand because the ObjData API is
    # not visible here; prefer bound parameters if it supports them.
    obj.sql_execute(f"""
        INSERT INTO documents (
            filename,
            doc_type,
            file_size,
            file_hash,
            metadata_json,
            created_at
        ) VALUES (
            {_sql_literal(filename)},
            {_sql_literal(doc_type)},
            {file_size},
            {_sql_literal(file_hash)},
            {_sql_literal(metadata_json)},
            NOW()
        )
    """)
    # Copy to document storage
    storage_path = f"local.documents/{doc_type}/{filename}"
    os.makedirs(os.path.dirname(storage_path), exist_ok=True)
    with open(storage_path, 'wb') as f:
        f.write(file_content)
    return storage_path
# Use rsync to sync documents
rsync --mkpath -r local.documents/ data.documents/
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
def generate_invoice_pdf(invoice_data, output_path):
    """Generate an invoice PDF from a mapping of field names to values.

    Draws an "INVOICE" title followed by one "key: value" line per entry,
    continuing on a new page when the current one is full.

    Args:
        invoice_data: Mapping whose items are rendered in iteration order.
        output_path: Path of the PDF file to write.

    Returns:
        ``output_path``.
    """
    left_x = 100
    line_height = 20
    bottom_margin = 50

    c = canvas.Canvas(output_path, pagesize=letter)
    # Add title
    c.setFont("Helvetica-Bold", 20)
    c.drawString(left_x, 750, "INVOICE")
    # Add invoice details
    c.setFont("Helvetica", 12)
    y_position = 700
    for key, value in invoice_data.items():
        if y_position < bottom_margin:
            # Continue on a fresh page instead of drawing below the page
            # edge. showPage() resets the canvas state, so the font has to
            # be selected again.
            c.showPage()
            c.setFont("Helvetica", 12)
            y_position = 750
        c.drawString(left_x, y_position, f"{key}: {value}")
        y_position -= line_height
    # Save PDF
    c.save()
    return output_path
# Example: each key/value pair below becomes one "key: value" line in the PDF
invoice = {
"Invoice Number": "INV-001",
"Date": "2025-12-26",
"Customer": "John Doe",
"Amount": "$1,500.00"
}
# The function returns the output path it was given
pdf_path = generate_invoice_pdf(invoice, "invoice_001.pdf")
# Organize by type and date
def get_document_path(doc_type, filename):
    """Build the storage path for a document, organized by type and month.

    Args:
        doc_type: Category directory placed under ``local.documents``.
        filename: Name of the file.

    Returns:
        ``local.documents/<doc_type>/<YYYY>/<MM>/<filename>``, using the
        current local date.
    """
    from datetime import datetime
    today = datetime.now()
    return f"local.documents/{doc_type}/{today:%Y/%m}/{filename}"
# Example: the year/month part reflects the date on which the call runs
pdf_path = get_document_path("invoices", "invoice_123.pdf")
# Returns, e.g. when run in December 2025: local.documents/invoices/2025/12/invoice_123.pdf
import hashlib
def check_duplicate_document(file_path):
    """Check whether a document with the same content is already stored.

    Args:
        file_path: Path of the file to check.

    Returns:
        A tuple ``(is_duplicate, file_hash)``: ``is_duplicate`` is True
        when the lookup returns something other than None, and
        ``file_hash`` is the SHA-256 hex digest of the file.
    """
    # Hash in fixed-size chunks so a large document is never held in
    # memory as a whole.
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    file_hash = digest.hexdigest()
    # Check database (the digest is hex-only, so it is safe to embed)
    obj = ObjData(0)
    existing = obj.query_get_value(f"""
        SELECT document_id
        FROM documents
        WHERE file_hash = '{file_hash}'
    """)
    return existing is not None, file_hash
def save_document_version(document_id, file_path):
    """Save the file as a new version of a document.

    Args:
        document_id: Identifier of the document being versioned.
        file_path: Path of the file holding the new content.

    Returns:
        The new version number: one more than the highest stored version,
        or 1 when none exists.
    """
    obj = ObjData(0)
    # Escape the identifier by doubling single quotes (standard SQL).
    # NOTE(review): prefer bound parameters if the ObjData API supports them.
    document_id_sql = "'" + str(document_id).replace("'", "''") + "'"
    # Get current version
    current_version = obj.query_get_value(f"""
        SELECT MAX(version) FROM document_versions
        WHERE document_id = {document_id_sql}
    """) or 0
    # int() tolerates drivers that hand the value back as text or Decimal.
    new_version = int(current_version) + 1
    # Store new version
    with open(file_path, 'rb') as f:
        content = f.read()
    # bytes placed in an f-string render as their Python repr (b'...'),
    # which corrupts both the data and the statement's quoting. A hex
    # literal round-trips arbitrary bytes.
    # NOTE(review): X'..' is the SQL-standard / MySQL binary literal;
    # confirm it suits the database behind ObjData (PostgreSQL bytea would
    # need decode('<hex>', 'hex') instead).
    content_sql = "X'" + content.hex() + "'"
    obj.sql_execute(f"""
        INSERT INTO document_versions (
            document_id,
            version,
            file_content,
            created_at
        ) VALUES (
            {document_id_sql},
            {new_version},
            {content_sql},
            NOW()
        )
    """)
    return new_version
Solution: The PDF may be a scanned image with no text layer; convert its pages to images and run OCR on them.
# Convert PDF to images, then OCR
import pdf2image
def ocr_scanned_pdf(pdf_path):
    """OCR a scanned PDF by rendering each page to an image.

    Args:
        pdf_path: Path of the PDF to convert with pdf2image.

    Returns:
        The OCR text of all pages, in order, separated by blank lines.
    """
    page_images = pdf2image.convert_from_path(pdf_path)
    return '\n\n'.join(pytesseract.image_to_string(page) for page in page_images)
Solutions:
# 1. Preprocess image
processed = preprocess_for_ocr('image.jpg')
# 2. Use correct language (run OCR on the preprocessed image from step 1)
text = pytesseract.image_to_string(processed, lang='eng')
# 3. Specify page segmentation mode
text = pytesseract.image_to_string(
    processed,
    config='--psm 6'  # Assume single uniform block of text
)
Solution: Process the PDF in chunks of a few pages at a time instead of all at once.
def process_large_pdf(pdf_path, chunk_size=10):
    """Process a large PDF in chunks of pages.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Number of pages per chunk (default 10); must be at
            least 1.

    Returns:
        A list with one string per chunk, each holding the text of that
        chunk's pages joined by newlines. Empty when the PDF has no pages.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A zero step makes range() fail and a negative one silently yields no
    # chunks at all; reject both up front with a clear message.
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    with open(pdf_path, 'rb') as file:
        # PdfReader / .pages / extract_text() replace the PdfFileReader
        # API that PyPDF2 3.0 removed.
        pdf_reader = PyPDF2.PdfReader(file)
        pages = pdf_reader.pages
        total_pages = len(pages)
        results = []
        for start in range(0, total_pages, chunk_size):
            end = min(start + chunk_size, total_pages)
            chunk_text = [pages[page_num].extract_text() for page_num in range(start, end)]
            results.append('\n'.join(chunk_text))
    return results