Image
生成和处理图像内容的基础功能模块
Core functionality module for generating and processing image content
clawhub install image
使用 OCR 技术识别和处理扫描文档或图像中的文字信息。
# 安装 Skill
npx skills add dkyazzentwatwa/chatgpt-skills@ocr-document-processor
# 安装后 Claude Code 会自动识别并使用
# 同样的安装命令,兼容所有支持 SKILL.md 的 AI 编程工具
npx skills add dkyazzentwatwa/chatgpt-skills@ocr-document-processor
需要 ChatGPT API 密钥和 OCR 服务配置
from scripts.ocr_processor import OCRProcessor
# Simple text extraction
processor = OCRProcessor("document.png")
text = processor.extract_text()
print(text)
# Extract to structured format
result = processor.extract_structured()
print(result['text'])
print(result['confidence'])
print(result['blocks']) # Text blocks with positions
from scripts.ocr_processor import OCRProcessor
# From image
processor = OCRProcessor("scan.png")
text = processor.extract_text()
# From PDF
processor = OCRProcessor("scanned.pdf")
text = processor.extract_text() # All pages
# Specific pages
text = processor.extract_text(pages=[1, 2, 3])
# Get detailed results
result = processor.extract_structured()
# Result contains:
# - text: Full extracted text
# - blocks: Text blocks with bounding boxes
# - lines: Individual lines
# - words: Individual words with confidence
# - confidence: Overall confidence score
# - language: Detected language
# Export to Markdown
processor.export_markdown("output.md")
# Export to JSON
processor.export_json("output.json")
# Export to searchable PDF
processor.export_searchable_pdf("searchable.pdf")
# Export to HTML
processor.export_html("output.html")
# Specify language for better accuracy
processor = OCRProcessor("german_doc.png", lang='deu')
# Multiple languages
processor = OCRProcessor("mixed_doc.png", lang='eng+fra+deu')
# Auto-detect language
processor = OCRProcessor("document.png", lang='auto')
# Enable preprocessing
processor = OCRProcessor("noisy_scan.png")
processor.preprocess(
deskew=True, # Fix rotation
denoise=True, # Remove noise
threshold=True, # Binarize image
contrast=1.5 # Enhance contrast
)
text = processor.extract_text()
| Option | Description | Default |
|--------|-------------|---------|
| deskew | Correct skewed/rotated images | False |
| denoise | Remove noise and artifacts | False |
| threshold | Convert to black/white | False |
| threshold_method | 'otsu', 'adaptive', 'simple' | 'otsu' |
| contrast | Contrast factor (1.0 = no change) | 1.0 |
| sharpen | Sharpen factor (0 = none) | 0 |
| scale | Upscale factor for small text | 1.0 |
| remove_shadows | Remove shadow artifacts | False |
# Extract tables from document
tables = processor.extract_tables()
# Each table is a list of rows
for table in tables:
for row in table:
print(row)
# Export tables to CSV
processor.export_tables_csv("tables/")
# Export to JSON
processor.export_tables_json("tables.json")
# Process all pages
processor = OCRProcessor("document.pdf")
full_text = processor.extract_text()
# Process specific pages
page_3 = processor.extract_text(pages=[3])
# Get per-page results
results = processor.extract_by_page()
for page_num, text in results.items():
print(f"Page {page_num}: {len(text)} characters")
# Convert scanned PDF to searchable PDF
processor = OCRProcessor("scanned.pdf")
processor.export_searchable_pdf("searchable.pdf")
from scripts.ocr_processor import batch_ocr
# Process directory of images
results = batch_ocr(
input_dir="scans/",
output_dir="extracted/",
output_format="markdown",
lang="eng",
recursive=True
)
print(f"Processed: {results['success']} files")
print(f"Failed: {results['failed']} files")
# Parse receipt structure
processor = OCRProcessor("receipt.jpg")
receipt_data = processor.parse_receipt()
# Returns structured data:
# - vendor: Store name
# - date: Transaction date
# - items: List of items with prices
# - subtotal: Subtotal amount
# - tax: Tax amount
# - total: Total amount
# Extract business card info
processor = OCRProcessor("card.jpg")
contact = processor.parse_business_card()
# Returns:
# - name: Person's name
# - title: Job title
# - company: Company name
# - email: Email addresses
# - phone: Phone numbers
# - address: Physical address
# - website: Website URLs
processor = OCRProcessor("document.png")
# Configure OCR settings
processor.config.update({
'psm': 3, # Page segmentation mode
'oem': 3, # OCR engine mode
'dpi': 300, # DPI for processing
'timeout': 30, # Timeout in seconds
'min_confidence': 60, # Minimum word confidence
})
# Get confidence scores
result = processor.extract_structured()
# Overall confidence (0-100)
print(f"Confidence: {result['confidence']}%")
# Per-word confidence
for word in result['words']:
print(f"{word['text']}: {word['confidence']}%")
# Filter low-confidence words
high_conf_words = [w for w in result['words'] if w['confidence'] > 80]
processor.export_markdown("output.md")
processor.export_json("output.json")
{
"source": "document.pdf",
"pages": 5,
"language": "eng",
"confidence": 92.5,
"text": "Full extracted text...",
"blocks": [
{
"type": "paragraph",
"text": "Block text...",
"bbox": [x, y, width, height],
"confidence": 95.2
}
],
"tables": [...]
}
processor.export_html("output.html")
# Basic extraction
python ocr_processor.py image.png -o output.txt
# Extract to markdown
python ocr_processor.py document.pdf -o output.md --format markdown
# Specify language
python ocr_processor.py german.png --lang deu
# Batch processing
python ocr_processor.py scans/ -o extracted/ --batch
# With preprocessing
python ocr_processor.py noisy.png --preprocess --deskew --denoise
from scripts.ocr_processor import OCRProcessor, OCRError
try:
processor = OCRProcessor("document.png")
text = processor.extract_text()
except OCRError as e:
print(f"OCR failed: {e}")
except FileNotFoundError:
print("File not found")
pytesseract>=0.3.10
Pillow>=10.0.0
PyMuPDF>=1.23.0
opencv-python>=4.8.0
numpy>=1.24.0
npx skills run ocr-document-processor --image scan.jpg --language zh-CN --format structured --output result.json
图像托管:将图像上传到 img402.dev 获取公开链接,用于消息分享、文档嵌入或社交媒体发布。
Upload images to img402.dev and get a public URL. Free tier: 1MB max, 7-day retention, no auth. Use when the agent needs a hosted image URL — for sharing in messages, embedding in documents, posting to social platforms, or any context that requires a public link to an image file.
clawhub install image-hosting