混元OCR (Hunyuan OCR)
混元OCR 是腾讯基于混元大模型开发的智能OCR系统，结合了大语言模型的理解能力和传统OCR的高效性，特别适合复杂文档场景和中文文档处理。
🌟 核心特性
- ✅ 大模型驱动：基于混元大模型的深度理解能力
- ✅ 高精度识别：中文识别准确率达98%+
- ✅ 版面智能分析：精确识别文档结构和层次
- ✅ 多模态融合：图文联合理解
- ✅ 表格还原：高质量表格识别和重建
- ✅ 公式识别：数学公式LaTeX转换
- ✅ 实时推理：优化的推理速度
- ✅ 企业级稳定：腾讯云服务支持
📦 安装与使用
API方式
# 安装SDK
pip install tencentcloud-sdk-python
快速开始
from tencentcloud.common import credential
from tencentcloud.ocr.v20181119 import ocr_client, models
# Configure the credentials passed to the OCR client.
cred = credential.Credential("your_secret_id", "your_secret_key")

# Create the OCR client for the "ap-guangzhou" region.
client = ocr_client.OcrClient(cred, "ap-guangzhou")


# General-purpose OCR
def general_ocr(image_path):
    """Run general OCR on one image file and return the recognized text.

    The file is read, Base64-encoded and submitted through the module-level
    ``client``. The ``DetectedText`` of every entry in the response's
    ``TextDetections`` is joined with newlines into a single string.
    """
    import base64

    # Read the image and Base64-encode it for the request payload.
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()

    request = models.GeneralBasicOCRRequest()
    request.ImageBase64 = encoded

    # Call the API.
    response = client.GeneralBasicOCR(request)

    # Collect the detected lines into one newline-separated string.
    return '\n'.join(
        detection.DetectedText for detection in response.TextDetections
    )


# Usage example
text = general_ocr("document.jpg")
print(text)
🏗️ 技术架构
graph TB
A[输入图像] --> B[图像预处理]
B --> C[文本检测]
C --> D[版面分析]
D --> E[文本识别]
E --> F[混元大模型后处理]
F --> G[结构化输出]
C -->|高精度检测| C1[文本定位]
D -->|智能分析| D1[版面理解]
E -->|深度识别| E1[字符识别]
F -->|语义增强| F1[结果优化]
🎯 应用场景
1. 通用文字识别
from tencentcloud.ocr.v20181119 import ocr_client, models
from tencentcloud.common import credential
import base64
def universal_ocr(image_path):
    """Recognize text in an image and return per-detection details.

    Returns a list of dicts, one per entry in the response's
    ``TextDetections``, each with the keys ``text``, ``confidence`` and
    ``polygon`` copied from the detection's ``DetectedText``,
    ``Confidence`` and ``Polygon`` attributes.
    """
    api_client = ocr_client.OcrClient(
        credential.Credential("SECRET_ID", "SECRET_KEY"),
        "ap-guangzhou",
    )

    # Read the image and Base64-encode it.
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()

    # Build the request.
    request = models.GeneralBasicOCRRequest()
    request.ImageBase64 = encoded
    request.LanguageType = "zh"  # Chinese

    # Call the API.
    response = api_client.GeneralBasicOCR(request)

    # Extract the text together with its confidence and position.
    return [
        {
            "text": detection.DetectedText,
            "confidence": detection.Confidence,
            "polygon": detection.Polygon,
        }
        for detection in response.TextDetections
    ]


# Usage
results = universal_ocr("document.jpg")
for r in results:
    print(f"文本: {r['text']}, 置信度: {r['confidence']}")
2. 高精度OCR
def high_accuracy_ocr(image_path):
    """Run high-accuracy OCR (suited to printed text) on an image.

    Returns the ``DetectedText`` of every entry in the response's
    ``TextDetections`` as a list of strings.
    """
    api_client = ocr_client.OcrClient(
        credential.Credential("SECRET_ID", "SECRET_KEY"),
        "ap-guangzhou",
    )

    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()

    request = models.GeneralAccurateOCRRequest()
    request.ImageBase64 = encoded
    response = api_client.GeneralAccurateOCR(request)

    # Return the high-accuracy results, one string per detection.
    detected_lines = [
        detection.DetectedText for detection in response.TextDetections
    ]
    return detected_lines


# Suited to contracts, certificates, receipts and other accuracy-critical cases
texts = high_accuracy_ocr("contract.jpg")
3. 表格识别
def recognize_table(image_path):
    """Recognize the tables in an image and return their cells.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        A list with one entry per detected table. Each entry is a list of
        dicts with the keys ``row`` and ``col`` (the cell's ``RowTl`` /
        ``ColTl`` values) and ``text`` (the cell's ``Text``).
    """
    cred = credential.Credential("SECRET_ID", "SECRET_KEY")
    client = ocr_client.OcrClient(cred, "ap-guangzhou")

    with open(image_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode()

    # The parsing below reads ``TableDetections`` and per-table ``Cells``.
    # That is the response layout of RecognizeTableOCR; the older TableOCR
    # call returns a flat ``TextDetections`` list instead, so it cannot be
    # used with this parsing code.
    req = models.RecognizeTableOCRRequest()
    req.ImageBase64 = image_base64
    resp = client.RecognizeTableOCR(req)

    # Parse the table structure. ``or []`` tolerates fields the SDK left
    # unset (None) when nothing was detected.
    return [
        [
            {"row": cell.RowTl, "col": cell.ColTl, "text": cell.Text}
            for cell in (table.Cells or [])
        ]
        for table in (resp.TableDetections or [])
    ]
# 转为DataFrame
import pandas as pd
def table_to_dataframe(table_cells):
    """Convert the cells of one table into a pandas DataFrame.

    Args:
        table_cells: Iterable of dicts with the keys ``row``, ``col``
            (zero-based grid position) and ``text``.

    Returns:
        A DataFrame sized to the largest row/column index seen, with each
        cell's text at its position and ``''`` everywhere else. An empty
        DataFrame is returned when there are no usable cells.
    """
    # Materialize once so generators work, and drop cells with a negative
    # index: Python indexing would otherwise wrap around and silently
    # overwrite a cell in the last row/column.
    cells = [
        cell for cell in table_cells
        if cell['row'] >= 0 and cell['col'] >= 0
    ]
    if not cells:
        # max() on an empty sequence would raise ValueError.
        return pd.DataFrame()

    # Build the two-dimensional grid.
    n_rows = max(cell['row'] for cell in cells) + 1
    n_cols = max(cell['col'] for cell in cells) + 1
    grid = [[''] * n_cols for _ in range(n_rows)]

    for cell in cells:
        grid[cell['row']][cell['col']] = cell['text']

    return pd.DataFrame(grid)
4. 证件识别
def id_card_ocr(image_path, card_side='FRONT'):
    """Recognize one side of an ID card and return its fields.

    When ``card_side`` is ``'FRONT'`` the result has the keys ``name``,
    ``sex``, ``nation``, ``birth``, ``address`` and ``id_num``. For any
    other value it has the keys ``authority`` and ``valid_date``.
    """
    api_client = ocr_client.OcrClient(
        credential.Credential("SECRET_ID", "SECRET_KEY"),
        "ap-guangzhou",
    )

    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()

    request = models.IDCardOCRRequest()
    request.ImageBase64 = encoded
    request.CardSide = card_side  # FRONT/BACK
    response = api_client.IDCardOCR(request)

    # Guard clause: everything that is not the front is handled as the back.
    if card_side != 'FRONT':
        return {
            "authority": response.Authority,
            "valid_date": response.ValidDate,
        }

    return {
        "name": response.Name,
        "sex": response.Sex,
        "nation": response.Nation,
        "birth": response.Birth,
        "address": response.Address,
        "id_num": response.IdNum,
    }


# Recognize both sides of an ID card
front_info = id_card_ocr("id_front.jpg", "FRONT")
back_info = id_card_ocr("id_back.jpg", "BACK")
print(f"姓名: {front_info['name']}")
print(f"身份证号: {front_info['id_num']}")
5. 发票识别
def invoice_ocr(image_path):
    """Recognize a VAT invoice and return its header fields and line items.

    Returns a dict with the invoice header values plus an ``items`` list
    holding one dict per entry in the response's ``VatInvoiceItemInfos``.
    """
    # NOTE(review): the response attribute names read below (InvoiceCode,
    # VatInvoiceItemInfos, Specification, Price, Amount, ...) should be
    # checked against the VatInvoiceOCR API reference -- TODO confirm.
    api_client = ocr_client.OcrClient(
        credential.Credential("SECRET_ID", "SECRET_KEY"),
        "ap-guangzhou",
    )

    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()

    request = models.VatInvoiceOCRRequest()
    request.ImageBase64 = encoded
    response = api_client.VatInvoiceOCR(request)

    # Extract the invoice header information.
    invoice_info = {
        "invoice_code": response.InvoiceCode,
        "invoice_num": response.InvoiceNum,
        "invoice_date": response.InvoiceDate,
        "buyer_name": response.BuyerName,
        "buyer_tax_code": response.BuyerTaxCode,
        "seller_name": response.SellerName,
        "seller_tax_code": response.SellerTaxCode,
        "total_amount": response.TotalAmount,
        "tax_amount": response.TaxAmount,
        "items": [],
    }

    # Line-item details.
    invoice_info["items"].extend(
        {
            "name": line.Name,
            "spec": line.Specification,
            "unit": line.Unit,
            "quantity": line.Quantity,
            "price": line.Price,
            "amount": line.Amount,
        }
        for line in response.VatInvoiceItemInfos
    )

    return invoice_info


# Usage
invoice = invoice_ocr("invoice.jpg")
print(f"发票号: {invoice['invoice_num']}")
print(f"价税合计: {invoice['total_amount']}")
6. 批量文档处理
from pathlib import Path
import json
import time
def batch_ocr_documents(input_dir, output_dir):
    """Run OCR over every ``*.jpg`` in a directory and save the results.

    For each image the recognized text is written to
    ``<output_dir>/<image stem>.txt``. A ``report.json`` summarizing every
    file is written to ``output_dir`` at the end.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.jpg`` files.
        output_dir: Directory for the text files and the report; created
            (including missing parents) if it does not exist.

    Returns:
        A list of per-file dicts: ``{"file", "status": "success", "chars"}``
        on success or ``{"file", "status": "error", "error"}`` on failure.
    """
    cred = credential.Credential("SECRET_ID", "SECRET_KEY")
    client = ocr_client.OcrClient(cred, "ap-guangzhou")

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested output directory does not fail the whole run.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []

    # sorted() makes the processing order, and so the report, deterministic.
    for img_file in sorted(input_path.glob("*.jpg")):
        print(f"处理: {img_file.name}")
        try:
            # OCR recognition
            with open(img_file, "rb") as f:
                image_base64 = base64.b64encode(f.read()).decode()

            req = models.GeneralBasicOCRRequest()
            req.ImageBase64 = image_base64
            resp = client.GeneralBasicOCR(req)

            # Extract the text
            texts = [item.DetectedText for item in resp.TextDetections]
            full_text = '\n'.join(texts)

            # Save the result
            txt_file = output_path / f"{img_file.stem}.txt"
            with open(txt_file, "w", encoding="utf-8") as f:
                f.write(full_text)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the
            # batch, so the failure is recorded and the loop continues.
            results.append({
                "file": img_file.name,
                "status": "error",
                "error": str(e)
            })
        else:
            results.append({
                "file": img_file.name,
                "status": "success",
                "chars": len(full_text)
            })
        finally:
            # Throttle after every attempt, including failed ones, to stay
            # under the QPS limit.
            time.sleep(0.1)

    # Save the report. ensure_ascii=False emits raw non-ASCII characters,
    # so the file must be opened as UTF-8 explicitly rather than with the
    # platform default encoding.
    with open(output_path / "report.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results
🔧 高级配置
自定义识别参数
def custom_ocr(image_path):
    """Run general OCR on an image with explicitly set request options.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The raw response object returned by ``client.GeneralBasicOCR``.
        The module-level ``client`` is used.
    """
    # Read and encode the image. The original referenced ``image_base64``
    # without ever defining it, which raised NameError on every call.
    with open(image_path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode()

    req = models.GeneralBasicOCRRequest()
    req.ImageBase64 = image_base64

    # Advanced parameters
    req.Scene = "doc"          # scene: doc/card/receipt
    req.LanguageType = "zh"    # language type
    req.IsPdf = False          # whether the input is a PDF
    req.PdfPageNumber = 1      # PDF page number
    req.IsWords = True         # return per-character information

    resp = client.GeneralBasicOCR(req)
    return resp
异步批量处理
import asyncio
from concurrent.futures import ThreadPoolExecutor
async def async_ocr_batch(image_paths):
    """Run ``universal_ocr`` over several images concurrently.

    Each path is submitted to a thread pool of at most five workers and
    the calls are awaited together.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A list of ``universal_ocr`` results in the same order as
        ``image_paths`` (``asyncio.gather`` preserves argument order).

    Raises:
        Exception: The first exception raised by any ``universal_ocr`` call
            is propagated by ``asyncio.gather``.
    """
    # Inside a coroutine get_running_loop() is the preferred way to obtain
    # the loop; get_event_loop() has more complex, policy-dependent
    # behavior.
    loop = asyncio.get_running_loop()

    with ThreadPoolExecutor(max_workers=5) as executor:
        tasks = [
            loop.run_in_executor(executor, universal_ocr, path)
            for path in image_paths
        ]
        results = await asyncio.gather(*tasks)

    return results
# 使用
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = asyncio.run(async_ocr_batch(image_paths))
结果后处理
def post_process_ocr(ocr_results):
    """Clean up OCR results and keep only the high-confidence ones.

    For every item with ``confidence >= 90`` the text is:

    1. whitespace-normalized (runs of whitespace collapse to one space,
       leading/trailing whitespace is dropped);
    2. converted from full-width digits to ASCII digits;
    3. corrected for look-alike characters (lowercase ``l`` -> ``1`` and
       Cyrillic capital O -> ``0``), but only inside tokens that already
       contain a digit and are not attached to Latin letters.

    Args:
        ocr_results: Iterable of dicts with the keys ``text`` and
            ``confidence``.

    Returns:
        A list of the cleaned text strings, in input order.
    """
    import re

    # Full-width digits U+FF10..U+FF19 -> "0".."9", applied in one pass.
    fullwidth_digits = {0xFF10 + digit: str(digit) for digit in range(10)}

    # Characters OCR commonly confuses with digits. Replacing them
    # everywhere would corrupt ordinary words ("hello" -> "he11o"), so the
    # correction is limited to numeric-looking tokens: a run of digits and
    # confusables that contains at least one real digit and is not glued
    # to a Latin letter on either side ("model3" stays untouched).
    confusables = str.maketrans({"\u041e": "0", "l": "1"})
    numeric_token = re.compile(
        r"(?<![A-Za-z])[\dl\u041e]*\d[\dl\u041e]*(?![A-Za-z])"
    )

    processed = []
    for item in ocr_results:
        text = item['text']

        # Collapse redundant whitespace.
        text = ' '.join(text.split())

        # Fix common recognition errors.
        text = text.translate(fullwidth_digits)
        text = numeric_token.sub(
            lambda match: match.group().translate(confusables), text
        )

        # Keep only high-confidence results.
        if item['confidence'] >= 90:
            processed.append(text)

    return processed
📊 性能对比
| 指标 | 混元OCR | PaddleOCR | Tesseract |
|---|---|---|---|
| 中文准确率 | 98%+ | 95% | 85% |
| 处理速度 | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ |
| 版面理解 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ |
| 表格识别 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ |
| 部署方式 | 云API | 本地/云 | 本地 |
💡 最佳实践
- 图像质量优化
from PIL import Image, ImageEnhance
def enhance_image(image_path):
    """Boost an image's contrast and sharpness before OCR.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The enhanced PIL image: contrast factor 1.5 applied first, then
        sharpness factor 2.0.
    """
    # Open in a context manager so the underlying file is closed once the
    # first enhancement has produced a new in-memory image.
    with Image.open(image_path) as img:
        # Adjust contrast
        contrasted = ImageEnhance.Contrast(img).enhance(1.5)

    # Sharpen
    return ImageEnhance.Sharpness(contrasted).enhance(2.0)
- 错误重试机制
def ocr_with_retry(image_path, max_retries=3):
    """Call ``universal_ocr``, retrying on failure.

    Args:
        image_path: Path of the image file to recognize.
        max_retries: Total number of attempts; must be at least 1.

    Returns:
        Whatever ``universal_ocr(image_path)`` returns on the first
        successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Previously the
            function silently returned ``None`` without attempting OCR.
        Exception: The error from the final attempt, re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        try:
            return universal_ocr(image_path)
        except Exception:
            # Out of attempts: let the last error propagate unchanged.
            if attempt == max_retries:
                raise
            # Fixed one-second pause before the next attempt.
            time.sleep(1)
- 成本控制
# 使用缓存避免重复调用
import hashlib
import json
def cached_ocr(image_path, cache_dir="./cache"):
    """Run ``universal_ocr`` with an on-disk cache keyed by image content.

    The cache key is the MD5 hex digest of the image bytes (used only as a
    content fingerprint, not for security), so identical images share one
    cache entry regardless of file name.

    Args:
        image_path: Path of the image file to recognize.
        cache_dir: Directory holding ``<md5>.json`` cache files; created
            if missing.

    Returns:
        The cached result if one exists, otherwise the fresh
        ``universal_ocr`` result (which is then written to the cache).

    Raises:
        TypeError: If the OCR result is not JSON-serializable; nothing is
            written to the cache in that case.
    """
    # Local import: the original used ``os`` without importing it.
    import os

    # Fingerprint the image.
    with open(image_path, "rb") as f:
        img_hash = hashlib.md5(f.read()).hexdigest()

    cache_file = os.path.join(cache_dir, f"{img_hash}.json")

    # Check the cache. A missing or unreadable (e.g. truncated) entry is
    # treated as a cache miss rather than a hard failure.
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        pass

    # Call the API.
    result = universal_ocr(image_path)

    # Serialize BEFORE opening the file so a serialization error cannot
    # leave a truncated cache file behind.
    # NOTE(review): results holding SDK model objects (e.g. the "polygon"
    # values) may not be JSON-serializable -- TODO confirm and convert.
    payload = json.dumps(result)

    # Save to the cache.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w", encoding="utf-8") as f:
        f.write(payload)

    return result
📚 资源链接
- 官方文档: https://cloud.tencent.com/document/product/866
- API文档: https://cloud.tencent.com/document/api/866/33515
- SDK下载: https://github.com/TencentCloud/tencentcloud-sdk-python
- 控制台: https://console.cloud.tencent.com/ocr
⚠️ 注意事项
- 需要腾讯云账号和API密钥
- 按调用次数计费，注意成本控制
- 单张图片大小限制7MB
- 存在QPS限制，需要控制并发
- 处理敏感信息时注意数据安全
💰 价格说明
| 服务 | 免费额度 | 价格 |
|---|---|---|
| 通用OCR | 1000次/月 | ¥0.15/次 |
| 高精度OCR | - | ¥0.50/次 |
| 表格识别 | - | ¥0.50/次 |
| 身份证识别 | 1000次/月 | ¥0.15/次 |
| 发票识别 | 1000次/月 | ¥0.15/次 |
🆚 适用场景
选择混元OCR当:
- 需要高准确率(特别是中文)
- 云端部署可接受
- 企业级稳定性要求
- 需要专业的证件/票据识别
- 预算充足
选择其他工具当:
- 需要本地部署 → PaddleOCR
- 预算有限 → Tesseract
- 自定义需求多 → 开源方案
- 实时性要求高 → 边缘设备方案
🔄 更新日志
- 2024.08: 接入混元大模型，准确率提升
- 2024.05: 优化表格识别
- 2024.02: 新增版面分析功能
- 2023.12: 性能优化