From c520fe4329c0f0ee748ec37292c69f76b3f8e939 Mon Sep 17 00:00:00 2001 From: hankl Date: Sat, 9 May 2026 19:12:54 +0800 Subject: [PATCH 01/15] init commit --- docs/nova-pdf-refactor-zhipu.md | 565 ++++++++ docs/nova-pdf-technical-design.md | 1175 +++++++++++++++++ docs/spec.md | 31 + packages/nova-pdf/README.md | 166 +++ packages/nova-pdf/pyproject.toml | 74 ++ packages/nova-pdf/src/nova_pdf/__about__.py | 1 + packages/nova-pdf/src/nova_pdf/__init__.py | 13 + packages/nova-pdf/src/nova_pdf/_ai_service.py | 202 +++ packages/nova-pdf/src/nova_pdf/_config.py | 101 ++ packages/nova-pdf/src/nova_pdf/_converter.py | 251 ++++ .../nova-pdf/src/nova_pdf/_page_analyzer.py | 117 ++ .../nova-pdf/src/nova_pdf/_page_renderer.py | 32 + packages/nova-pdf/src/nova_pdf/_plugin.py | 56 + packages/nova-pdf/tests/__init__.py | 1 + packages/nova-pdf/tests/test_ai_service.py | 103 ++ packages/nova-pdf/tests/test_analyzer.py | 131 ++ packages/nova-pdf/tests/test_converter.py | 181 +++ scripts/load_secrets.sh | 13 + 18 files changed, 3213 insertions(+) create mode 100644 docs/nova-pdf-refactor-zhipu.md create mode 100644 docs/nova-pdf-technical-design.md create mode 100644 docs/spec.md create mode 100644 packages/nova-pdf/README.md create mode 100644 packages/nova-pdf/pyproject.toml create mode 100644 packages/nova-pdf/src/nova_pdf/__about__.py create mode 100644 packages/nova-pdf/src/nova_pdf/__init__.py create mode 100644 packages/nova-pdf/src/nova_pdf/_ai_service.py create mode 100644 packages/nova-pdf/src/nova_pdf/_config.py create mode 100644 packages/nova-pdf/src/nova_pdf/_converter.py create mode 100644 packages/nova-pdf/src/nova_pdf/_page_analyzer.py create mode 100644 packages/nova-pdf/src/nova_pdf/_page_renderer.py create mode 100644 packages/nova-pdf/src/nova_pdf/_plugin.py create mode 100644 packages/nova-pdf/tests/__init__.py create mode 100644 packages/nova-pdf/tests/test_ai_service.py create mode 100644 packages/nova-pdf/tests/test_analyzer.py create mode 100644 
packages/nova-pdf/tests/test_converter.py create mode 100755 scripts/load_secrets.sh diff --git a/docs/nova-pdf-refactor-zhipu.md b/docs/nova-pdf-refactor-zhipu.md new file mode 100644 index 000000000..cf6b2b5ff --- /dev/null +++ b/docs/nova-pdf-refactor-zhipu.md @@ -0,0 +1,565 @@ +# Nova-PDF 重构方案:使用 zai-sdk + glm-ocr + +## 1. 重构目标 + +将现有的自定义 AI 服务替换为 zai-sdk + glm-ocr,简化代码并提升 OCR 能力。 + +## 2. 技术对比 + +| 项目 | 原方案 | 新方案 | +|------|--------|--------| +| SDK | requests (手动调用) | zai-sdk (官方 SDK) | +| 模型 | 自定义 Workflow | glm-ocr | +| 接口 | 两步上传(上传+调用) | 直接调用 layout_parsing | +| 认证 | 双 token (upload + workflow) | 单 API key | +| 配置 | 环境变量 | 配置文件 + 环境变量 | + +## 3. 接口分析 + +### 3.1 glm-ocr API + +```python +from zai import ZhipuAiClient + +client = ZhipuAiClient(api_key="your-api-key") + +# 支持图片 URL +response = client.layout_parsing.create( + model="glm-ocr", + file="https://example.com/image.png" +) + +# 支持本地文件路径 +response = client.layout_parsing.create( + model="glm-ocr", + file="/path/to/image.png" +) + +# 返回结果(包含 Markdown 格式的内容) +print(response) +``` + +### 3.2 响应结构 + +```python +# response 包含解析后的结构化内容 +# 具体字段需查看实际返回,通常包括: +# - 文本内容 +# - 布局信息 +# - 表格识别结果 +# - Markdown 格式输出 +``` + +## 4. 架构设计 + +### 4.1 组件变更 + +``` +原架构: +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Page Renderer │────►│ File Uploader │────►│ Workflow API │ +│ (截图) │ │ (上传获取URL) │ │ (自定义接口) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + +新架构: +┌─────────────────┐ ┌─────────────────┐ +│ Page Renderer │────►│ glm-ocr API │ +│ (截图→临时文件) │ │ (layout_parsing)│ +└─────────────────┘ └─────────────────┘ +``` + +### 4.2 文件变更清单 + +| 文件 | 变更类型 | 说明 | +|------|----------|------| +| `_ai_service.py` | **重写** | 使用 zai-sdk + glm-ocr | +| `_converter.py` | 微调 | 适配新 AIService 接口 | +| `_plugin.py` | 微调 | 简化配置参数 | +| `pyproject.toml` | 更新 | 添加 zai-sdk 依赖 | +| `_config.py` | **新增** | 配置文件读取 | +| `README.md` | 更新 | 新的使用说明 | + +## 5. 
详细设计 + +### 5.1 配置模块 (_config.py) + +```python +"""Configuration management for nova-pdf.""" + +import os +from pathlib import Path +from typing import Optional +from dataclasses import dataclass + +try: + import tomllib # Python 3.11+ +except ImportError: + import tomli as tomllib + + +@dataclass +class NovaPdfConfig: + """nova-pdf configuration.""" + + # API 配置 + zhipu_api_key: str = "" + + # OCR 配置 + model: str = "glm-ocr" + dpi: int = 150 + timeout: int = 120 + + # 处理策略 + force_ai: bool = False + + @classmethod + def load(cls, config_path: Optional[str] = None) -> "NovaPdfConfig": + """ + 从多个来源加载配置(优先级从高到低): + 1. 环境变量 + 2. 配置文件 (pyproject.toml 或 nova-pdf.toml) + 3. 默认值 + """ + config = cls() + + # 1. 从配置文件加载 + config._load_from_file(config_path) + + # 2. 环境变量覆盖 + config._load_from_env() + + return config + + def _load_from_file(self, config_path: Optional[str] = None): + """从配置文件加载""" + # 查找配置文件 + search_paths = [] + + if config_path: + search_paths.append(Path(config_path)) + + # 当前目录的 pyproject.toml + search_paths.append(Path("pyproject.toml")) + + # 当前目录的 nova-pdf.toml + search_paths.append(Path("nova-pdf.toml")) + + # 用户目录 + search_paths.append(Path.home() / ".config" / "nova-pdf" / "config.toml") + + for path in search_paths: + if path.exists(): + try: + with open(path, "rb") as f: + data = tomllib.load(f) + + # 读取 [tool.nova-pdf] 配置段 + if "tool" in data and "nova-pdf" in data["tool"]: + self._apply_config(data["tool"]["nova-pdf"]) + elif "nova-pdf" in data: + self._apply_config(data["nova-pdf"]) + + break + except Exception: + pass + + def _apply_config(self, data: dict): + """应用配置""" + if "api_key" in data: + self.zhipu_api_key = data["api_key"] + if "model" in data: + self.model = data["model"] + if "dpi" in data: + self.dpi = data["dpi"] + if "timeout" in data: + self.timeout = data["timeout"] + if "force_ai" in data: + self.force_ai = data["force_ai"] + + def _load_from_env(self): + """从环境变量加载(优先级最高)""" + if os.environ.get("NOVA_ZHIPU_API_KEY"): + 
self.zhipu_api_key = os.environ["NOVA_ZHIPU_API_KEY"] + if os.environ.get("NOVA_MODEL"): + self.model = os.environ["NOVA_MODEL"] + if os.environ.get("NOVA_DPI"): + self.dpi = int(os.environ["NOVA_DPI"]) + if os.environ.get("NOVA_TIMEOUT"): + self.timeout = int(os.environ["NOVA_TIMEOUT"]) + if os.environ.get("NOVA_FORCE_AI"): + self.force_ai = os.environ["NOVA_FORCE_AI"].lower() in ("true", "1", "yes") +``` + +### 5.2 AI 服务模块 (_ai_service.py) + +```python +"""AI service using zai-sdk and glm-ocr.""" + +import io +import os +import tempfile +from dataclasses import dataclass +from typing import BinaryIO, Optional + +try: + from zai import ZhipuAiClient +except ImportError: + ZhipuAiClient = None + +from ._config import NovaPdfConfig + + +@dataclass +class AIResult: + """Result from AI conversion.""" + text: str + success: bool = True + error: Optional[str] = None + + +class AIService: + """ + AI 服务 - 使用 zai-sdk + glm-ocr + + 特点: + - 直接调用 glm-ocr 的 layout_parsing API + - 支持本地文件路径或图片 URL + - 自动处理图片格式转换 + """ + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "glm-ocr", + timeout: int = 120, + config: Optional[NovaPdfConfig] = None, + ): + """ + 初始化 AI 服务 + + Args: + api_key: 智谱 API Key,默认从配置读取 + model: 模型名称,默认 glm-ocr + timeout: 请求超时时间(秒) + config: 配置对象 + """ + if ZhipuAiClient is None: + raise ImportError( + "zai-sdk is required for AIService. " + "Install with: pip install nova-pdf[zhipu]" + ) + + # 从配置加载 + if config: + self.api_key = api_key or config.zhipu_api_key + self.model = model or config.model + self.timeout = timeout or config.timeout + else: + config = NovaPdfConfig.load() + self.api_key = api_key or config.zhipu_api_key + self.model = model + self.timeout = timeout + + if not self.api_key: + raise ValueError( + "API key is required. 
Set NOVA_ZHIPU_API_KEY environment variable " + "or add 'api_key' to [tool.nova-pdf] in pyproject.toml" + ) + + # 初始化客户端 + self.client = ZhipuAiClient(api_key=self.api_key) + + def image_to_markdown( + self, + image_stream: BinaryIO, + filename: str = "page.png", + ) -> AIResult: + """ + 将图片转换为 Markdown + + Args: + image_stream: 图片流 + filename: 文件名(用于临时文件) + + Returns: + AIResult: 转换结果 + """ + try: + # 方案1:保存为临时文件,传文件路径 + with tempfile.NamedTemporaryFile( + suffix=".png", + delete=False + ) as tmp: + tmp.write(image_stream.read()) + tmp_path = tmp.name + + image_stream.seek(0) + + # 调用 glm-ocr API + response = self.client.layout_parsing.create( + model=self.model, + file=tmp_path + ) + + # 清理临时文件 + try: + os.unlink(tmp_path) + except Exception: + pass + + # 解析响应 + # 响应格式可能是字符串或对象,需要适配 + if hasattr(response, 'content'): + text = response.content + elif hasattr(response, 'text'): + text = response.text + elif isinstance(response, str): + text = response + else: + text = str(response) + + return AIResult( + text=text.strip() if text else "", + success=True, + ) + + except Exception as e: + return AIResult( + text="", + success=False, + error=str(e), + ) +``` + +### 5.3 插件注册 (_plugin.py) + +```python +"""Plugin registration for nova-pdf.""" + +from typing import Any +from markitdown import MarkItDown + +from ._config import NovaPdfConfig +from ._ai_service import AIService +from ._converter import NovaPdfConverter + + +__plugin_interface_version__ = 1 + + +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + """ + 注册 nova-pdf 转换器 + + 配置来源(优先级从高到低): + 1. kwargs 参数 + 2. 环境变量 + 3. 配置文件 (pyproject.toml) + 4. 
默认值 + """ + # 加载配置 + config = NovaPdfConfig.load() + + # kwargs 覆盖配置 + api_key = kwargs.get("api_key") or kwargs.get("zhipu_api_key") or config.zhipu_api_key + model = kwargs.get("model", config.model) + dpi = kwargs.get("dpi", config.dpi) + force_ai = kwargs.get("force_ai", config.force_ai) + timeout = kwargs.get("timeout", config.timeout) + + # 创建 AI 服务 + ai_service = None + if api_key: + try: + ai_service = AIService( + api_key=api_key, + model=model, + timeout=timeout, + ) + except Exception: + pass + + # 注册转换器 + PRIORITY_NOVA_PDF = -1.0 + + markitdown.register_converter( + NovaPdfConverter( + ai_service=ai_service, + dpi=dpi, + force_ai=force_ai, + ), + priority=PRIORITY_NOVA_PDF, + ) +``` + +### 5.4 pyproject.toml 更新 + +```toml +[project] +name = "nova-pdf" +dependencies = [ + "markitdown>=0.1.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", + "Pillow>=9.0.0", + "tomli>=2.0.0;python_version<'3.11'", +] + +[project.optional-dependencies] +zhipu = [ + "zai-sdk>=0.2.2", +] +dev = [ + "pytest>=7.0.0", +] + +[project.entry-points."markitdown.plugin"] +nova_pdf = "nova_pdf" + +[tool.nova-pdf] +# API 配置 +api_key = "" +model = "glm-ocr" +dpi = 150 +timeout = 120 +force_ai = false +``` + +## 6. 
配置方式 + +### 6.1 本地敏感配置文件(推荐) + +项目根目录下的 `.secrets.local` 文件存储敏感信息,此文件不会被提交到 Git: + +```bash +# .secrets.local +NOVA_ZHIPU_API_KEY="your-api-key-here" +``` + +使用方式: +```bash +# 加载敏感配置 +source .secrets.local + +# 或使用脚本 +source scripts/load_secrets.sh + +# 然后运行 +markitdown -p document.pdf +``` + +### 6.2 配置文件 (pyproject.toml) + +```toml +[tool.nova-pdf] +# API key 请通过环境变量或 .secrets.local 文件设置,不要硬编码 +api_key = "" +model = "glm-ocr" +dpi = 150 +timeout = 120 +``` + +### 6.3 环境变量(推荐) + +```bash +export NOVA_ZHIPU_API_KEY="your-api-key-here" +export NOVA_MODEL="glm-ocr" +export NOVA_DPI="150" +``` + +### 6.4 Python API + +```python +from markitdown import MarkItDown + +md = MarkItDown( + enable_plugins=True, + api_key="your-api-key", +) +``` + +### 6.5 命令行 + +```bash +export NOVA_ZHIPU_API_KEY="your-api-key" +markitdown -p document.pdf +``` + +## 7. 使用示例 + +```python +from markitdown import MarkItDown +from nova_pdf import AIService, NovaPdfConverter + +# 方式1:自动加载配置 +md = MarkItDown(enable_plugins=True) +result = md.convert("document.pdf") + +# 方式2:手动配置 +from nova_pdf import NovaPdfConfig, AIService + +config = NovaPdfConfig.load() +ai_service = AIService( + api_key="your-api-key", + model="glm-ocr", +) + +converter = NovaPdfConverter( + ai_service=ai_service, + dpi=150, +) + +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +``` + +## 8. 迁移路径 + +### 8.1 从旧版本迁移 + +| 旧配置 | 新配置 | +|--------|--------| +| `NOVA_UPLOAD_TOKEN` | `NOVA_ZHIPU_API_KEY` | +| `NOVA_WORKFLOW_TOKEN` | (删除) | +| `NOVA_BASE_URL` | (删除) | +| `NOVA_APP_ID` | (删除) | + +### 8.2 API 兼容性 + +- 旧版 `AIService(upload_token, workflow_token, ...)` → 废弃 +- 新版 `AIService(api_key, ...)` → 推荐 + +## 9. 
实施计划 + +### ✅ Phase 1: 核心实现(已完成) +- [x] 设计配置模块 +- [x] 实现 `_config.py` +- [x] 重写 `_ai_service.py`(使用 zai-sdk + glm-ocr) +- [x] 更新 `_plugin.py` + +### ✅ Phase 2: 集成测试(已完成) +- [x] 更新 `pyproject.toml` +- [x] 测试 glm-ocr API +- [x] 测试插件集成 + +### Phase 3: 文档更新(进行中) +- [x] 更新 README.md +- [ ] 更新技术方案文档 +- [ ] 添加迁移指南 + +## 10. 风险与缓解 + +| 风险 | 缓解措施 | +|------|----------| +| zai-sdk 接口变化 | 封装适配层,隔离 SDK 细节 | +| glm-ocr 返回格式不确定 | 做多种格式兼容处理 | +| 临时文件清理失败 | 使用 try-finally 确保清理 | +| API key 泄露 | 支持环境变量,避免硬编码 | + +## 11. 待确认事项 + +- [ ] glm-ocr 返回的具体数据结构 +- [ ] 是否支持直接传图片字节流(不保存临时文件) +- [ ] 超时和重试策略 +- [ ] 并发请求限制 diff --git a/docs/nova-pdf-technical-design.md b/docs/nova-pdf-technical-design.md new file mode 100644 index 000000000..25128e33a --- /dev/null +++ b/docs/nova-pdf-technical-design.md @@ -0,0 +1,1175 @@ +# Nova-PDF 插件技术方案 + +## 1. 概述 + +### 1.1 目标 +开发一个智能 PDF 解析插件 `nova-pdf`,实现: +- 自动检测 PDF 每页内容类型(纯文本 vs 包含图片/表格) +- 对纯文本页面使用默认解析能力(pdfminer/pdfplumber) +- 对包含图片/表格的页面截图后调用 AI 接口转 Markdown + +### 1.2 核心价值 +- **提升复杂 PDF 解析质量**:图表、扫描件等传统方法效果差的内容 +- **降低成本**:纯文本页面不调用 AI,节省 API 费用 +- **灵活配置**:支持自定义 AI 模型、分辨率、提示词等 + +--- + +## 2. 
架构设计 + +### 2.1 插件结构 +``` +packages/nova-pdf/ +├── src/ +│ └── nova_pdf/ +│ ├── __init__.py # 导出和版本信息 +│ ├── __about__.py # 版本号 +│ ├── _plugin.py # 插件注册入口 +│ ├── _converter.py # PDF 转换器核心实现 +│ ├── _page_analyzer.py # 页面内容分析器 +│ ├── _page_renderer.py # 页面截图渲染器 +│ └── _ai_service.py # AI 接口封装 +├── tests/ +│ ├── __init__.py +│ ├── test_converter.py +│ ├── test_analyzer.py +│ └── fixtures/ +│ ├── text_only.pdf +│ ├── with_images.pdf +│ └── mixed_content.pdf +├── pyproject.toml +└── README.md +``` + +### 2.2 组件职责 + +| 组件 | 职责 | +|------|------| +| `_plugin.py` | 实现 `register_converters` 入口,注册转换器 | +| `_converter.py` | 继承 `DocumentConverter`,协调整体流程 | +| `_page_analyzer.py` | 分析页面是否包含图片/表格 | +| `_page_renderer.py` | 将 PDF 页面渲染为图片 | +| `_ai_service.py` | 调用 AI Vision API 转换图片为 Markdown | + +### 2.3 流程图 + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ PDF 文件输入 │ +└──────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ 逐页分析 (PageAnalyzer) │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ 对每一页: │ │ +│ │ 1. 检测是否包含图片 (images) │ │ +│ │ 2. 检测是否包含表格 (tables) │ │ +│ │ 3. 标记页面类型: PLAIN_TEXT / COMPLEX │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────┘ + │ + ┌─────────────────────┴─────────────────────┐ + ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ +│ PLAIN_TEXT 页面 │ │ COMPLEX 页面 │ +│ │ │ │ +│ 使用默认解析: │ │ 1. 截图渲染 │ +│ - pdfplumber 提取 │ │ 2. 调用 AI 接口 │ +│ - pdfminer 备用 │ │ 3. 转换为 Markdown │ +└─────────────────────┘ └─────────────────────┘ + │ │ + └─────────────────────┬─────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ 合并所有页面结果 │ +│ 输出完整 Markdown │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 3. 
核心算法设计 + +### 3.1 页面内容检测 (PageAnalyzer) + +#### 检测策略 +```python +class PageType(Enum): + PLAIN_TEXT = "plain_text" # 纯文本,使用默认解析 + HAS_IMAGES = "has_images" # 包含图片 + HAS_TABLES = "has_tables" # 包含表格 + COMPLEX = "complex" # 复杂内容(图片+表格+混合) +``` + +#### 图片检测方法 +使用 **pdfplumber** 的页面对象检测: + +```python +def detect_images(page) -> bool: + """检测页面是否包含图片""" + # 方法1: 直接检测 page.images + if hasattr(page, 'images') and len(page.images) > 0: + return True + + # 方法2: 检测页面对象中的图像资源 + if hasattr(page, 'objects'): + if 'image' in page.objects and len(page.objects['image']) > 0: + return True + # 检测 XObject (可能包含内嵌图像) + if 'xobject' in page.objects and len(page.objects['xobject']) > 0: + for obj in page.objects['xobject']: + if obj.get('subtype') == 'Image': + return True + + # 方法3: 检测页面资源字典 + try: + if hasattr(page.page, 'get_resources'): + resources = page.page.get_resources() + if resources and 'XObject' in resources: + return True + except Exception: + pass + + return False +``` + +#### 表格检测方法 +```python +def detect_tables(page) -> bool: + """检测页面是否包含表格""" + # 方法1: 使用 pdfplumber 的 extract_tables + tables = page.extract_tables() + if tables and len(tables) > 0: + # 过滤空表格 + for table in tables: + if table and any(any(cell for cell in row) for row in table): + return True + + # 方法2: 检测表格线(边框线) + if hasattr(page, 'objects') and 'line' in page.objects: + lines = page.objects['line'] + if len(lines) > 10: # 大量线条可能构成表格 + # 分析线条是否形成网格结构 + h_lines = [l for l in lines if l.get('height', 1) < 2] + v_lines = [l for l in lines if l.get('width', 1) < 2] + if len(h_lines) > 2 and len(v_lines) > 2: + return True + + return False +``` + +#### 综合判断 +```python +def analyze_page(page) -> PageType: + """分析页面类型""" + has_images = detect_images(page) + has_tables = detect_tables(page) + + if has_images and has_tables: + return PageType.COMPLEX + elif has_images: + return PageType.HAS_IMAGES + elif has_tables: + return PageType.HAS_TABLES + else: + return PageType.PLAIN_TEXT +``` + +### 3.2 页面截图渲染 
(PageRenderer) + +#### 技术选型 + +使用 **pdfplumber.to_image**,理由: +- 已是项目依赖,无需额外安装 +- 实现简单,代码量少 +- 底层使用 PIL,满足需求 + +#### 实现方案 +```python +import io + +def render_page_to_image(page, dpi: int = 150) -> io.BytesIO: + """ + 将 PDF 页面渲染为图片 + + Args: + page: pdfplumber 页面对象 + dpi: 渲染分辨率,默认 150(平衡质量和速度) + + Returns: + BytesIO: PNG 图片流 + """ + # 使用 pdfplumber 的 to_image 方法 + page_image = page.to_image(resolution=dpi) + + # 转换为 BytesIO + img_stream = io.BytesIO() + page_image.original.save(img_stream, format="PNG") + img_stream.seek(0) + + return img_stream +``` + +#### DPI 推荐值 +```python +DPI_SETTINGS = { + "low": 72, # 快速预览,文件小 + "medium": 150, # 平衡质量和速度(默认) + "high": 300, # 高质量,适合复杂图表 +} +``` + +### 3.3 AI 接口调用 (AIService) + +#### 复用 markitdown 的 LLM 客户端机制 +```python +from markitdown.converters._llm_caption import llm_caption + +class AIService: + """AI Vision 服务封装""" + + def __init__( + self, + client, # OpenAI 兼容客户端 + model: str = "gpt-4o", # 模型名称 + prompt: str | None = None, # 自定义提示词 + ): + self.client = client + self.model = model + self.prompt = prompt or self._default_prompt() + + def _default_prompt(self) -> str: + return """请将这张图片的内容转换为 Markdown 格式。 + +要求: +1. 保持原有的文档结构(标题、段落、列表等) +2. 表格使用 Markdown 表格语法 +3. 图片中的文字清晰转写 +4. 数学公式使用 LaTeX 语法 +5. 如有图表,用文字描述其内容 +6. 不要添加任何额外的解释或评论""" + + def image_to_markdown( + self, + image_stream: io.BytesIO, + stream_info: StreamInfo, + ) -> str: + """调用 AI 将图片转为 Markdown""" + result = llm_caption( + image_stream, + stream_info, + client=self.client, + model=self.model, + prompt=self.prompt, + ) + return result or "" +``` + +--- + +## 4. 
转换器实现 (_converter.py) + +### 4.1 核心流程 +```python +class NovaPdfConverter(DocumentConverter): + """智能 PDF 转换器""" + + def __init__( + self, + ai_service: AIService | None = None, + dpi: int = 150, + force_ai: bool = False, # 强制所有页面使用 AI + ): + self.ai_service = ai_service + self.dpi = dpi + self.force_ai = force_ai + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + # 读取 PDF + pdf_stream = io.BytesIO(file_stream.read()) + + markdown_parts = [] + + with pdfplumber.open(pdf_stream) as pdf: + for page_num, page in enumerate(pdf.pages): + # 分析页面类型 + page_type = analyze_page(page) + + # 根据类型选择处理方式 + if self.force_ai or page_type != PageType.PLAIN_TEXT: + # 复杂内容:截图 + AI + if self.ai_service: + img = render_page_to_image(page, self.dpi) + md = self.ai_service.image_to_markdown(img, StreamInfo()) + else: + # 无 AI 服务,回退到默认解析 + md = page.extract_text() or "" + else: + # 纯文本:默认解析 + md = page.extract_text() or "" + + if md.strip(): + markdown_parts.append(f"## Page {page_num + 1}\n\n{md}") + + return DocumentConverterResult( + markdown="\n\n".join(markdown_parts), + ) +``` + +--- + +## 5. 
配置选项 + +### 5.1 初始化参数 +```python +class NovaPdfConfig: + """nova-pdf 配置""" + + # AI 服务配置 + llm_client: Any = None # OpenAI 兼容客户端(必需) + llm_model: str = "gpt-4o" # 模型名称 + llm_prompt: str | None = None # 自定义提示词 + + # 渲染配置 + dpi: int = 150 # 截图分辨率 + image_format: str = "png" # 图片格式 + + # 处理策略 + force_ai: bool = False # 强制所有页面使用 AI + skip_tables: bool = False # 跳过表格检测(表格用默认解析) + skip_images: bool = False # 跳过图片检测(图片用默认解析) + + # 性能配置 + max_concurrent: int = 5 # 并发请求数 + timeout: int = 60 # 单页 AI 调用超时(秒) +``` + +### 5.2 使用示例 +```python +from openai import OpenAI +from markitdown import MarkItDown + +# 初始化 LLM 客户端 +client = OpenAI(api_key="your-api-key") + +# 创建 MarkItDown 实例并启用 nova-pdf 插件 +md = MarkItDown( + enable_plugins=True, + llm_client=client, + llm_model="gpt-4o", +) + +# 转换 PDF +result = md.convert("complex_document.pdf") +print(result.markdown) +``` + +--- + +## 6. 依赖管理 + +### 6.1 pyproject.toml +```toml +[project] +name = "nova-pdf" +dependencies = [ + "markitdown>=0.1.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", # 页面解析和截图渲染 + "Pillow>=9.0.0", # 图像处理(pdfplumber.to_image 底层依赖) +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", +] + +# 插件入口点 +[project.entry-points."markitdown.plugin"] +nova_pdf = "nova_pdf" +``` + +--- + +## 7. 
错误处理 + +### 7.1 降级策略 +```python +def convert_with_fallback( + self, + pdf_bytes: bytes, + page_num: int, + page_type: PageType, +) -> str: + """带降级的转换""" + + # 尝试 AI 转换 + if self.ai_service and page_type != PageType.PLAIN_TEXT: + try: + img = render_page_to_image(pdf_bytes, page_num, self.dpi) + result = self.ai_service.image_to_markdown(img, StreamInfo()) + if result.strip(): + return result + except AIServiceError as e: + logger.warning(f"AI 转换失败,降级到默认解析: {e}") + + # 降级到默认解析 + with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + page = pdf.pages[page_num] + text = page.extract_text() or "" + + # 尝试提取表格 + tables = page.extract_tables() + if tables: + for table in tables: + text += "\n\n" + self._table_to_markdown(table) + + return text +``` + +--- + +## 8. 性能优化 + +### 8.1 异步处理 +```python +import asyncio +from typing import List + +async def convert_pages_async( + self, + pdf_bytes: bytes, + pages: List[PageInfo], +) -> List[str]: + """异步并发处理多页""" + + async def process_page(page_info: PageInfo) -> str: + if page_info.type == PageType.PLAIN_TEXT: + return self._extract_text(pdf_bytes, page_info.num) + else: + return await self._ai_convert_async(pdf_bytes, page_info.num) + + # 使用信号量限制并发 + semaphore = asyncio.Semaphore(self.max_concurrent) + + async def limited_process(page_info): + async with semaphore: + return await process_page(page_info) + + tasks = [limited_process(p) for p in pages] + return await asyncio.gather(*tasks) +``` + +### 8.2 缓存机制 +```python +from functools import lru_cache +import hashlib + +class CachedAIService(AIService): + """带缓存的 AI 服务""" + + @lru_cache(maxsize=100) + def _get_cache_key(self, image_hash: str) -> str | None: + """获取缓存结果""" + # 可接入 Redis 等 + pass + + def image_to_markdown(self, image_stream: io.BytesIO, ...) 
-> str: + # 计算图片哈希 + image_hash = hashlib.md5(image_stream.read()).hexdigest() + image_stream.seek(0) + + # 检查缓存 + cached = self._get_cache_key(image_hash) + if cached: + return cached + + # 调用 AI + result = super().image_to_markdown(image_stream, ...) + + # 存入缓存 + self._cache_result(image_hash, result) + return result +``` + +--- + +## 9. 测试策略 + +### 9.1 测试用例设计 +```python +class TestNovaPdfConverter: + """nova-pdf 转换器测试""" + + def test_plain_text_pdf(self): + """纯文本 PDF 应使用默认解析""" + pass + + def test_pdf_with_images(self): + """包含图片的 PDF 应调用 AI""" + pass + + def test_pdf_with_tables(self): + """包含表格的 PDF 应调用 AI""" + pass + + def test_mixed_content_pdf(self): + """混合内容应正确区分处理""" + pass + + def test_ai_service_fallback(self): + """AI 服务失败时应降级""" + pass + + def test_dpi_settings(self): + """不同 DPI 设置的渲染质量""" + pass + + def test_concurrent_processing(self): + """并发处理性能测试""" + pass +``` + +--- + +## 10. 扩展性设计 + +### 10.1 自定义页面分析器 +```python +class PageAnalyzerPlugin(ABC): + """页面分析器插件接口""" + + @abstractmethod + def analyze(self, page) -> PageType: + """分析页面类型""" + pass + +# 允许用户注入自定义分析器 +class NovaPdfConverter(DocumentConverter): + def __init__( + self, + page_analyzer: PageAnalyzerPlugin | None = None, + ... + ): + self.page_analyzer = page_analyzer or DefaultPageAnalyzer() +``` + +### 10.2 自定义 AI 提示词模板 +```python +PROMPT_TEMPLATES = { + "default": "...", + "academic": "学术论文模板...", + "financial": "财务报表模板...", + "legal": "法律文档模板...", +} + +class AIService: + def __init__(self, prompt_template: str = "default", ...): + self.prompt = PROMPT_TEMPLATES.get(prompt_template, PROMPT_TEMPLATES["default"]) +``` + +--- + +## 11. 风险与缓解措施 + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| AI API 调用失败 | 转换中断 | 实现降级策略,回退到默认解析 | +| 大文件内存溢出 | 程序崩溃 | 分页处理,控制内存占用 | +| AI 响应慢 | 用户体验差 | 异步处理、进度反馈、超时控制 | +| 解析质量不稳定 | 输出错误 | 多模型对比、人工审核机制 | +| API 费用过高 | 成本失控 | 智能跳过纯文本页面、缓存机制 | + +--- + +## 12. 
实施计划 + +### ✅ Phase 1: 基础框架(已完成) +- [x] 创建项目结构 +- [x] 实现插件注册入口 +- [x] 实现基础转换器框架 + +### ✅ Phase 2: 核心功能(已完成) +- [x] 实现页面内容检测 (`_page_analyzer.py`) +- [x] 实现页面截图渲染 (`_page_renderer.py`) +- [x] 实现 AI 服务接口 (`_ai_service.py`) +- [x] 实现完整转换流程 (`_converter.py`) + +### ⏳ Phase 3: 测试与优化(待进行) +- [ ] 运行单元测试 +- [ ] 添加测试 PDF 样本 +- [ ] 性能测试和优化 + +### ⏳ Phase 4: 文档与发布(待进行) +- [x] 编写 README 和使用文档 +- [x] 准备示例代码 +- [ ] 打包发布 + +--- + +## 代码结构 + +``` +packages/nova-pdf/ +├── src/nova_pdf/ +│ ├── __about__.py # 版本号 (0.1.0) +│ ├── __init__.py # 导出 register_converters +│ ├── _plugin.py # 插件注册入口 +│ ├── _converter.py # PDF 转换器核心 +│ ├── _page_analyzer.py # 图片/表格检测 +│ ├── _page_renderer.py # 页面截图 (pdfplumber.to_image) +│ └── _ai_service.py # AI 接口封装(两步上传) +├── tests/ +│ ├── test_analyzer.py # 分析器测试 +│ ├── test_converter.py # 转换器测试 +│ └── test_ai_service.py # AI 服务测试 +├── pyproject.toml # 项目配置 + nova-pdf 配置 +└── README.md # 使用文档 +``` + +**语法验证**: ✓ 所有 Python 文件通过语法检查 + +--- + +## 15. 改造完成总结 + +### 15.1 主要变更 + +| 文件 | 变更内容 | +|------|----------| +| `_ai_service.py` | 重写为两步调用:上传 → Workflow | +| `_plugin.py` | 适配新 AIService 初始化参数 | +| `_converter.py` | 传递文件名给 AI 服务 | +| `pyproject.toml` | 添加 `[tool.nova-pdf]` 配置段 | +| `README.md` | 更新环境变量和配置说明 | +| `tests/test_ai_service.py` | 新增 AI 服务测试(13 个用例)| + +### 15.2 环境变量 + +```bash +export NOVA_UPLOAD_TOKEN="your-fastgpt-token" # 必需 +export NOVA_WORKFLOW_TOKEN="your-workflow-token" # 必需 +export NOVA_BASE_URL="https://xny-test.glodon.com/jsf-ai" # 可选 +export NOVA_APP_ID="69fc37113fedac1eaaf65c82" # 可选 +``` + +### 15.3 快速开始 + +```python +from markitdown import MarkItDown + +# 启用插件 +md = MarkItDown(enable_plugins=True) + +# 转换 PDF(复杂页面自动调用 AI) +result = md.convert("document.pdf") +print(result.markdown) +``` + +### 15.4 实测结果 + +**测试图片**: `数位顺序表.png` (22KB) + +**测试结果**: ✓ 成功转换 + +```markdown +| | 整数部分 | | | | | | | 小数部分 | | | | | +|:---:|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---| +| 数位 | ...... | 万位 | 千位 | 百位 | 十位 | 个位 | . 
| 十分位 | 百分位 | 千分位 | 万分位 | ...... | +| 单位 | ...... | 万 | 千 | 百 | 十 | 个 | | 十分之一 0.1 | 百分之一 0.01 | 千分之一 0.001 | 万分之一 0.0001 | ...... | +``` + +**关键修正**: +1. 上传接口返回 `code: 200`(不是 0) +2. Workflow 接口需要 `messages` 字段(OpenAI 兼容格式) +3. SSL 验证跳过(`verify=False`)以适配内部 API + +--- + +## 13. 附录 + +### 13.1 参考实现 +- `markitdown-ocr`: 已有的 OCR 插件,可参考架构 +- `markitdown-sample-plugin`: 官方插件示例 +- `_pdf_converter.py`: 默认 PDF 转换器实现 + +### 13.2 关键代码参考 +```python +# 参考 markitdown-ocr 的插件注册方式 +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + PRIORITY_NOVA_PDF = -1.0 # 优先于默认 PDF 转换器 + + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model", "gpt-4o") + + ai_service = None + if llm_client: + ai_service = AIService(client=llm_client, model=llm_model) + + markitdown.register_converter( + NovaPdfConverter(ai_service=ai_service), + priority=PRIORITY_NOVA_PDF, + ) + +# 页面截图渲染(简化版) +def render_page_to_image(page, dpi: int = 150) -> io.BytesIO: + """使用 pdfplumber.to_image 渲染页面""" + page_image = page.to_image(resolution=dpi) + img_stream = io.BytesIO() + page_image.original.save(img_stream, format="PNG") + img_stream.seek(0) + return img_stream +``` + +--- + +## 14. AI 接口改造方案(自定义两步调用) + +### 14.1 背景 + +原方案使用 OpenAI 兼容的 base64 图片上传方式,现需改造为自定义两步流程: +1. 上传图片到文件服务,获取 URL +2. 
调用 Workflow 接口处理图片 + +### 14.2 接口分析 + +#### Step 1: 文件上传接口 + +**请求** +``` +POST https://xny-test.glodon.com/jsf-ai/api/common/file/upload +Content-Type: multipart/form-data +Cookie: fastgpt_token= +``` + +**表单参数** +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| metadata | string | ✓ | JSON 字符串,如 `{"chatId":""}`,每次动态生成 | +| bucketName | string | ✓ | 固定值 `chat` | +| file | binary | ✓ | 图片文件(PNG/JPEG) | +| data | string | ✓ | JSON 字符串,如 `{"appId":"69fc37113fedac1eaaf65c82"}` | + +**响应示例** +```json +{ + "code": 200, + "data": { + "previewUrl": "https://xny-test.glodon.com/jsf-ai/api/common/file/read/xxx.png?token=...", + "fileId": "69fc42e024457b47b7e22b4a" + } +} +``` + +> 注意:接口返回 `code: 200` 表示成功(不是 0) + +#### Step 2: Workflow 调用接口 + +**请求** +``` +POST https://xny-test.glodon.com/jsf-ai/api/v1/chat/completions +Content-Type: application/json +Authorization: Bearer +``` + +**请求体**(OpenAI 兼容格式) +```json +{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请将这张图片的内容转换为 Markdown 格式。"}, + {"type": "image_url", "image_url": {"url": "https://...previewUrl..."}} + ] + } + ] +} +``` + +**响应示例**(OpenAI 兼容格式) +```json +{ + "id": "", + "model": "", + "choices": [ + { + "message": { + "role": "assistant", + "content": "| 数位顺序表 |\n|---|" + } + } + ] +} +``` + +> 注意:Workflow 接口使用 OpenAI 兼容的消息格式,需要 `messages` 字段 + +### 14.3 改造后的 AIService + +```python +"""AI service with custom two-step API calls.""" + +import io +import json +import requests +from dataclasses import dataclass +from typing import Any, BinaryIO, Optional + + +@dataclass +class AIResult: + """Result from AI conversion.""" + text: str + success: bool = True + error: Optional[str] = None + + +class AIService: + """ + AI 服务 - 自定义两步调用方式 + + 流程: + 1. 上传图片到文件服务,获取 previewUrl + 2. 
调用 Workflow 接口,传入 fileUrls 参数 + """ + + def __init__( + self, + base_url: str = "https://xny-test.glodon.com/jsf-ai", + upload_token: str = "", # fastgpt_token (Cookie) + workflow_token: str = "", # workflow_image2markdown_key (Authorization) + chat_id: str = "", # 用于上传接口的 chatId + app_id: str = "", # 用于上传接口的 appId + timeout: int = 60, + ): + """ + 初始化 AI 服务 + + Args: + base_url: API 基础地址 + upload_token: 文件上传认证 token(fastgpt_token) + workflow_token: Workflow 接口认证 token + chat_id: 会话 ID + app_id: 应用 ID + timeout: 请求超时时间(秒) + """ + self.base_url = base_url.rstrip("/") + self.upload_token = upload_token + self.workflow_token = workflow_token + self.chat_id = chat_id + self.app_id = app_id + self.timeout = timeout + + def image_to_markdown( + self, + image_stream: BinaryIO, + filename: str = "page.png", + ) -> AIResult: + """ + 将图片转换为 Markdown(两步调用) + + Args: + image_stream: 图片流 + filename: 文件名 + + Returns: + AIResult: 转换结果 + """ + try: + # Step 1: 上传图片 + upload_result = self._upload_file(image_stream, filename) + if not upload_result["success"]: + return AIResult( + text="", + success=False, + error=f"Upload failed: {upload_result.get('error')}" + ) + + file_url = upload_result["preview_url"] + + # Step 2: 调用 Workflow + workflow_result = self._call_workflow(file_url) + if not workflow_result["success"]: + return AIResult( + text="", + success=False, + error=f"Workflow failed: {workflow_result.get('error')}" + ) + + return AIResult( + text=workflow_result["text"], + success=True, + ) + + except Exception as e: + return AIResult( + text="", + success=False, + error=str(e), + ) + + def _upload_file( + self, + image_stream: BinaryIO, + filename: str, + ) -> dict: + """ + 上传文件到文件服务 + + Args: + image_stream: 图片流 + filename: 文件名 + + Returns: + dict: {"success": bool, "preview_url": str, "error": str} + """ + url = f"{self.base_url}/api/common/file/upload" + + # 准备 multipart/form-data + files = { + "file": (filename, image_stream, "image/png") + } + + data = { + "metadata": 
json.dumps({"chatId": self.chat_id}), + "bucketName": "chat", + "data": json.dumps({"appId": self.app_id}), + } + + headers = { + "Cookie": f"fastgpt_token={self.upload_token}", + } + + try: + response = requests.post( + url, + files=files, + data=data, + headers=headers, + timeout=self.timeout, + ) + response.raise_for_status() + + result = response.json() + + if result.get("code") == 200 and result.get("data", {}).get("previewUrl"): + return { + "success": True, + "preview_url": result["data"]["previewUrl"], + } + else: + return { + "success": False, + "error": result.get("message", "Unknown error"), + } + + except requests.RequestException as e: + return { + "success": False, + "error": str(e), + } + + def _call_workflow(self, file_url: str) -> dict: + """ + 调用 Workflow 接口处理图片 + + Args: + file_url: 文件 URL + + Returns: + dict: {"success": bool, "text": str, "error": str} + """ + url = f"{self.base_url}/api/v1/chat/completions" + + headers = { + "Authorization": f"Bearer {self.workflow_token}", + "Content-Type": "application/json", + } + + payload = { + "fileUrls": [file_url], + } + + try: + response = requests.post( + url, + json=payload, + headers=headers, + timeout=self.timeout, + ) + response.raise_for_status() + + result = response.json() + + # 解析 OpenAI 兼容响应格式 + choices = result.get("choices", []) + if choices: + content = choices[0].get("message", {}).get("content", "") + return { + "success": True, + "text": content.strip(), + } + else: + return { + "success": False, + "error": "No response content", + } + + except requests.RequestException as e: + return { + "success": False, + "error": str(e), + } +``` + +### 14.4 使用示例 + +```python +from markitdown import MarkItDown +from nova_pdf import AIService, NovaPdfConverter + +# 创建自定义 AI 服务 +ai_service = AIService( + base_url="https://xny-test.glodon.com/jsf-ai", + upload_token="", # fastgpt_token + workflow_token="your-workflow-token", + chat_id="tv1cyJFTt4wEKLqTKEx1KPEN", + app_id="69fc37113fedac1eaaf65c82", + 
timeout=120, +) + +# 创建转换器 +converter = NovaPdfConverter( + ai_service=ai_service, + dpi=150, +) + +# 手动注册 +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) + +# 转换 PDF +result = md.convert("document.pdf") +print(result.markdown) +``` + +### 14.5 配置参数说明 + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `base_url` | str | ✓ | API 基础地址 | +| `upload_token` | str | ✓ | 文件上传认证 token(fastgpt_token) | +| `workflow_token` | str | ✓ | Workflow 接口认证 token | +| `chat_id` | str | ✓ | 会话 ID(用于上传接口) | +| `app_id` | str | ✓ | 应用 ID(用于上传接口) | +| `timeout` | int | | 超时时间,默认 60 秒 | + +### 14.6 错误处理 + +```python +def image_to_markdown(self, image_stream, filename="page.png") -> AIResult: + """带完善错误处理的转换""" + try: + # Step 1: 上传 + upload_result = self._upload_file(image_stream, filename) + if not upload_result["success"]: + # 上传失败,返回详细错误 + return AIResult( + text="", + success=False, + error=f"上传失败: {upload_result.get('error')}" + ) + + # Step 2: Workflow + workflow_result = self._call_workflow(upload_result["preview_url"]) + if not workflow_result["success"]: + # Workflow 失败,返回详细错误 + return AIResult( + text="", + success=False, + error=f"AI 处理失败: {workflow_result.get('error')}" + ) + + return AIResult( + text=workflow_result["text"], + success=True, + ) + + except requests.Timeout: + return AIResult( + text="", + success=False, + error="请求超时,请检查网络或增加 timeout 设置" + ) + except requests.ConnectionError: + return AIResult( + text="", + success=False, + error="网络连接失败,请检查网络设置" + ) + except json.JSONDecodeError: + return AIResult( + text="", + success=False, + error="响应解析失败,接口返回非 JSON 格式" + ) + except Exception as e: + return AIResult( + text="", + success=False, + error=f"未知错误: {str(e)}" + ) +``` + +### 14.7 与原方案的对比 + +| 对比项 | 原方案(base64) | 新方案(两步上传) | +|--------|-----------------|-------------------| +| 图片传输 | base64 内嵌 | URL 引用 | +| 请求大小 | 大(含图片数据) | 小(仅 URL) | +| 适用场景 | 小图片 | 大图片、多图片 | +| 依赖 | OpenAI SDK | requests | +| 认证方式 | API Key | 
Token + Cookie | +| 接口格式 | OpenAI 标准 | 自定义 | + +### 14.8 配置确认 + +- [x] ~~`chat_id` 是否需要每次动态生成?~~ **是的,每次生成 UUID** +- [x] ~~`app_id` 是否固定?~~ **是的,固定值** +- [x] ~~`workflow_image2markdown_key` 如何获取?~~ **在 pyproject.toml 中配置** +- [x] ~~是否需要支持并发上传?~~ **否** + +### 14.9 配置文件设计 + +**pyproject.toml 新增配置项** +```toml +[project.optional-dependencies] +nova-api = [ + "requests>=2.28.0", +] + +[tool.nova-pdf] +# AI 服务配置 +base_url = "https://xny-test.glodon.com/jsf-ai" +app_id = "69fc37113fedac1eaaf65c82" +timeout = 120 + +# 认证配置(建议通过环境变量覆盖) +# upload_token = "" # 环境变量: NOVA_UPLOAD_TOKEN +# workflow_token = "" # 环境变量: NOVA_WORKFLOW_TOKEN +``` + +**环境变量** +- `NOVA_UPLOAD_TOKEN`: 上传接口认证 token (fastgpt_token) +- `NOVA_WORKFLOW_TOKEN`: Workflow 接口认证 token +- `NOVA_BASE_URL`: API 基础地址(可选,覆盖配置文件) +- `NOVA_APP_ID`: 应用 ID(可选,覆盖配置文件) diff --git a/docs/spec.md b/docs/spec.md new file mode 100644 index 000000000..660e90a15 --- /dev/null +++ b/docs/spec.md @@ -0,0 +1,31 @@ +# 目标 +重构调用ai接口解析PDF的功能:对包含图片/表格的页面截图后调用 AI 接口转 Markdown + +# 技术要求 +使用glm-ocr能力,zai-sdk,如下 + +# 关键信息:api key:528b833ddafd74f7ce6d32f6d1e3b39e.yLrspX8jiUwh5BGd 需要从配置文件读取 + +# 安装最新版本 +pip install zai-sdk +# 或指定版本 +pip install zai-sdk==0.2.2 +from zai import ZhipuAiClient + +# 初始化客户端 +client = ZhipuAiClient(api_key="your-api-key") + +image_url = "https://cdn.bigmodel.cn/static/logo/introduction.png" + +# 调用布局解析 API +response = client.layout_parsing.create( + model="glm-ocr", + file=image_url +) + +# 输出结果 +print(response) + +详细文档:https://docs.bigmodel.cn/cn/guide/models/vlm/glm-ocr#python + +先设计重构方案 \ No newline at end of file diff --git a/packages/nova-pdf/README.md b/packages/nova-pdf/README.md new file mode 100644 index 000000000..969e687d5 --- /dev/null +++ b/packages/nova-pdf/README.md @@ -0,0 +1,166 @@ +# Nova-PDF + +智能 PDF 转 Markdown 插件,使用 glm-ocr AI 驱动的图片和表格提取。 + +## 特性 + +- 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格) +- 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低 +- 🤖 **AI 增强**:复杂页面(图片、表格)使用 glm-ocr 转换为 Markdown +- ⚙️ 
**灵活配置**:支持配置文件、环境变量等多种配置方式 + +## 安装 + +```bash +# 基础安装 +pip install nova-pdf + +# 安装 AI 功能 +pip install nova-pdf[zhipu] +``` + +## 配置 + +### 本地敏感配置(推荐) + +项目根目录的 `.secrets.local` 文件存储敏感信息,此文件不会被提交到 Git: + +```bash +# 创建 .secrets.local 文件 +echo 'NOVA_ZHIPU_API_KEY="your-api-key"' > .secrets.local + +# 加载配置 +source .secrets.local +``` + +### 环境变量 + +```bash +# 必需 +export NOVA_ZHIPU_API_KEY="your-zhipu-api-key" + +# 可选 +export NOVA_MODEL="glm-ocr" +export NOVA_DPI="150" +export NOVA_TIMEOUT="120" +``` + +### 配置文件 + +在 `pyproject.toml` 中配置默认值: + +```toml +[tool.nova-pdf] +model = "glm-ocr" +dpi = 150 +timeout = 120 +force_ai = false +``` + +## 使用方法 + +### 命令行(推荐) + +```bash +# 1. 加载敏感配置 +source .secrets.local + +# 2. 查看已安装插件 +markitdown --list-plugins + +# 3. 使用插件转换 PDF +markitdown -p document.pdf + +# 4. 保存到文件 +markitdown -p document.pdf -o output.md +``` + +### Python API + +```python +from markitdown import MarkItDown + +# 方式1:自动加载配置 +md = MarkItDown(enable_plugins=True) +result = md.convert("document.pdf") +print(result.markdown) + +# 方式2:手动配置 +from nova_pdf import NovaPdfConfig, AIService, NovaPdfConverter + +config = NovaPdfConfig.load() +ai_service = AIService( + api_key="your-api-key", + model="glm-ocr", +) + +converter = NovaPdfConverter( + ai_service=ai_service, + dpi=150, +) + +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +``` + +## 配置选项 + +### NovaPdfConfig 参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `zhipu_api_key` | str | 环境变量 `NOVA_ZHIPU_API_KEY` | 智谱 API Key | +| `model` | str | "glm-ocr" | 模型名称 | +| `dpi` | int | 150 | 截图分辨率 | +| `timeout` | int | 120 | 请求超时(秒) | +| `force_ai` | bool | False | 强制所有页面使用 AI | + +### NovaPdfConverter 参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `ai_service` | AIService | None | AI 服务实例 | +| `dpi` | int | 150 | 截图分辨率 | +| `force_ai` | bool | False | 强制所有页面使用 AI | + +## 工作原理 + +``` +PDF 输入 + │ + ▼ 
+逐页分析内容类型 + │ + ├─ 纯文本页面 ──► pdfplumber 提取文本 + │ + └─ 复杂页面(图片/表格) + │ + ├─ 截图渲染 (150 DPI) + │ + ├─ base64 编码 + │ + └─ 调用 glm-ocr API 转 Markdown + │ + ▼ +合并输出完整 Markdown +``` + +## 技术架构 + +- **zai-sdk**: 智谱 AI 官方 SDK +- **glm-ocr**: 智谱 OCR 模型,支持表格、图片识别 +- **pdfplumber**: PDF 页面分析和截图 +- **pdfminer**: 纯文本页面提取 + +## 依赖 + +- `markitdown>=0.1.0` - 基础框架 +- `pdfplumber>=0.11.9` - PDF 解析和截图 +- `pdfminer.six>=20251230` - 文本提取备用 +- `Pillow>=9.0.0` - 图像处理 +- `zai-sdk>=0.2.2` - 智谱 AI SDK(可选,AI 功能需要) + +## 许可证 + +MIT diff --git a/packages/nova-pdf/pyproject.toml b/packages/nova-pdf/pyproject.toml new file mode 100644 index 000000000..f21aedf0c --- /dev/null +++ b/packages/nova-pdf/pyproject.toml @@ -0,0 +1,74 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "nova-pdf" +dynamic = ["version"] +description = "Intelligent PDF to Markdown converter with AI-powered image/table extraction" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +keywords = ["markitdown", "pdf", "ocr", "ai", "llm", "vision", "glm-ocr"] +authors = [ + { name = "Contributors", email = "noreply@github.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +dependencies = [ + "markitdown>=0.1.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", + "Pillow>=9.0.0", + "tomli>=2.0.0;python_version<'3.11'", +] + +[project.optional-dependencies] +zhipu = [ + "zai-sdk>=0.2.2", +] +dev = [ + "pytest>=7.0.0", +] + +[project.urls] +Documentation = "https://github.com/microsoft/markitdown#readme" +Issues = "https://github.com/microsoft/markitdown/issues" +Source = "https://github.com/microsoft/markitdown" + +[tool.hatch.version] +path = "src/nova_pdf/__about__.py" + +# Plugin entry point - MarkItDown will discover this plugin 
+[project.entry-points."markitdown.plugin"] +nova_pdf = "nova_pdf" + +[tool.hatch.build.targets.sdist] +only-include = ["src/nova_pdf"] + +[tool.hatch.build.targets.wheel] +packages = ["src/nova_pdf"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] + +# Nova-PDF configuration +[tool.nova-pdf] +# API key - set via environment variable NOVA_ZHIPU_API_KEY +api_key = "" +model = "glm-ocr" +dpi = 150 +timeout = 120 +force_ai = false + +# Legacy config (deprecated, will be removed) +workflow_image2markdown_key = "" +fastgpt_token = "" diff --git a/packages/nova-pdf/src/nova_pdf/__about__.py b/packages/nova-pdf/src/nova_pdf/__about__.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/__about__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/packages/nova-pdf/src/nova_pdf/__init__.py b/packages/nova-pdf/src/nova_pdf/__init__.py new file mode 100644 index 000000000..ce059c499 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/__init__.py @@ -0,0 +1,13 @@ +from ._plugin import register_converters +from ._config import NovaPdfConfig +from ._ai_service import AIService, AIResult +from ._converter import NovaPdfConverter + +__plugin_interface_version__ = 1 +__all__ = [ + "register_converters", + "NovaPdfConfig", + "AIService", + "AIResult", + "NovaPdfConverter", +] \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/_ai_service.py b/packages/nova-pdf/src/nova_pdf/_ai_service.py new file mode 100644 index 000000000..2451f9955 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_ai_service.py @@ -0,0 +1,202 @@ +"""AI service using zai-sdk and glm-ocr.""" + +import base64 +import re +from dataclasses import dataclass +from typing import BinaryIO, Optional + +from ._config import NovaPdfConfig + +try: + from zai import ZhipuAiClient +except ImportError: + ZhipuAiClient = None + + +@dataclass +class AIResult: + """Result from AI conversion.""" + text: str + success: bool = 
True + error: Optional[str] = None + + +class AIService: + """ + AI Service using zai-sdk + glm-ocr. + + Features: + - Direct API call to glm-ocr layout_parsing + - Support image bytes via base64 data URI + - Return Markdown or HTML format content + """ + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "glm-ocr", + timeout: int = 120, + config: Optional[NovaPdfConfig] = None, + ): + if ZhipuAiClient is None: + raise ImportError( + "zai-sdk is required. Install with: pip install nova-pdf[zhipu]" + ) + + if config: + self.api_key = api_key or config.zhipu_api_key + self.model = model or config.model + self.timeout = timeout or config.timeout + else: + config = NovaPdfConfig.load() + self.api_key = api_key or config.zhipu_api_key + self.model = model + self.timeout = timeout + + if not self.api_key: + raise ValueError( + "API key is required. Set NOVA_ZHIPU_API_KEY environment variable" + ) + + self.client = ZhipuAiClient(api_key=self.api_key) + + def image_to_markdown( + self, + image_stream: BinaryIO, + filename: str = "page.png", + keep_html: bool = False, + ) -> AIResult: + """ + Convert image to Markdown using glm-ocr. 
+ + Args: + image_stream: Image stream + filename: Filename (for content type detection) + keep_html: Keep HTML format for complex tables (default: False, convert to MD) + + Returns: + AIResult: Conversion result + """ + try: + image_stream.seek(0) + image_bytes = image_stream.read() + + base64_image = base64.b64encode(image_bytes).decode("utf-8") + content_type = "image/jpeg" if filename.lower().endswith((".jpg", ".jpeg")) else "image/png" + data_uri = f"data:{content_type};base64,{base64_image}" + + response = self.client.layout_parsing.create( + model=self.model, + file=data_uri + ) + + # Get HTML content + html = response.md_results or "" + + if not html and response.layout_details: + parts = [] + for detail_list in response.layout_details: + for detail in detail_list: + if detail.content: + parts.append(detail.content) + html = "\n".join(parts) + + # Convert to Markdown or keep HTML + if keep_html: + text = html.strip() + else: + text = self._html_to_markdown(html.strip()) if html else "" + + return AIResult(text=text, success=True) + + except Exception as e: + return AIResult(text="", success=False, error=str(e)) + finally: + image_stream.seek(0) + + def _html_to_markdown(self, html: str) -> str: + """Convert HTML to Markdown.""" + if not html: + return "" + + # Extract titles from
+ titles = [] + div_pattern = r']*>(.*?)
' + for match in re.finditer(div_pattern, html, re.DOTALL | re.IGNORECASE): + title = re.sub(r'<[^>]+>', '', match.group(1)).strip() + if title: + titles.append(title) + + # Remove
from HTML + html = re.sub(div_pattern, '', html, flags=re.DOTALL | re.IGNORECASE) + + # Check for table + if ']+>', '', html).strip() + if titles: + return f"**{' '.join(titles)}**\n\n{text}" + return text + + def _convert_html_table(self, html: str) -> str: + """Convert HTML table to Markdown table.""" + # Parse rows + rows = [] + rowspan_cells = {} + + for row_idx, row_match in enumerate(re.finditer(r']*>(.*?)', html, re.DOTALL | re.IGNORECASE)): + cells = [] + col_idx = 0 + + # Fill rowspan cells + while (row_idx, col_idx) in rowspan_cells: + cells.append(rowspan_cells[(row_idx, col_idx)]) + col_idx += 1 + + # Parse cells + for cell_match in re.finditer(r']*)>(.*?)', row_match.group(1), re.DOTALL | re.IGNORECASE): + attrs, content = cell_match.groups() + content = re.sub(r'<[^>]+>', '', content).strip().replace('\n', ' ') + + rowspan = int(r.group(1)) if (r := re.search(r'rowspan\s*=\s*["\']?(\d+)', attrs, re.IGNORECASE)) else 1 + colspan = int(c.group(1)) if (c := re.search(r'colspan\s*=\s*["\']?(\d+)', attrs, re.IGNORECASE)) else 1 + + cells.append(content) + cells.extend([""] * (colspan - 1)) + + if rowspan > 1: + for r in range(1, rowspan): + for c in range(colspan): + rowspan_cells[(row_idx + r, col_idx + c)] = content + + col_idx += colspan + + # Fill remaining rowspan + while (row_idx, col_idx) in rowspan_cells: + cells.append(rowspan_cells[(row_idx, col_idx)]) + col_idx += 1 + + rows.append(cells) + + if not rows: + return "" + + # Normalize + max_cols = max(len(row) for row in rows) + for row in rows: + row.extend([""] * (max_cols - len(row))) + + # Simple output: first row as header + md_lines = [] + for i, row in enumerate(rows): + md_row = "| " + " | ".join(c or " " for c in row) + " |" + md_lines.append(md_row) + if i == 0: + md_lines.append("|" + "|".join(["---"] * max_cols) + "|") + + return "\n".join(md_lines) diff --git a/packages/nova-pdf/src/nova_pdf/_config.py b/packages/nova-pdf/src/nova_pdf/_config.py new file mode 100644 index 
000000000..305ce1894 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_config.py @@ -0,0 +1,101 @@ +"""Configuration management for nova-pdf.""" + +import os +from pathlib import Path +from typing import Optional +from dataclasses import dataclass + +try: + import tomllib # Python 3.11+ +except ImportError: + import tomli as tomllib + + +@dataclass +class NovaPdfConfig: + """nova-pdf configuration.""" + + # API 配置 + zhipu_api_key: str = "" + + # OCR 配置 + model: str = "glm-ocr" + dpi: int = 150 + timeout: int = 120 + + # 处理策略 + force_ai: bool = False + + @classmethod + def load(cls, config_path: Optional[str] = None) -> "NovaPdfConfig": + """ + Load configuration from multiple sources (priority high to low): + 1. Environment variables + 2. Config file (pyproject.toml or nova-pdf.toml) + 3. Default values + """ + config = cls() + + # 1. Load from config file + config._load_from_file(config_path) + + # 2. Environment variables override + config._load_from_env() + + return config + + def _load_from_file(self, config_path: Optional[str] = None): + """Load from config file.""" + search_paths = [] + + if config_path: + search_paths.append(Path(config_path)) + + # Current directory + search_paths.append(Path("pyproject.toml")) + search_paths.append(Path("nova-pdf.toml")) + + # User config directory + search_paths.append(Path.home() / ".config" / "nova-pdf" / "config.toml") + + for path in search_paths: + if path.exists(): + try: + with open(path, "rb") as f: + data = tomllib.load(f) + + # Read [tool.nova-pdf] section + if "tool" in data and "nova-pdf" in data["tool"]: + self._apply_config(data["tool"]["nova-pdf"]) + elif "nova-pdf" in data: + self._apply_config(data["nova-pdf"]) + + break + except Exception: + pass + + def _apply_config(self, data: dict): + """Apply config from dict.""" + if "api_key" in data: + self.zhipu_api_key = data["api_key"] + if "model" in data: + self.model = data["model"] + if "dpi" in data: + self.dpi = data["dpi"] + if "timeout" in data: + 
self.timeout = data["timeout"] + if "force_ai" in data: + self.force_ai = data["force_ai"] + + def _load_from_env(self): + """Load from environment variables (highest priority).""" + if os.environ.get("NOVA_ZHIPU_API_KEY"): + self.zhipu_api_key = os.environ["NOVA_ZHIPU_API_KEY"] + if os.environ.get("NOVA_MODEL"): + self.model = os.environ["NOVA_MODEL"] + if os.environ.get("NOVA_DPI"): + self.dpi = int(os.environ["NOVA_DPI"]) + if os.environ.get("NOVA_TIMEOUT"): + self.timeout = int(os.environ["NOVA_TIMEOUT"]) + if os.environ.get("NOVA_FORCE_AI"): + self.force_ai = os.environ["NOVA_FORCE_AI"].lower() in ("true", "1", "yes") diff --git a/packages/nova-pdf/src/nova_pdf/_converter.py b/packages/nova-pdf/src/nova_pdf/_converter.py new file mode 100644 index 000000000..7ee1dd320 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_converter.py @@ -0,0 +1,251 @@ +"""Nova PDF Converter - Intelligent PDF to Markdown conversion.""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo +from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +from ._page_analyzer import PageType, analyze_page +from ._page_renderer import render_page_to_image +from ._ai_service import AIService + +# Import dependencies +_dependency_exc_info = None +try: + import pdfminer + import pdfminer.high_level + import pdfplumber +except ImportError: + _dependency_exc_info = sys.exc_info() + + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/pdf", + "application/x-pdf", +] + +ACCEPTED_FILE_EXTENSIONS = [".pdf"] + + +class NovaPdfConverter(DocumentConverter): + """ + 智能 PDF 转换器 + + 特性: + - 自动检测每页内容类型(纯文本 vs 包含图片/表格) + - 纯文本页面使用默认解析(pdfplumber/pdfminer) + - 复杂页面截图后调用 AI 转换为 Markdown + """ + + def __init__( + self, + ai_service: Optional[AIService] = None, + dpi: int = 150, + force_ai: bool = False, + ): + """ + 初始化转换器 + + Args: + ai_service: AI 服务实例 + dpi: 截图分辨率(默认 150) + 
force_ai: 强制所有页面使用 AI(默认 False) + """ + self.ai_service = ai_service + self.dpi = dpi + self.force_ai = force_ai + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pdf", + feature="pdf", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) + + # 获取 AI 服务(从 kwargs 或实例) + ai_service = kwargs.get("ai_service") or self.ai_service + + # 读取 PDF + pdf_stream = io.BytesIO(file_stream.read()) + markdown_parts = [] + + try: + with pdfplumber.open(pdf_stream) as pdf: + for page_num, page in enumerate(pdf.pages): + # 分析页面类型 + page_type = analyze_page(page) + + # 根据类型选择处理方式 + if self.force_ai or page_type != PageType.PLAIN_TEXT: + # 复杂内容:截图 + AI + if ai_service: + markdown = self._convert_with_ai( + page, page_num, ai_service + ) + else: + # 无 AI 服务,回退到默认解析 + markdown = self._extract_text_with_tables(page) + else: + # 纯文本:默认解析 + markdown = self._extract_text_with_tables(page) + + if markdown.strip(): + markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") + + # 释放页面资源 + page.close() + + markdown = "\n\n".join(markdown_parts).strip() + + except Exception: + # 异常情况:回退到 pdfminer + pdf_stream.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_stream) or "" + + # 最终回退 + if not markdown: + pdf_stream.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_stream) or "" + + return DocumentConverterResult(markdown=markdown) + + def 
_convert_with_ai( + self, + page: Any, + page_num: int, + ai_service: AIService, + ) -> str: + """ + 使用 AI 转换页面 + + Args: + page: pdfplumber 页面对象 + page_num: 页码 + ai_service: AI 服务 + + Returns: + str: Markdown 内容 + """ + try: + # 截图 + img_stream = render_page_to_image(page, self.dpi) + + # 调用 AI(文件名使用页码) + filename = f"page_{page_num + 1}.png" + result = ai_service.image_to_markdown(img_stream, filename=filename) + + if result.success and result.text.strip(): + return result.text + else: + # AI 失败,回退到默认解析 + return self._extract_text_with_tables(page) + + except Exception: + # 异常情况,回退到默认解析 + return self._extract_text_with_tables(page) + + def _extract_text_with_tables(self, page: Any) -> str: + """ + 提取文本和表格 + + Args: + page: pdfplumber 页面对象 + + Returns: + str: Markdown 内容 + """ + parts = [] + + # 提取文本 + text = page.extract_text() or "" + if text.strip(): + parts.append(text.strip()) + + # 提取表格 + try: + tables = page.extract_tables() + if tables: + for table in tables: + if table: + md_table = self._table_to_markdown(table) + if md_table.strip(): + parts.append(md_table) + except Exception: + pass + + return "\n\n".join(parts) + + def _table_to_markdown(self, table: list[list[str]]) -> str: + """ + 将表格转换为 Markdown + + Args: + table: 2D 列表 + + Returns: + str: Markdown 表格 + """ + if not table: + return "" + + # 过滤 None 值 + table = [[cell if cell is not None else "" for cell in row] for row in table] + + # 过滤空行 + table = [row for row in table if any(cell.strip() for cell in row)] + + if not table: + return "" + + # 计算列宽 + col_widths = [ + max(len(str(row[i])) if i < len(row) else 0 for row in table) + for i in range(max(len(row) for row in table)) + ] + + # 格式化表格 + lines = [] + for row_idx, row in enumerate(table): + # 补齐列数 + padded_row = row + [""] * (len(col_widths) - len(row)) + line = "| " + " | ".join( + str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) + ) + " |" + lines.append(line) + + # 添加分隔行 + if row_idx == 0: + sep = "|" + "|".join("-" * 
(w + 2) for w in col_widths) + "|" + lines.append(sep) + + return "\n".join(lines) diff --git a/packages/nova-pdf/src/nova_pdf/_page_analyzer.py b/packages/nova-pdf/src/nova_pdf/_page_analyzer.py new file mode 100644 index 000000000..1aa014043 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_page_analyzer.py @@ -0,0 +1,117 @@ +"""Page content analyzer for detecting images and tables.""" + +from enum import Enum +from typing import Any + + +class PageType(Enum): + """Page content type classification.""" + PLAIN_TEXT = "plain_text" # 纯文本,使用默认解析 + HAS_IMAGES = "has_images" # 包含图片 + HAS_TABLES = "has_tables" # 包含表格 + COMPLEX = "complex" # 复杂内容(图片+表格+混合) + + +def detect_images(page: Any) -> bool: + """ + 检测页面是否包含图片 + + Args: + page: pdfplumber 页面对象 + + Returns: + bool: 是否包含图片 + """ + # 方法1: 直接检测 page.images + if hasattr(page, 'images') and len(page.images) > 0: + return True + + # 方法2: 检测页面对象中的图像资源 + if hasattr(page, 'objects'): + objects = page.objects + if 'image' in objects and len(objects['image']) > 0: + return True + # 检测 XObject (可能包含内嵌图像) + if 'xobject' in objects and len(objects['xobject']) > 0: + for obj in objects['xobject']: + if isinstance(obj, dict) and obj.get('subtype') == 'Image': + return True + + # 方法3: 检测页面资源字典 + try: + if hasattr(page, 'page') and hasattr(page.page, 'get_resources'): + resources = page.page.get_resources() + if resources and 'XObject' in resources: + return True + except Exception: + pass + + return False + + +def detect_tables(page: Any) -> bool: + """ + 检测页面是否包含表格 + + Args: + page: pdfplumber 页面对象 + + Returns: + bool: 是否包含表格 + """ + # 方法1: 使用 pdfplumber 的 extract_tables + try: + tables = page.extract_tables() + if tables and len(tables) > 0: + # 过滤空表格 + for table in tables: + if table and any(any(cell for cell in row if cell) for row in table): + return True + except Exception: + pass + + # 方法2: 检测表格线(边框线) + try: + if hasattr(page, 'objects') and 'line' in page.objects: + lines = page.objects['line'] + if len(lines) > 10: # 
大量线条可能构成表格 + # 分析线条是否形成网格结构 + h_lines = [] + v_lines = [] + for line in lines: + # 水平线:高度很小 + if abs(line.get('height', 1)) < 2: + h_lines.append(line) + # 垂直线:宽度很小 + elif abs(line.get('width', 1)) < 2: + v_lines.append(line) + + if len(h_lines) > 2 and len(v_lines) > 2: + return True + except Exception: + pass + + return False + + +def analyze_page(page: Any) -> PageType: + """ + 分析页面类型 + + Args: + page: pdfplumber 页面对象 + + Returns: + PageType: 页面类型 + """ + has_images = detect_images(page) + has_tables = detect_tables(page) + + if has_images and has_tables: + return PageType.COMPLEX + elif has_images: + return PageType.HAS_IMAGES + elif has_tables: + return PageType.HAS_TABLES + else: + return PageType.PLAIN_TEXT diff --git a/packages/nova-pdf/src/nova_pdf/_page_renderer.py b/packages/nova-pdf/src/nova_pdf/_page_renderer.py new file mode 100644 index 000000000..d517e8780 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_page_renderer.py @@ -0,0 +1,32 @@ +"""Page renderer for converting PDF pages to images.""" + +import io +from typing import Any + + +def render_page_to_image(page: Any, dpi: int = 150) -> io.BytesIO: + """ + 将 PDF 页面渲染为图片 + + Args: + page: pdfplumber 页面对象 + dpi: 渲染分辨率,默认 150(平衡质量和速度) + + Returns: + io.BytesIO: PNG 图片流 + """ + # 使用 pdfplumber 的 to_image 方法 + page_image = page.to_image(resolution=dpi) + + # 转换为 BytesIO + img_stream = io.BytesIO() + page_image.original.save(img_stream, format="PNG") + img_stream.seek(0) + + return img_stream + + +# DPI 预设值 +DPI_LOW = 72 # 快速预览,文件小 +DPI_MEDIUM = 150 # 平衡质量和速度(默认) +DPI_HIGH = 300 # 高质量,适合复杂图表 diff --git a/packages/nova-pdf/src/nova_pdf/_plugin.py b/packages/nova-pdf/src/nova_pdf/_plugin.py new file mode 100644 index 000000000..08a4fd8a0 --- /dev/null +++ b/packages/nova-pdf/src/nova_pdf/_plugin.py @@ -0,0 +1,56 @@ +"""Plugin registration for nova-pdf.""" + +from typing import Any +from markitdown import MarkItDown + +from ._config import NovaPdfConfig +from ._ai_service import AIService +from 
._converter import NovaPdfConverter + + +__plugin_interface_version__ = 1 + + +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + """ + Register nova-pdf converter. + + Config sources (priority high to low): + 1. kwargs parameters + 2. Environment variables + 3. Config file (pyproject.toml) + 4. Default values + """ + # Load config + config = NovaPdfConfig.load() + + # kwargs override config + api_key = kwargs.get("api_key") or kwargs.get("zhipu_api_key") or config.zhipu_api_key + model = kwargs.get("model", config.model) + dpi = kwargs.get("dpi", config.dpi) + force_ai = kwargs.get("force_ai", config.force_ai) + timeout = kwargs.get("timeout", config.timeout) + + # Create AI service + ai_service = None + if api_key: + try: + ai_service = AIService( + api_key=api_key, + model=model, + timeout=timeout, + ) + except Exception: + pass + + # Register converter + PRIORITY_NOVA_PDF = -1.0 + + markitdown.register_converter( + NovaPdfConverter( + ai_service=ai_service, + dpi=dpi, + force_ai=force_ai, + ), + priority=PRIORITY_NOVA_PDF, + ) diff --git a/packages/nova-pdf/tests/__init__.py b/packages/nova-pdf/tests/__init__.py new file mode 100644 index 000000000..dfa7b4968 --- /dev/null +++ b/packages/nova-pdf/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for nova-pdf converter.""" \ No newline at end of file diff --git a/packages/nova-pdf/tests/test_ai_service.py b/packages/nova-pdf/tests/test_ai_service.py new file mode 100644 index 000000000..7c7636848 --- /dev/null +++ b/packages/nova-pdf/tests/test_ai_service.py @@ -0,0 +1,103 @@ +"""Tests for AI service with zai-sdk.""" + +import io +import pytest +from unittest.mock import MagicMock, patch + +from nova_pdf._ai_service import AIService, AIResult +from nova_pdf._config import NovaPdfConfig + + +class TestAIService: + """AI Service tests with zai-sdk.""" + + def test_missing_zai_sdk_raises_error(self): + """Missing zai-sdk raises error.""" + with patch("nova_pdf._ai_service.ZhipuAiClient", None): + 
with pytest.raises(ImportError, match="zai-sdk is required"): + AIService(api_key="test") + + def test_missing_api_key_raises_error(self): + """Missing API key raises error.""" + with patch("nova_pdf._ai_service.ZhipuAiClient", MagicMock()): + with pytest.raises(ValueError, match="API key is required"): + AIService(api_key="") + + def test_successful_conversion(self): + """Successful conversion.""" + # Mock ZhipuAiClient + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.md_results = "
Test
" + mock_response.layout_details = [] + mock_client.layout_parsing.create.return_value = mock_response + + with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + service = AIService(api_key="test-api-key") + result = service.image_to_markdown(io.BytesIO(b"fake-image")) + + assert result.success is True + assert "Test" in result.text + + def test_html_table_conversion(self): + """HTML table to Markdown conversion.""" + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.md_results = '
AB
12
' + mock_response.layout_details = [] + mock_client.layout_parsing.create.return_value = mock_response + + with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + service = AIService(api_key="test-api-key") + result = service.image_to_markdown(io.BytesIO(b"fake-image")) + + assert result.success is True + assert "| A | B |" in result.text + assert "|---|---|" in result.text + assert "| 1 | 2 |" in result.text + + def test_empty_result(self): + """Empty result handling.""" + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.md_results = "" + mock_response.layout_details = [] + mock_client.layout_parsing.create.return_value = mock_response + + with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + service = AIService(api_key="test-api-key") + result = service.image_to_markdown(io.BytesIO(b"fake-image")) + + assert result.success is True + assert result.text == "" + + def test_error_handling(self): + """Error handling.""" + mock_client = MagicMock() + mock_client.layout_parsing.create.side_effect = Exception("API Error") + + with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + service = AIService(api_key="test-api-key") + result = service.image_to_markdown(io.BytesIO(b"fake-image")) + + assert result.success is False + assert "API Error" in result.error + + def test_base64_encoding(self): + """Test base64 encoding of image.""" + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.md_results = "test" + mock_response.layout_details = [] + mock_client.layout_parsing.create.return_value = mock_response + + with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + service = AIService(api_key="test-api-key") + result = service.image_to_markdown(io.BytesIO(b"fake-image"), "test.png") + + assert result.success is True + + # Verify data URI was used + call_args = mock_client.layout_parsing.create.call_args + file_arg = call_args.kwargs['file'] + 
assert file_arg.startswith("data:image/png;base64,") diff --git a/packages/nova-pdf/tests/test_analyzer.py b/packages/nova-pdf/tests/test_analyzer.py new file mode 100644 index 000000000..137e486ab --- /dev/null +++ b/packages/nova-pdf/tests/test_analyzer.py @@ -0,0 +1,131 @@ +"""Tests for page analyzer.""" + +import pytest +from unittest.mock import MagicMock + +from nova_pdf._page_analyzer import ( + PageType, + detect_images, + detect_tables, + analyze_page, +) + + +class TestDetectImages: + """图片检测测试""" + + def test_no_images(self): + """无图片页面""" + page = MagicMock() + page.images = [] + page.objects = {} + + assert detect_images(page) is False + + def test_has_images_via_images_attr(self): + """通过 page.images 检测图片""" + page = MagicMock() + page.images = [MagicMock(x0=0, y0=0, x1=100, y1=100)] + + assert detect_images(page) is True + + def test_has_images_via_objects(self): + """通过 page.objects 检测图片""" + page = MagicMock() + page.images = [] + page.objects = {"image": [MagicMock()]} + + assert detect_images(page) is True + + def test_has_xobject_image(self): + """通过 XObject 检测图片""" + page = MagicMock() + page.images = [] + page.objects = { + "xobject": [{"subtype": "Image"}] + } + + assert detect_images(page) is True + + +class TestDetectTables: + """表格检测测试""" + + def test_no_tables(self): + """无表格页面""" + page = MagicMock() + page.extract_tables.return_value = [] + + assert detect_tables(page) is False + + def test_has_tables_via_extract_tables(self): + """通过 extract_tables 检测表格""" + page = MagicMock() + page.extract_tables.return_value = [ + [["A", "B", "C"], ["1", "2", "3"]] + ] + + assert detect_tables(page) is True + + def test_empty_table_not_detected(self): + """空表格不应被检测""" + page = MagicMock() + page.extract_tables.return_value = [ + [["", "", ""], ["", "", ""]] + ] + + assert detect_tables(page) is False + + def test_has_table_lines(self): + """通过线条检测表格""" + page = MagicMock() + page.extract_tables.return_value = [] + + # 模拟网格线条 + lines = [] + for i in 
range(5): + # 水平线 + lines.append({"height": 0.5, "width": 100}) + # 垂直线 + lines.append({"height": 100, "width": 0.5}) + + page.objects = {"line": lines} + + assert detect_tables(page) is True + + +class TestAnalyzePage: + """页面分析测试""" + + def test_plain_text_page(self): + """纯文本页面""" + page = MagicMock() + page.images = [] + page.objects = {} + page.extract_tables.return_value = [] + + assert analyze_page(page) == PageType.PLAIN_TEXT + + def test_page_with_images(self): + """仅包含图片""" + page = MagicMock() + page.images = [MagicMock()] + page.extract_tables.return_value = [] + + assert analyze_page(page) == PageType.HAS_IMAGES + + def test_page_with_tables(self): + """仅包含表格""" + page = MagicMock() + page.images = [] + page.extract_tables.return_value = [[["A", "B"]]] + + assert analyze_page(page) == PageType.HAS_TABLES + + def test_complex_page(self): + """同时包含图片和表格""" + page = MagicMock() + page.images = [MagicMock()] + page.extract_tables.return_value = [[["A", "B"]]] + + assert analyze_page(page) == PageType.COMPLEX \ No newline at end of file diff --git a/packages/nova-pdf/tests/test_converter.py b/packages/nova-pdf/tests/test_converter.py new file mode 100644 index 000000000..ea13266f7 --- /dev/null +++ b/packages/nova-pdf/tests/test_converter.py @@ -0,0 +1,181 @@ +"""Tests for nova-pdf converter.""" + +import io +import pytest +from unittest.mock import MagicMock, patch + +from nova_pdf._converter import NovaPdfConverter +from nova_pdf._ai_service import AIService, AIResult +from nova_pdf._page_analyzer import PageType + + +class TestNovaPdfConverter: + """转换器测试""" + + def test_accepts_pdf_extension(self): + """接受 .pdf 扩展名""" + converter = NovaPdfConverter() + stream = io.BytesIO(b"%PDF-1.4") + stream_info = MagicMock(extension=".pdf", mimetype=None) + + assert converter.accepts(stream, stream_info) is True + + def test_accepts_pdf_mimetype(self): + """接受 PDF MIME 类型""" + converter = NovaPdfConverter() + stream = io.BytesIO(b"%PDF-1.4") + stream_info = 
MagicMock(extension=None, mimetype="application/pdf") + + assert converter.accepts(stream, stream_info) is True + + def test_rejects_non_pdf(self): + """拒绝非 PDF 文件""" + converter = NovaPdfConverter() + stream = io.BytesIO(b"not a pdf") + stream_info = MagicMock(extension=".txt", mimetype="text/plain") + + assert converter.accepts(stream, stream_info) is False + + def test_table_to_markdown(self): + """表格转 Markdown""" + converter = NovaPdfConverter() + table = [ + ["Name", "Age", "City"], + ["Alice", "25", "Beijing"], + ["Bob", "30", "Shanghai"], + ] + + result = converter._table_to_markdown(table) + + assert "|" in result + assert "Name" in result + assert "Alice" in result + assert "---" in result # 分隔行 + + def test_plain_text_page_without_ai(self): + """纯文本页面不使用 AI""" + converter = NovaPdfConverter() + + # 模拟页面 + page = MagicMock() + page.images = [] + page.objects = {} + page.extract_tables.return_value = [] + page.extract_text.return_value = "Hello World" + page.close = MagicMock() + + # 模拟 PDF + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock()) + + assert "Hello World" in result.markdown + + def test_complex_page_with_ai(self): + """复杂页面使用 AI""" + # 模拟 AI 服务 + ai_service = MagicMock(spec=AIService) + ai_service.image_to_markdown.return_value = AIResult( + success=True, + text="# AI Generated\n\nThis is from AI." 
+ ) + + converter = NovaPdfConverter(ai_service=ai_service) + + # 模拟页面 + page = MagicMock() + page.images = [MagicMock()] + page.extract_tables.return_value = [] + page.extract_text.return_value = "Plain text" + page.to_image.return_value.original = MagicMock() + page.close = MagicMock() + + # 模拟图片保存 + img_stream = io.BytesIO() + page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") + + # 模拟 PDF + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock()) + + # 应该调用 AI + ai_service.image_to_markdown.assert_called_once() + assert "AI Generated" in result.markdown + + def test_force_ai_mode(self): + """强制 AI 模式""" + ai_service = MagicMock(spec=AIService) + ai_service.image_to_markdown.return_value = AIResult( + success=True, + text="AI result" + ) + + converter = NovaPdfConverter(ai_service=ai_service, force_ai=True) + + # 即使是纯文本页面 + page = MagicMock() + page.images = [] + page.objects = {} + page.extract_tables.return_value = [] + page.extract_text.return_value = "Plain text" + page.to_image.return_value.original = MagicMock() + page.close = MagicMock() + + img_stream = io.BytesIO() + page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") + + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock()) + + # 应该调用 AI(因为 force_ai=True) + ai_service.image_to_markdown.assert_called_once() + + def test_fallback_on_ai_failure(self): + """AI 失败时回退到默认解析""" + ai_service = MagicMock(spec=AIService) + ai_service.image_to_markdown.return_value = AIResult( + success=False, + text="", + error="API error" + ) + + converter = 
NovaPdfConverter(ai_service=ai_service) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_tables.return_value = [] + page.extract_text.return_value = "Fallback text" + page.to_image.return_value.original = MagicMock() + page.close = MagicMock() + + img_stream = io.BytesIO() + page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") + + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock()) + + # 应该回退到默认文本 + assert "Fallback text" in result.markdown diff --git a/scripts/load_secrets.sh b/scripts/load_secrets.sh new file mode 100755 index 000000000..ede9291d0 --- /dev/null +++ b/scripts/load_secrets.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# 加载本地敏感配置 + +if [ -f ".secrets.local" ]; then + echo "Loading secrets from .secrets.local" + set -a + source .secrets.local + set +a + echo "✓ Secrets loaded" +else + echo "✗ .secrets.local not found" + exit 1 +fi From 6bd22b487a28be9ee983e75f39394a973adcfb15 Mon Sep 17 00:00:00 2001 From: hankl Date: Sat, 9 May 2026 19:13:04 +0800 Subject: [PATCH 02/15] Update .gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 15613ea8a..5a6b7d117 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,8 @@ cython_debug/ src/.DS_Store .DS_Store .cursorrules + +# Local secrets (never commit) +.secrets.local +*.secrets +.env.local From af938277c616f9912ac3c4486832f3bcfce4ad8c Mon Sep 17 00:00:00 2001 From: hankl Date: Sat, 9 May 2026 19:34:45 +0800 Subject: [PATCH 03/15] refactor: rename nova-pdf to markitdown-glmocr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename package from nova-pdf to markitdown-glmocr - Rename module from nova_pdf to markitdown_glmocr - Rename classes: NovaPdfConfig → GlmOcrConfig, 
NovaPdfConverter → GlmOcrPdfConverter - Update environment variable: NOVA_ZHIPU_API_KEY → GLMOCR_API_KEY - Update config section: [tool.nova-pdf] → [tool.markitdown-glmocr] - Refactor AI service to use zai-sdk with glm-ocr - Support base64 image transmission - Add HTML/Markdown output options for complex tables Verified with: - Syntax check passed - Module imports successful - Plugin installation successful - Image conversion tests passed (table1/2/3) --- .../{nova-pdf => markitdown-glmocr}/README.md | 30 +++---- .../pyproject.toml | 24 +++-- .../src/markitdown_glmocr}/__about__.py | 0 .../src/markitdown_glmocr}/__init__.py | 8 +- .../src/markitdown_glmocr}/_ai_service.py | 16 ++-- .../src/markitdown_glmocr}/_config.py | 48 +++++----- .../src/markitdown_glmocr}/_converter.py | 90 +++++++++---------- .../src/markitdown_glmocr}/_page_analyzer.py | 0 .../src/markitdown_glmocr}/_page_renderer.py | 0 .../src/markitdown_glmocr}/_plugin.py | 20 ++--- .../tests/__init__.py | 0 .../tests/test_ai_service.py | 20 ++--- .../tests/test_analyzer.py | 2 +- .../tests/test_converter.py | 76 ++++++++-------- 14 files changed, 165 insertions(+), 169 deletions(-) rename packages/{nova-pdf => markitdown-glmocr}/README.md (84%) rename packages/{nova-pdf => markitdown-glmocr}/pyproject.toml (73%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/__about__.py (100%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/__init__.py (61%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/_ai_service.py (94%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/_config.py (58%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/_converter.py (75%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/_page_analyzer.py (100%) rename packages/{nova-pdf/src/nova_pdf => 
markitdown-glmocr/src/markitdown_glmocr}/_page_renderer.py (100%) rename packages/{nova-pdf/src/nova_pdf => markitdown-glmocr/src/markitdown_glmocr}/_plugin.py (74%) rename packages/{nova-pdf => markitdown-glmocr}/tests/__init__.py (100%) rename packages/{nova-pdf => markitdown-glmocr}/tests/test_ai_service.py (81%) rename packages/{nova-pdf => markitdown-glmocr}/tests/test_analyzer.py (98%) rename packages/{nova-pdf => markitdown-glmocr}/tests/test_converter.py (72%) diff --git a/packages/nova-pdf/README.md b/packages/markitdown-glmocr/README.md similarity index 84% rename from packages/nova-pdf/README.md rename to packages/markitdown-glmocr/README.md index 969e687d5..746f923fe 100644 --- a/packages/nova-pdf/README.md +++ b/packages/markitdown-glmocr/README.md @@ -1,4 +1,4 @@ -# Nova-PDF +# markitdown-glmocr 智能 PDF 转 Markdown 插件,使用 glm-ocr AI 驱动的图片和表格提取。 @@ -13,10 +13,10 @@ ```bash # 基础安装 -pip install nova-pdf +pip install markitdown-glmocr # 安装 AI 功能 -pip install nova-pdf[zhipu] +pip install markitdown-glmocr[zhipu] ``` ## 配置 @@ -27,7 +27,7 @@ pip install nova-pdf[zhipu] ```bash # 创建 .secrets.local 文件 -echo 'NOVA_ZHIPU_API_KEY="your-api-key"' > .secrets.local +echo 'GLMOCR_API_KEY="your-api-key"' > .secrets.local # 加载配置 source .secrets.local @@ -37,12 +37,12 @@ source .secrets.local ```bash # 必需 -export NOVA_ZHIPU_API_KEY="your-zhipu-api-key" +export GLMOCR_API_KEY="your-zhipu-api-key" # 可选 -export NOVA_MODEL="glm-ocr" -export NOVA_DPI="150" -export NOVA_TIMEOUT="120" +export GLMOCR_MODEL="glm-ocr" +export GLMOCR_DPI="150" +export GLMOCR_TIMEOUT="120" ``` ### 配置文件 @@ -50,7 +50,7 @@ export NOVA_TIMEOUT="120" 在 `pyproject.toml` 中配置默认值: ```toml -[tool.nova-pdf] +[tool.markitdown-glmocr] model = "glm-ocr" dpi = 150 timeout = 120 @@ -86,15 +86,15 @@ result = md.convert("document.pdf") print(result.markdown) # 方式2:手动配置 -from nova_pdf import NovaPdfConfig, AIService, NovaPdfConverter +from markitdown_glmocr import GlmOcrConfig, AIService, GlmOcrPdfConverter -config = 
NovaPdfConfig.load() +config = GlmOcrConfig.load() ai_service = AIService( api_key="your-api-key", model="glm-ocr", ) -converter = NovaPdfConverter( +converter = GlmOcrPdfConverter( ai_service=ai_service, dpi=150, ) @@ -106,17 +106,17 @@ result = md.convert("document.pdf") ## 配置选项 -### NovaPdfConfig 参数 +### GlmOcrConfig 参数 | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| -| `zhipu_api_key` | str | 环境变量 `NOVA_ZHIPU_API_KEY` | 智谱 API Key | +| `api_key` | str | 环境变量 `GLMOCR_API_KEY` | 智谱 API Key | | `model` | str | "glm-ocr" | 模型名称 | | `dpi` | int | 150 | 截图分辨率 | | `timeout` | int | 120 | 请求超时(秒) | | `force_ai` | bool | False | 强制所有页面使用 AI | -### NovaPdfConverter 参数 +### GlmOcrPdfConverter 参数 | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| diff --git a/packages/nova-pdf/pyproject.toml b/packages/markitdown-glmocr/pyproject.toml similarity index 73% rename from packages/nova-pdf/pyproject.toml rename to packages/markitdown-glmocr/pyproject.toml index f21aedf0c..a9277c272 100644 --- a/packages/nova-pdf/pyproject.toml +++ b/packages/markitdown-glmocr/pyproject.toml @@ -3,9 +3,9 @@ requires = ["hatchling"] build-backend = "hatchling.build" [project] -name = "nova-pdf" +name = "markitdown-glmocr" dynamic = ["version"] -description = "Intelligent PDF to Markdown converter with AI-powered image/table extraction" +description = "Intelligent PDF to Markdown converter with glm-ocr AI-powered image/table extraction" readme = "README.md" requires-python = ">=3.10" license = "MIT" @@ -44,31 +44,27 @@ Issues = "https://github.com/microsoft/markitdown/issues" Source = "https://github.com/microsoft/markitdown" [tool.hatch.version] -path = "src/nova_pdf/__about__.py" +path = "src/markitdown_glmocr/__about__.py" # Plugin entry point - MarkItDown will discover this plugin [project.entry-points."markitdown.plugin"] -nova_pdf = "nova_pdf" +markitdown_glmocr = "markitdown_glmocr" [tool.hatch.build.targets.sdist] -only-include = ["src/nova_pdf"] +only-include = 
["src/markitdown_glmocr"] [tool.hatch.build.targets.wheel] -packages = ["src/nova_pdf"] +packages = ["src/markitdown_glmocr"] [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] -# Nova-PDF configuration -[tool.nova-pdf] -# API key - set via environment variable NOVA_ZHIPU_API_KEY +# markitdown-glmocr configuration +[tool.markitdown-glmocr] +# API key - set via environment variable GLMOCR_API_KEY api_key = "" model = "glm-ocr" dpi = 150 timeout = 120 -force_ai = false - -# Legacy config (deprecated, will be removed) -workflow_image2markdown_key = "" -fastgpt_token = "" +force_ai = false \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/__about__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py similarity index 100% rename from packages/nova-pdf/src/nova_pdf/__about__.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py diff --git a/packages/nova-pdf/src/nova_pdf/__init__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py similarity index 61% rename from packages/nova-pdf/src/nova_pdf/__init__.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py index ce059c499..55b9caf87 100644 --- a/packages/nova-pdf/src/nova_pdf/__init__.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py @@ -1,13 +1,13 @@ from ._plugin import register_converters -from ._config import NovaPdfConfig +from ._config import GlmOcrConfig from ._ai_service import AIService, AIResult -from ._converter import NovaPdfConverter +from ._converter import GlmOcrPdfConverter __plugin_interface_version__ = 1 __all__ = [ "register_converters", - "NovaPdfConfig", + "GlmOcrConfig", "AIService", "AIResult", - "NovaPdfConverter", + "GlmOcrPdfConverter", ] \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/_ai_service.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py similarity index 94% rename from 
packages/nova-pdf/src/nova_pdf/_ai_service.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py index 2451f9955..6c8e362a4 100644 --- a/packages/nova-pdf/src/nova_pdf/_ai_service.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import BinaryIO, Optional -from ._config import NovaPdfConfig +from ._config import GlmOcrConfig try: from zai import ZhipuAiClient @@ -36,26 +36,26 @@ def __init__( api_key: Optional[str] = None, model: str = "glm-ocr", timeout: int = 120, - config: Optional[NovaPdfConfig] = None, + config: Optional[GlmOcrConfig] = None, ): if ZhipuAiClient is None: raise ImportError( - "zai-sdk is required. Install with: pip install nova-pdf[zhipu]" + "zai-sdk is required. Install with: pip install markitdown-glmocr[zhipu]" ) if config: - self.api_key = api_key or config.zhipu_api_key + self.api_key = api_key or config.api_key self.model = model or config.model self.timeout = timeout or config.timeout else: - config = NovaPdfConfig.load() - self.api_key = api_key or config.zhipu_api_key + config = GlmOcrConfig.load() + self.api_key = api_key or config.api_key self.model = model self.timeout = timeout if not self.api_key: raise ValueError( - "API key is required. Set NOVA_ZHIPU_API_KEY environment variable" + "API key is required. 
Set GLMOCR_API_KEY environment variable" ) self.client = ZhipuAiClient(api_key=self.api_key) @@ -199,4 +199,4 @@ def _convert_html_table(self, html: str) -> str: if i == 0: md_lines.append("|" + "|".join(["---"] * max_cols) + "|") - return "\n".join(md_lines) + return "\n".join(md_lines) \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/_config.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py similarity index 58% rename from packages/nova-pdf/src/nova_pdf/_config.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_config.py index 305ce1894..44c72da22 100644 --- a/packages/nova-pdf/src/nova_pdf/_config.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py @@ -1,4 +1,4 @@ -"""Configuration management for nova-pdf.""" +"""Configuration management for markitdown-glmocr.""" import os from pathlib import Path @@ -12,11 +12,11 @@ @dataclass -class NovaPdfConfig: - """nova-pdf configuration.""" +class GlmOcrConfig: + """markitdown-glmocr configuration.""" # API 配置 - zhipu_api_key: str = "" + api_key: str = "" # OCR 配置 model: str = "glm-ocr" @@ -27,11 +27,11 @@ class NovaPdfConfig: force_ai: bool = False @classmethod - def load(cls, config_path: Optional[str] = None) -> "NovaPdfConfig": + def load(cls, config_path: Optional[str] = None) -> "GlmOcrConfig": """ Load configuration from multiple sources (priority high to low): 1. Environment variables - 2. Config file (pyproject.toml or nova-pdf.toml) + 2. Config file (pyproject.toml or markitdown-glmocr.toml) 3. 
Default values """ config = cls() @@ -53,10 +53,10 @@ def _load_from_file(self, config_path: Optional[str] = None): # Current directory search_paths.append(Path("pyproject.toml")) - search_paths.append(Path("nova-pdf.toml")) + search_paths.append(Path("markitdown-glmocr.toml")) # User config directory - search_paths.append(Path.home() / ".config" / "nova-pdf" / "config.toml") + search_paths.append(Path.home() / ".config" / "markitdown-glmocr" / "config.toml") for path in search_paths: if path.exists(): @@ -64,11 +64,11 @@ def _load_from_file(self, config_path: Optional[str] = None): with open(path, "rb") as f: data = tomllib.load(f) - # Read [tool.nova-pdf] section - if "tool" in data and "nova-pdf" in data["tool"]: - self._apply_config(data["tool"]["nova-pdf"]) - elif "nova-pdf" in data: - self._apply_config(data["nova-pdf"]) + # Read [tool.markitdown-glmocr] section + if "tool" in data and "markitdown-glmocr" in data["tool"]: + self._apply_config(data["tool"]["markitdown-glmocr"]) + elif "markitdown-glmocr" in data: + self._apply_config(data["markitdown-glmocr"]) break except Exception: @@ -77,7 +77,7 @@ def _load_from_file(self, config_path: Optional[str] = None): def _apply_config(self, data: dict): """Apply config from dict.""" if "api_key" in data: - self.zhipu_api_key = data["api_key"] + self.api_key = data["api_key"] if "model" in data: self.model = data["model"] if "dpi" in data: @@ -89,13 +89,13 @@ def _apply_config(self, data: dict): def _load_from_env(self): """Load from environment variables (highest priority).""" - if os.environ.get("NOVA_ZHIPU_API_KEY"): - self.zhipu_api_key = os.environ["NOVA_ZHIPU_API_KEY"] - if os.environ.get("NOVA_MODEL"): - self.model = os.environ["NOVA_MODEL"] - if os.environ.get("NOVA_DPI"): - self.dpi = int(os.environ["NOVA_DPI"]) - if os.environ.get("NOVA_TIMEOUT"): - self.timeout = int(os.environ["NOVA_TIMEOUT"]) - if os.environ.get("NOVA_FORCE_AI"): - self.force_ai = os.environ["NOVA_FORCE_AI"].lower() in ("true", "1", 
"yes") + if os.environ.get("GLMOCR_API_KEY"): + self.api_key = os.environ["GLMOCR_API_KEY"] + if os.environ.get("GLMOCR_MODEL"): + self.model = os.environ["GLMOCR_MODEL"] + if os.environ.get("GLMOCR_DPI"): + self.dpi = int(os.environ["GLMOCR_DPI"]) + if os.environ.get("GLMOCR_TIMEOUT"): + self.timeout = int(os.environ["GLMOCR_TIMEOUT"]) + if os.environ.get("GLMOCR_FORCE_AI"): + self.force_ai = os.environ["GLMOCR_FORCE_AI"].lower() in ("true", "1", "yes") \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py similarity index 75% rename from packages/nova-pdf/src/nova_pdf/_converter.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index 7ee1dd320..f1434c115 100644 --- a/packages/nova-pdf/src/nova_pdf/_converter.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -1,4 +1,4 @@ -"""Nova PDF Converter - Intelligent PDF to Markdown conversion.""" +"""GlmOcr PDF Converter - Intelligent PDF to Markdown conversion.""" import io import sys @@ -29,14 +29,14 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf"] -class NovaPdfConverter(DocumentConverter): +class GlmOcrPdfConverter(DocumentConverter): """ - 智能 PDF 转换器 + Intelligent PDF converter using glm-ocr. - 特性: - - 自动检测每页内容类型(纯文本 vs 包含图片/表格) - - 纯文本页面使用默认解析(pdfplumber/pdfminer) - - 复杂页面截图后调用 AI 转换为 Markdown + Features: + - Auto-detect page content type (plain text vs images/tables) + - Plain text pages use default parser (pdfplumber/pdfminer) + - Complex pages use AI screenshot conversion to Markdown """ def __init__( @@ -46,12 +46,12 @@ def __init__( force_ai: bool = False, ): """ - 初始化转换器 + Initialize converter. 
Args: - ai_service: AI 服务实例 - dpi: 截图分辨率(默认 150) - force_ai: 强制所有页面使用 AI(默认 False) + ai_service: AI service instance + dpi: Screenshot DPI (default: 150) + force_ai: Force all pages to use AI (default: False) """ self.ai_service = ai_service self.dpi = dpi @@ -92,47 +92,47 @@ def convert( _dependency_exc_info[2] ) - # 获取 AI 服务(从 kwargs 或实例) + # Get AI service (from kwargs or instance) ai_service = kwargs.get("ai_service") or self.ai_service - # 读取 PDF + # Read PDF pdf_stream = io.BytesIO(file_stream.read()) markdown_parts = [] try: with pdfplumber.open(pdf_stream) as pdf: for page_num, page in enumerate(pdf.pages): - # 分析页面类型 + # Analyze page type page_type = analyze_page(page) - # 根据类型选择处理方式 + # Choose processing method based on type if self.force_ai or page_type != PageType.PLAIN_TEXT: - # 复杂内容:截图 + AI + # Complex content: screenshot + AI if ai_service: markdown = self._convert_with_ai( page, page_num, ai_service ) else: - # 无 AI 服务,回退到默认解析 + # No AI service, fallback to default markdown = self._extract_text_with_tables(page) else: - # 纯文本:默认解析 + # Plain text: default parser markdown = self._extract_text_with_tables(page) if markdown.strip(): markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") - # 释放页面资源 + # Release page resources page.close() markdown = "\n\n".join(markdown_parts).strip() except Exception: - # 异常情况:回退到 pdfminer + # Exception: fallback to pdfminer pdf_stream.seek(0) markdown = pdfminer.high_level.extract_text(pdf_stream) or "" - # 最终回退 + # Final fallback if not markdown: pdf_stream.seek(0) markdown = pdfminer.high_level.extract_text(pdf_stream) or "" @@ -146,52 +146,52 @@ def _convert_with_ai( ai_service: AIService, ) -> str: """ - 使用 AI 转换页面 + Convert page using AI. 
Args: - page: pdfplumber 页面对象 - page_num: 页码 - ai_service: AI 服务 + page: pdfplumber page object + page_num: Page number + ai_service: AI service Returns: - str: Markdown 内容 + str: Markdown content """ try: - # 截图 + # Screenshot img_stream = render_page_to_image(page, self.dpi) - # 调用 AI(文件名使用页码) + # Call AI (filename uses page number) filename = f"page_{page_num + 1}.png" result = ai_service.image_to_markdown(img_stream, filename=filename) if result.success and result.text.strip(): return result.text else: - # AI 失败,回退到默认解析 + # AI failed, fallback to default return self._extract_text_with_tables(page) except Exception: - # 异常情况,回退到默认解析 + # Exception, fallback to default return self._extract_text_with_tables(page) def _extract_text_with_tables(self, page: Any) -> str: """ - 提取文本和表格 + Extract text and tables. Args: - page: pdfplumber 页面对象 + page: pdfplumber page object Returns: - str: Markdown 内容 + str: Markdown content """ parts = [] - # 提取文本 + # Extract text text = page.extract_text() or "" if text.strip(): parts.append(text.strip()) - # 提取表格 + # Extract tables try: tables = page.extract_tables() if tables: @@ -207,45 +207,45 @@ def _extract_text_with_tables(self, page: Any) -> str: def _table_to_markdown(self, table: list[list[str]]) -> str: """ - 将表格转换为 Markdown + Convert table to Markdown. 
Args: - table: 2D 列表 + table: 2D list Returns: - str: Markdown 表格 + str: Markdown table """ if not table: return "" - # 过滤 None 值 + # Filter None values table = [[cell if cell is not None else "" for cell in row] for row in table] - # 过滤空行 + # Filter empty rows table = [row for row in table if any(cell.strip() for cell in row)] if not table: return "" - # 计算列宽 + # Calculate column widths col_widths = [ max(len(str(row[i])) if i < len(row) else 0 for row in table) for i in range(max(len(row) for row in table)) ] - # 格式化表格 + # Format table lines = [] for row_idx, row in enumerate(table): - # 补齐列数 + # Pad columns padded_row = row + [""] * (len(col_widths) - len(row)) line = "| " + " | ".join( str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) ) + " |" lines.append(line) - # 添加分隔行 + # Add separator if row_idx == 0: sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|" lines.append(sep) - return "\n".join(lines) + return "\n".join(lines) \ No newline at end of file diff --git a/packages/nova-pdf/src/nova_pdf/_page_analyzer.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py similarity index 100% rename from packages/nova-pdf/src/nova_pdf/_page_analyzer.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py diff --git a/packages/nova-pdf/src/nova_pdf/_page_renderer.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py similarity index 100% rename from packages/nova-pdf/src/nova_pdf/_page_renderer.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py diff --git a/packages/nova-pdf/src/nova_pdf/_plugin.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py similarity index 74% rename from packages/nova-pdf/src/nova_pdf/_plugin.py rename to packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py index 08a4fd8a0..8abf29ccb 100644 --- a/packages/nova-pdf/src/nova_pdf/_plugin.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py @@ 
-1,11 +1,11 @@ -"""Plugin registration for nova-pdf.""" +"""Plugin registration for markitdown-glmocr.""" from typing import Any from markitdown import MarkItDown -from ._config import NovaPdfConfig +from ._config import GlmOcrConfig from ._ai_service import AIService -from ._converter import NovaPdfConverter +from ._converter import GlmOcrPdfConverter __plugin_interface_version__ = 1 @@ -13,7 +13,7 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: """ - Register nova-pdf converter. + Register markitdown-glmocr converter. Config sources (priority high to low): 1. kwargs parameters @@ -22,10 +22,10 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: 4. Default values """ # Load config - config = NovaPdfConfig.load() + config = GlmOcrConfig.load() # kwargs override config - api_key = kwargs.get("api_key") or kwargs.get("zhipu_api_key") or config.zhipu_api_key + api_key = kwargs.get("api_key") or config.api_key model = kwargs.get("model", config.model) dpi = kwargs.get("dpi", config.dpi) force_ai = kwargs.get("force_ai", config.force_ai) @@ -44,13 +44,13 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: pass # Register converter - PRIORITY_NOVA_PDF = -1.0 + PRIORITY_GLMOCR = -1.0 markitdown.register_converter( - NovaPdfConverter( + GlmOcrPdfConverter( ai_service=ai_service, dpi=dpi, force_ai=force_ai, ), - priority=PRIORITY_NOVA_PDF, - ) + priority=PRIORITY_GLMOCR, + ) \ No newline at end of file diff --git a/packages/nova-pdf/tests/__init__.py b/packages/markitdown-glmocr/tests/__init__.py similarity index 100% rename from packages/nova-pdf/tests/__init__.py rename to packages/markitdown-glmocr/tests/__init__.py diff --git a/packages/nova-pdf/tests/test_ai_service.py b/packages/markitdown-glmocr/tests/test_ai_service.py similarity index 81% rename from packages/nova-pdf/tests/test_ai_service.py rename to packages/markitdown-glmocr/tests/test_ai_service.py index 7c7636848..dbbe06d50 100644 --- 
a/packages/nova-pdf/tests/test_ai_service.py +++ b/packages/markitdown-glmocr/tests/test_ai_service.py @@ -4,8 +4,8 @@ import pytest from unittest.mock import MagicMock, patch -from nova_pdf._ai_service import AIService, AIResult -from nova_pdf._config import NovaPdfConfig +from markitdown_glmocr._ai_service import AIService, AIResult +from markitdown_glmocr._config import GlmOcrConfig class TestAIService: @@ -13,13 +13,13 @@ class TestAIService: def test_missing_zai_sdk_raises_error(self): """Missing zai-sdk raises error.""" - with patch("nova_pdf._ai_service.ZhipuAiClient", None): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", None): with pytest.raises(ImportError, match="zai-sdk is required"): AIService(api_key="test") def test_missing_api_key_raises_error(self): """Missing API key raises error.""" - with patch("nova_pdf._ai_service.ZhipuAiClient", MagicMock()): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", MagicMock()): with pytest.raises(ValueError, match="API key is required"): AIService(api_key="") @@ -32,7 +32,7 @@ def test_successful_conversion(self): mock_response.layout_details = [] mock_client.layout_parsing.create.return_value = mock_response - with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): service = AIService(api_key="test-api-key") result = service.image_to_markdown(io.BytesIO(b"fake-image")) @@ -47,7 +47,7 @@ def test_html_table_conversion(self): mock_response.layout_details = [] mock_client.layout_parsing.create.return_value = mock_response - with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): service = AIService(api_key="test-api-key") result = service.image_to_markdown(io.BytesIO(b"fake-image")) @@ -64,7 +64,7 @@ def test_empty_result(self): mock_response.layout_details = [] 
mock_client.layout_parsing.create.return_value = mock_response - with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): service = AIService(api_key="test-api-key") result = service.image_to_markdown(io.BytesIO(b"fake-image")) @@ -76,7 +76,7 @@ def test_error_handling(self): mock_client = MagicMock() mock_client.layout_parsing.create.side_effect = Exception("API Error") - with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): service = AIService(api_key="test-api-key") result = service.image_to_markdown(io.BytesIO(b"fake-image")) @@ -91,7 +91,7 @@ def test_base64_encoding(self): mock_response.layout_details = [] mock_client.layout_parsing.create.return_value = mock_response - with patch("nova_pdf._ai_service.ZhipuAiClient", return_value=mock_client): + with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): service = AIService(api_key="test-api-key") result = service.image_to_markdown(io.BytesIO(b"fake-image"), "test.png") @@ -100,4 +100,4 @@ def test_base64_encoding(self): # Verify data URI was used call_args = mock_client.layout_parsing.create.call_args file_arg = call_args.kwargs['file'] - assert file_arg.startswith("data:image/png;base64,") + assert file_arg.startswith("data:image/png;base64,") \ No newline at end of file diff --git a/packages/nova-pdf/tests/test_analyzer.py b/packages/markitdown-glmocr/tests/test_analyzer.py similarity index 98% rename from packages/nova-pdf/tests/test_analyzer.py rename to packages/markitdown-glmocr/tests/test_analyzer.py index 137e486ab..6841f0b44 100644 --- a/packages/nova-pdf/tests/test_analyzer.py +++ b/packages/markitdown-glmocr/tests/test_analyzer.py @@ -3,7 +3,7 @@ import pytest from unittest.mock import MagicMock -from nova_pdf._page_analyzer import ( +from 
markitdown_glmocr._page_analyzer import ( PageType, detect_images, detect_tables, diff --git a/packages/nova-pdf/tests/test_converter.py b/packages/markitdown-glmocr/tests/test_converter.py similarity index 72% rename from packages/nova-pdf/tests/test_converter.py rename to packages/markitdown-glmocr/tests/test_converter.py index ea13266f7..d48c75f2d 100644 --- a/packages/nova-pdf/tests/test_converter.py +++ b/packages/markitdown-glmocr/tests/test_converter.py @@ -1,44 +1,44 @@ -"""Tests for nova-pdf converter.""" +"""Tests for markitdown-glmocr converter.""" import io import pytest from unittest.mock import MagicMock, patch -from nova_pdf._converter import NovaPdfConverter -from nova_pdf._ai_service import AIService, AIResult -from nova_pdf._page_analyzer import PageType +from markitdown_glmocr._converter import GlmOcrPdfConverter +from markitdown_glmocr._ai_service import AIService, AIResult +from markitdown_glmocr._page_analyzer import PageType -class TestNovaPdfConverter: - """转换器测试""" +class TestGlmOcrPdfConverter: + """Converter tests.""" def test_accepts_pdf_extension(self): - """接受 .pdf 扩展名""" - converter = NovaPdfConverter() + """Accept .pdf extension.""" + converter = GlmOcrPdfConverter() stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=".pdf", mimetype=None) assert converter.accepts(stream, stream_info) is True def test_accepts_pdf_mimetype(self): - """接受 PDF MIME 类型""" - converter = NovaPdfConverter() + """Accept PDF MIME type.""" + converter = GlmOcrPdfConverter() stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=None, mimetype="application/pdf") assert converter.accepts(stream, stream_info) is True def test_rejects_non_pdf(self): - """拒绝非 PDF 文件""" - converter = NovaPdfConverter() + """Reject non-PDF files.""" + converter = GlmOcrPdfConverter() stream = io.BytesIO(b"not a pdf") stream_info = MagicMock(extension=".txt", mimetype="text/plain") assert converter.accepts(stream, stream_info) is False def 
test_table_to_markdown(self): - """表格转 Markdown""" - converter = NovaPdfConverter() + """Table to Markdown conversion.""" + converter = GlmOcrPdfConverter() table = [ ["Name", "Age", "City"], ["Alice", "25", "Beijing"], @@ -50,13 +50,13 @@ def test_table_to_markdown(self): assert "|" in result assert "Name" in result assert "Alice" in result - assert "---" in result # 分隔行 + assert "---" in result # Separator def test_plain_text_page_without_ai(self): - """纯文本页面不使用 AI""" - converter = NovaPdfConverter() + """Plain text page without AI.""" + converter = GlmOcrPdfConverter() - # 模拟页面 + # Mock page page = MagicMock() page.images = [] page.objects = {} @@ -64,11 +64,11 @@ def test_plain_text_page_without_ai(self): page.extract_text.return_value = "Hello World" page.close = MagicMock() - # 模拟 PDF + # Mock PDF mock_pdf = MagicMock() mock_pdf.pages = [page] - with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: mock_open.return_value.__enter__.return_value = mock_pdf stream = io.BytesIO(b"%PDF-1.4") @@ -77,17 +77,17 @@ def test_plain_text_page_without_ai(self): assert "Hello World" in result.markdown def test_complex_page_with_ai(self): - """复杂页面使用 AI""" - # 模拟 AI 服务 + """Complex page with AI.""" + # Mock AI service ai_service = MagicMock(spec=AIService) ai_service.image_to_markdown.return_value = AIResult( success=True, text="# AI Generated\n\nThis is from AI." 
) - converter = NovaPdfConverter(ai_service=ai_service) + converter = GlmOcrPdfConverter(ai_service=ai_service) - # 模拟页面 + # Mock page page = MagicMock() page.images = [MagicMock()] page.extract_tables.return_value = [] @@ -95,35 +95,35 @@ def test_complex_page_with_ai(self): page.to_image.return_value.original = MagicMock() page.close = MagicMock() - # 模拟图片保存 + # Mock image save img_stream = io.BytesIO() page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") - # 模拟 PDF + # Mock PDF mock_pdf = MagicMock() mock_pdf.pages = [page] - with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: mock_open.return_value.__enter__.return_value = mock_pdf stream = io.BytesIO(b"%PDF-1.4") result = converter.convert(stream, MagicMock()) - # 应该调用 AI + # Should call AI ai_service.image_to_markdown.assert_called_once() assert "AI Generated" in result.markdown def test_force_ai_mode(self): - """强制 AI 模式""" + """Force AI mode.""" ai_service = MagicMock(spec=AIService) ai_service.image_to_markdown.return_value = AIResult( success=True, text="AI result" ) - converter = NovaPdfConverter(ai_service=ai_service, force_ai=True) + converter = GlmOcrPdfConverter(ai_service=ai_service, force_ai=True) - # 即使是纯文本页面 + # Even plain text page page = MagicMock() page.images = [] page.objects = {} @@ -138,17 +138,17 @@ def test_force_ai_mode(self): mock_pdf = MagicMock() mock_pdf.pages = [page] - with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: mock_open.return_value.__enter__.return_value = mock_pdf stream = io.BytesIO(b"%PDF-1.4") result = converter.convert(stream, MagicMock()) - # 应该调用 AI(因为 force_ai=True) + # Should call AI (because force_ai=True) ai_service.image_to_markdown.assert_called_once() def test_fallback_on_ai_failure(self): - """AI 失败时回退到默认解析""" + """Fallback on AI failure.""" ai_service = 
MagicMock(spec=AIService) ai_service.image_to_markdown.return_value = AIResult( success=False, @@ -156,7 +156,7 @@ def test_fallback_on_ai_failure(self): error="API error" ) - converter = NovaPdfConverter(ai_service=ai_service) + converter = GlmOcrPdfConverter(ai_service=ai_service) page = MagicMock() page.images = [MagicMock()] @@ -171,11 +171,11 @@ def test_fallback_on_ai_failure(self): mock_pdf = MagicMock() mock_pdf.pages = [page] - with patch("nova_pdf._converter.pdfplumber.open") as mock_open: + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: mock_open.return_value.__enter__.return_value = mock_pdf stream = io.BytesIO(b"%PDF-1.4") result = converter.convert(stream, MagicMock()) - # 应该回退到默认文本 - assert "Fallback text" in result.markdown + # Should fallback to default text + assert "Fallback text" in result.markdown \ No newline at end of file From 8f2dd6a1182d0c3e5a2dab2e73249eff230c0d70 Mon Sep 17 00:00:00 2001 From: hankl Date: Sun, 10 May 2026 10:08:00 +0800 Subject: [PATCH 04/15] Update spec.md --- docs/spec.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/spec.md b/docs/spec.md index 660e90a15..d2d624ddf 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -1,3 +1,4 @@ +# sprint0 # 目标 重构调用ai接口解析PDF的功能:对包含图片/表格的页面截图后调用 AI 接口转 Markdown @@ -28,4 +29,7 @@ print(response) 详细文档:https://docs.bigmodel.cn/cn/guide/models/vlm/glm-ocr#python -先设计重构方案 \ No newline at end of file +先设计重构方案 + +## sprint1 +重命名:nova-pdf 改成markitdown-glmocr From f81ef9f34c2e0a266c3daa7ec3407e02967aecad Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 12 May 2026 16:54:35 +0800 Subject: [PATCH 05/15] =?UTF-8?q?=E6=9B=BF=E6=8D=A2=E4=B8=BAglmsdk?= =?UTF-8?q?=E6=9D=A5=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/markitdown-glmocr/README.md | 168 +++++++++------ packages/markitdown-glmocr/pyproject.toml | 20 +- .../src/markitdown_glmocr/__init__.py | 9 +- 
.../src/markitdown_glmocr/_ai_service.py | 202 ------------------ .../src/markitdown_glmocr/_config.py | 108 ++-------- .../src/markitdown_glmocr/_converter.py | 190 +++++++++------- .../src/markitdown_glmocr/_page_analyzer.py | 117 ---------- .../src/markitdown_glmocr/_page_renderer.py | 32 --- .../src/markitdown_glmocr/_plugin.py | 41 +--- 9 files changed, 240 insertions(+), 647 deletions(-) delete mode 100644 packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py delete mode 100644 packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py delete mode 100644 packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py diff --git a/packages/markitdown-glmocr/README.md b/packages/markitdown-glmocr/README.md index 746f923fe..d0193d2ed 100644 --- a/packages/markitdown-glmocr/README.md +++ b/packages/markitdown-glmocr/README.md @@ -1,13 +1,14 @@ # markitdown-glmocr -智能 PDF 转 Markdown 插件,使用 glm-ocr AI 驱动的图片和表格提取。 +智能 PDF 转 Markdown 插件,使用 glmocr SDK(智谱 GLM-OCR)驱动的图片和表格提取。 ## 特性 - 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格) - 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低 -- 🤖 **AI 增强**:复杂页面(图片、表格)使用 glm-ocr 转换为 Markdown -- ⚙️ **灵活配置**:支持配置文件、环境变量等多种配置方式 +- 🤖 **AI 增强**:复杂页面(图片、表格)使用 glmocr SDK 转换为 Markdown +- ⚡ **一行调用**:`glmocr.parse("document.pdf")` 完成 OCR,无需手动截图编码 +- 📊 **结构化输出**:返回 Markdown + JSON 结构(含区域标签、边界框) ## 安装 @@ -16,45 +17,35 @@ pip install markitdown-glmocr # 安装 AI 功能 -pip install markitdown-glmocr[zhipu] +pip install markitdown-glmocr[glmocr] ``` ## 配置 -### 本地敏感配置(推荐) - -项目根目录的 `.secrets.local` 文件存储敏感信息,此文件不会被提交到 Git: +### 环境变量(推荐) ```bash -# 创建 .secrets.local 文件 -echo 'GLMOCR_API_KEY="your-api-key"' > .secrets.local +# 必需:智谱 API Key +export ZHIPU_API_KEY="your-zhipu-api-key" -# 加载配置 -source .secrets.local +# 可选 +export GLMOCR_MODEL="glm-ocr" # 模型名称 +export GLMOCR_TIMEOUT="600" # 请求超时(秒) +export GLMOCR_ENABLE_LAYOUT="true" # 启用布局检测 +export GLMOCR_LOG_LEVEL="INFO" # 日志级别 ``` -### 环境变量 - -```bash -# 必需 -export 
GLMOCR_API_KEY="your-zhipu-api-key" +### 配置优先级 -# 可选 -export GLMOCR_MODEL="glm-ocr" -export GLMOCR_DPI="150" -export GLMOCR_TIMEOUT="120" +``` +构造函数参数 > 环境变量 > .env 文件 > config.yaml > 内置默认值 ``` -### 配置文件 - -在 `pyproject.toml` 中配置默认值: +### 本地敏感配置 -```toml -[tool.markitdown-glmocr] -model = "glm-ocr" -dpi = 150 -timeout = 120 -force_ai = false +```bash +# 创建 .env 文件(自动读取) +echo "ZHIPU_API_KEY=your-api-key" > .env ``` ## 使用方法 @@ -62,8 +53,8 @@ force_ai = false ### 命令行(推荐) ```bash -# 1. 加载敏感配置 -source .secrets.local +# 1. 设置 API Key +export ZHIPU_API_KEY="sk-xxx" # 2. 查看已安装插件 markitdown --list-plugins @@ -79,50 +70,73 @@ markitdown -p document.pdf -o output.md ```python from markitdown import MarkItDown +from markitdown_glmocr import GlmOcrConverter -# 方式1:自动加载配置 -md = MarkItDown(enable_plugins=True) +# 方式1:自动从环境变量读取 ZHIPU_API_KEY +converter = GlmOcrConverter() +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) result = md.convert("document.pdf") print(result.markdown) -# 方式2:手动配置 -from markitdown_glmocr import GlmOcrConfig, AIService, GlmOcrPdfConverter - -config = GlmOcrConfig.load() -ai_service = AIService( - api_key="your-api-key", - model="glm-ocr", -) - -converter = GlmOcrPdfConverter( - ai_service=ai_service, - dpi=150, -) - +# 方式2:手动传入 API Key +converter = GlmOcrConverter(api_key="sk-xxx") md = MarkItDown(enable_plugins=False) md.register_converter(converter, priority=-1.0) result = md.convert("document.pdf") +print(result.markdown) + +# 方式3:直接使用 glmocr SDK(更简单) +import glmocr +result = glmocr.parse("document.pdf") +print(result.markdown_result) # Markdown 输出 +print(result.json_result) # 结构化 JSON(区域标签、边界框) +``` + +### 处理结果 + +```python +import glmocr + +result = glmocr.parse("report.pdf") + +# 获取 Markdown +print(result.markdown_result) + +# 获取结构化数据(按页分组) +for page_idx, page_regions in enumerate(result.json_result): + print(f"Page {page_idx + 1}: {len(page_regions)} regions") + for region in page_regions: + print(f" 
[{region['label']}] {region['content'][:60]}") + +# 按标签筛选 +tables = [r for r in result.json_result[0] if r["label"] == "table"] +formulas = [r for r in result.json_result[0] if r["label"] == "formula"] + +# 保存到磁盘 +result.save(output_dir="./output") ``` ## 配置选项 -### GlmOcrConfig 参数 +### GlmOcrConverter 参数 | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| -| `api_key` | str | 环境变量 `GLMOCR_API_KEY` | 智谱 API Key | -| `model` | str | "glm-ocr" | 模型名称 | -| `dpi` | int | 150 | 截图分辨率 | -| `timeout` | int | 120 | 请求超时(秒) | +| `api_key` | str | 环境变量 `ZHIPU_API_KEY` | 智谱 API Key | +| `timeout` | int | 1800 | 请求超时(秒) | +| `enable_layout` | bool | False | 启用布局检测 | | `force_ai` | bool | False | 强制所有页面使用 AI | -### GlmOcrPdfConverter 参数 +### 环境变量 -| 参数 | 类型 | 默认值 | 说明 | -|------|------|--------|------| -| `ai_service` | AIService | None | AI 服务实例 | -| `dpi` | int | 150 | 截图分辨率 | -| `force_ai` | bool | False | 强制所有页面使用 AI | +| 变量 | 说明 | 示例 | +|------|------|------| +| `ZHIPU_API_KEY` | API Key(必需) | `sk-abc123` | +| `GLMOCR_MODEL` | 模型名称 | `glm-ocr` | +| `GLMOCR_TIMEOUT` | 请求超时(秒) | `600` | +| `GLMOCR_ENABLE_LAYOUT` | 布局检测 | `true` | +| `GLMOCR_LOG_LEVEL` | 日志级别 | `INFO` | ## 工作原理 @@ -136,22 +150,38 @@ PDF 输入 │ └─ 复杂页面(图片/表格) │ - ├─ 截图渲染 (150 DPI) - │ - ├─ base64 编码 - │ - └─ 调用 glm-ocr API 转 Markdown + └─► glmocr.parse() 一行调用 + │ + ├─ 内置截图渲染 + ├─ 内置 base64 编码 + └─ 内置 OCR 识别 │ ▼ 合并输出完整 Markdown ``` +## 区域标签(json_result) + +glmocr SDK 返回的结构化数据支持以下标签: + +| 标签 | 说明 | +|------|------| +| `title` | 标题 | +| `text` | 正文文本 | +| `table` | 表格 | +| `figure` | 图片 | +| `formula` | 公式 | +| `header` | 页眉 | +| `footer` | 页脚 | +| `page_number` | 页码 | +| `reference` | 参考文献 | +| `seal` | 印章 | + ## 技术架构 -- **zai-sdk**: 智谱 AI 官方 SDK -- **glm-ocr**: 智谱 OCR 模型,支持表格、图片识别 -- **pdfplumber**: PDF 页面分析和截图 -- **pdfminer**: 纯文本页面提取 +- **glmocr**: 智谱 OCR SDK,一行代码完成 PDF/图片解析 +- **pdfplumber**: PDF 页面分析和纯文本提取 +- **pdfminer**: 纯文本页面提取备用 ## 依赖 @@ -159,8 +189,8 @@ PDF 输入 - `pdfplumber>=0.11.9` - PDF 解析和截图 - 
`pdfminer.six>=20251230` - 文本提取备用 - `Pillow>=9.0.0` - 图像处理 -- `zai-sdk>=0.2.2` - 智谱 AI SDK(可选,AI 功能需要) +- `glmocr` - 智谱 OCR SDK(可选,AI 功能需要) ## 许可证 -MIT +MIT \ No newline at end of file diff --git a/packages/markitdown-glmocr/pyproject.toml b/packages/markitdown-glmocr/pyproject.toml index a9277c272..ea06823ce 100644 --- a/packages/markitdown-glmocr/pyproject.toml +++ b/packages/markitdown-glmocr/pyproject.toml @@ -5,11 +5,11 @@ build-backend = "hatchling.build" [project] name = "markitdown-glmocr" dynamic = ["version"] -description = "Intelligent PDF to Markdown converter with glm-ocr AI-powered image/table extraction" +description = "Intelligent PDF to Markdown converter using glmocr SDK" readme = "README.md" requires-python = ">=3.10" license = "MIT" -keywords = ["markitdown", "pdf", "ocr", "ai", "llm", "vision", "glm-ocr"] +keywords = ["markitdown", "pdf", "ocr", "ai", "llm", "vision", "glm-ocr", "glmocr"] authors = [ { name = "Contributors", email = "noreply@github.com" }, ] @@ -27,12 +27,11 @@ dependencies = [ "pdfminer.six>=20251230", "pdfplumber>=0.11.9", "Pillow>=9.0.0", - "tomli>=2.0.0;python_version<'3.11'", ] [project.optional-dependencies] -zhipu = [ - "zai-sdk>=0.2.2", +glmocr = [ + "glmocr", ] dev = [ "pytest>=7.0.0", @@ -58,13 +57,4 @@ packages = ["src/markitdown_glmocr"] [tool.pytest.ini_options] testpaths = ["tests"] -python_files = ["test_*.py"] - -# markitdown-glmocr configuration -[tool.markitdown-glmocr] -# API key - set via environment variable GLMOCR_API_KEY -api_key = "" -model = "glm-ocr" -dpi = 150 -timeout = 120 -force_ai = false \ No newline at end of file +python_files = ["test_*.py"] \ No newline at end of file diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py index 55b9caf87..45512966a 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/__init__.py @@ -1,13 +1,12 @@ 
+"""markitdown-glmocr: Intelligent PDF to Markdown converter using glmocr SDK.""" + from ._plugin import register_converters from ._config import GlmOcrConfig -from ._ai_service import AIService, AIResult -from ._converter import GlmOcrPdfConverter +from ._converter import GlmOcrConverter __plugin_interface_version__ = 1 __all__ = [ "register_converters", "GlmOcrConfig", - "AIService", - "AIResult", - "GlmOcrPdfConverter", + "GlmOcrConverter", ] \ No newline at end of file diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py deleted file mode 100644 index 6c8e362a4..000000000 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_ai_service.py +++ /dev/null @@ -1,202 +0,0 @@ -"""AI service using zai-sdk and glm-ocr.""" - -import base64 -import re -from dataclasses import dataclass -from typing import BinaryIO, Optional - -from ._config import GlmOcrConfig - -try: - from zai import ZhipuAiClient -except ImportError: - ZhipuAiClient = None - - -@dataclass -class AIResult: - """Result from AI conversion.""" - text: str - success: bool = True - error: Optional[str] = None - - -class AIService: - """ - AI Service using zai-sdk + glm-ocr. - - Features: - - Direct API call to glm-ocr layout_parsing - - Support image bytes via base64 data URI - - Return Markdown or HTML format content - """ - - def __init__( - self, - api_key: Optional[str] = None, - model: str = "glm-ocr", - timeout: int = 120, - config: Optional[GlmOcrConfig] = None, - ): - if ZhipuAiClient is None: - raise ImportError( - "zai-sdk is required. 
Install with: pip install markitdown-glmocr[zhipu]" - ) - - if config: - self.api_key = api_key or config.api_key - self.model = model or config.model - self.timeout = timeout or config.timeout - else: - config = GlmOcrConfig.load() - self.api_key = api_key or config.api_key - self.model = model - self.timeout = timeout - - if not self.api_key: - raise ValueError( - "API key is required. Set GLMOCR_API_KEY environment variable" - ) - - self.client = ZhipuAiClient(api_key=self.api_key) - - def image_to_markdown( - self, - image_stream: BinaryIO, - filename: str = "page.png", - keep_html: bool = False, - ) -> AIResult: - """ - Convert image to Markdown using glm-ocr. - - Args: - image_stream: Image stream - filename: Filename (for content type detection) - keep_html: Keep HTML format for complex tables (default: False, convert to MD) - - Returns: - AIResult: Conversion result - """ - try: - image_stream.seek(0) - image_bytes = image_stream.read() - - base64_image = base64.b64encode(image_bytes).decode("utf-8") - content_type = "image/jpeg" if filename.lower().endswith((".jpg", ".jpeg")) else "image/png" - data_uri = f"data:{content_type};base64,{base64_image}" - - response = self.client.layout_parsing.create( - model=self.model, - file=data_uri - ) - - # Get HTML content - html = response.md_results or "" - - if not html and response.layout_details: - parts = [] - for detail_list in response.layout_details: - for detail in detail_list: - if detail.content: - parts.append(detail.content) - html = "\n".join(parts) - - # Convert to Markdown or keep HTML - if keep_html: - text = html.strip() - else: - text = self._html_to_markdown(html.strip()) if html else "" - - return AIResult(text=text, success=True) - - except Exception as e: - return AIResult(text="", success=False, error=str(e)) - finally: - image_stream.seek(0) - - def _html_to_markdown(self, html: str) -> str: - """Convert HTML to Markdown.""" - if not html: - return "" - - # Extract titles from
- titles = [] - div_pattern = r']*>(.*?)
' - for match in re.finditer(div_pattern, html, re.DOTALL | re.IGNORECASE): - title = re.sub(r'<[^>]+>', '', match.group(1)).strip() - if title: - titles.append(title) - - # Remove
from HTML - html = re.sub(div_pattern, '', html, flags=re.DOTALL | re.IGNORECASE) - - # Check for table - if ']+>', '', html).strip() - if titles: - return f"**{' '.join(titles)}**\n\n{text}" - return text - - def _convert_html_table(self, html: str) -> str: - """Convert HTML table to Markdown table.""" - # Parse rows - rows = [] - rowspan_cells = {} - - for row_idx, row_match in enumerate(re.finditer(r']*>(.*?)', html, re.DOTALL | re.IGNORECASE)): - cells = [] - col_idx = 0 - - # Fill rowspan cells - while (row_idx, col_idx) in rowspan_cells: - cells.append(rowspan_cells[(row_idx, col_idx)]) - col_idx += 1 - - # Parse cells - for cell_match in re.finditer(r']*)>(.*?)', row_match.group(1), re.DOTALL | re.IGNORECASE): - attrs, content = cell_match.groups() - content = re.sub(r'<[^>]+>', '', content).strip().replace('\n', ' ') - - rowspan = int(r.group(1)) if (r := re.search(r'rowspan\s*=\s*["\']?(\d+)', attrs, re.IGNORECASE)) else 1 - colspan = int(c.group(1)) if (c := re.search(r'colspan\s*=\s*["\']?(\d+)', attrs, re.IGNORECASE)) else 1 - - cells.append(content) - cells.extend([""] * (colspan - 1)) - - if rowspan > 1: - for r in range(1, rowspan): - for c in range(colspan): - rowspan_cells[(row_idx + r, col_idx + c)] = content - - col_idx += colspan - - # Fill remaining rowspan - while (row_idx, col_idx) in rowspan_cells: - cells.append(rowspan_cells[(row_idx, col_idx)]) - col_idx += 1 - - rows.append(cells) - - if not rows: - return "" - - # Normalize - max_cols = max(len(row) for row in rows) - for row in rows: - row.extend([""] * (max_cols - len(row))) - - # Simple output: first row as header - md_lines = [] - for i, row in enumerate(rows): - md_row = "| " + " | ".join(c or " " for c in row) + " |" - md_lines.append(md_row) - if i == 0: - md_lines.append("|" + "|".join(["---"] * max_cols) + "|") - - return "\n".join(md_lines) \ No newline at end of file diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py 
b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py index 44c72da22..d1122524b 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py @@ -1,101 +1,25 @@ -"""Configuration management for markitdown-glmocr.""" +"""Configuration for markitdown-glmocr.""" -import os -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -try: - import tomllib # Python 3.11+ -except ImportError: - import tomli as tomllib +from dataclasses import dataclass, field @dataclass class GlmOcrConfig: - """markitdown-glmocr configuration.""" - - # API 配置 - api_key: str = "" - - # OCR 配置 - model: str = "glm-ocr" - dpi: int = 150 - timeout: int = 120 - - # 处理策略 - force_ai: bool = False + """markitdown-glmocr configuration. - @classmethod - def load(cls, config_path: Optional[str] = None) -> "GlmOcrConfig": - """ - Load configuration from multiple sources (priority high to low): - 1. Environment variables - 2. Config file (pyproject.toml or markitdown-glmocr.toml) - 3. Default values - """ - config = cls() - - # 1. Load from config file - config._load_from_file(config_path) - - # 2. Environment variables override - config._load_from_env() - - return config + Configuration priority (high to low): + 1. Constructor kwargs + 2. Environment variables + 3. .env file + 4. 
Built-in defaults + """ - def _load_from_file(self, config_path: Optional[str] = None): - """Load from config file.""" - search_paths = [] - - if config_path: - search_paths.append(Path(config_path)) - - # Current directory - search_paths.append(Path("pyproject.toml")) - search_paths.append(Path("markitdown-glmocr.toml")) - - # User config directory - search_paths.append(Path.home() / ".config" / "markitdown-glmocr" / "config.toml") - - for path in search_paths: - if path.exists(): - try: - with open(path, "rb") as f: - data = tomllib.load(f) - - # Read [tool.markitdown-glmocr] section - if "tool" in data and "markitdown-glmocr" in data["tool"]: - self._apply_config(data["tool"]["markitdown-glmocr"]) - elif "markitdown-glmocr" in data: - self._apply_config(data["markitdown-glmocr"]) - - break - except Exception: - pass + # API configuration + api_key: str = "" # Reads from ZHIPU_API_KEY by default - def _apply_config(self, data: dict): - """Apply config from dict.""" - if "api_key" in data: - self.api_key = data["api_key"] - if "model" in data: - self.model = data["model"] - if "dpi" in data: - self.dpi = data["dpi"] - if "timeout" in data: - self.timeout = data["timeout"] - if "force_ai" in data: - self.force_ai = data["force_ai"] + # OCR configuration + timeout: int = 1800 + enable_layout: bool = False - def _load_from_env(self): - """Load from environment variables (highest priority).""" - if os.environ.get("GLMOCR_API_KEY"): - self.api_key = os.environ["GLMOCR_API_KEY"] - if os.environ.get("GLMOCR_MODEL"): - self.model = os.environ["GLMOCR_MODEL"] - if os.environ.get("GLMOCR_DPI"): - self.dpi = int(os.environ["GLMOCR_DPI"]) - if os.environ.get("GLMOCR_TIMEOUT"): - self.timeout = int(os.environ["GLMOCR_TIMEOUT"]) - if os.environ.get("GLMOCR_FORCE_AI"): - self.force_ai = os.environ["GLMOCR_FORCE_AI"].lower() in ("true", "1", "yes") \ No newline at end of file + # Processing strategy + force_ai: bool = False \ No newline at end of file diff --git 
a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index f1434c115..60a488704 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -7,9 +7,7 @@ from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE -from ._page_analyzer import PageType, analyze_page -from ._page_renderer import render_page_to_image -from ._ai_service import AIService +from ._config import GlmOcrConfig # Import dependencies _dependency_exc_info = None @@ -20,6 +18,14 @@ except ImportError: _dependency_exc_info = sys.exc_info() +# glmocr SDK +try: + import glmocr + from glmocr import GlmOcr +except ImportError: + glmocr = None + GlmOcr = None + ACCEPTED_MIME_TYPE_PREFIXES = [ "application/pdf", @@ -29,33 +35,63 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf"] -class GlmOcrPdfConverter(DocumentConverter): +class GlmOcrConverter(DocumentConverter): """ - Intelligent PDF converter using glm-ocr. + Intelligent PDF converter using glmocr SDK. Features: - Auto-detect page content type (plain text vs images/tables) - - Plain text pages use default parser (pdfplumber/pdfminer) - - Complex pages use AI screenshot conversion to Markdown + - Plain text pages use pdfplumber/pdfminer (fast, free) + - Complex pages use glmocr SDK for AI-powered OCR + - One-liner: glmocr.parse("document.pdf") handles everything """ def __init__( self, - ai_service: Optional[AIService] = None, - dpi: int = 150, + api_key: Optional[str] = None, + timeout: int = 1800, + enable_layout: bool = False, force_ai: bool = False, + config: Optional[GlmOcrConfig] = None, ): """ Initialize converter. 
Args: - ai_service: AI service instance - dpi: Screenshot DPI (default: 150) + api_key: Zhipu API key (reads from ZHIPU_API_KEY env var if not provided) + timeout: Request timeout in seconds (default: 1800) + enable_layout: Enable layout detection (default: False) force_ai: Force all pages to use AI (default: False) + config: Optional GlmOcrConfig instance """ - self.ai_service = ai_service - self.dpi = dpi - self.force_ai = force_ai + if glmocr is None: + raise ImportError( + "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]" + ) + + # Use config if provided + if config: + self.api_key = api_key or config.api_key + self.timeout = timeout if timeout != 1800 else config.timeout + self.enable_layout = enable_layout if enable_layout else config.enable_layout + self.force_ai = force_ai or config.force_ai + else: + self.api_key = api_key + self.timeout = timeout + self.enable_layout = enable_layout + self.force_ai = force_ai + + # Lazy init GlmOcr instance + self._glmocr: Optional[GlmOcr] = None + + def _get_glmocr(self) -> GlmOcr: + """Get or create GlmOcr instance.""" + if self._glmocr is None: + kwargs = {"timeout": self.timeout, "enable_layout": self.enable_layout} + if self.api_key: + kwargs["api_key"] = self.api_key + self._glmocr = GlmOcr(**kwargs) + return self._glmocr def accepts( self, @@ -92,9 +128,6 @@ def convert( _dependency_exc_info[2] ) - # Get AI service (from kwargs or instance) - ai_service = kwargs.get("ai_service") or self.ai_service - # Read PDF pdf_stream = io.BytesIO(file_stream.read()) markdown_parts = [] @@ -103,32 +136,25 @@ def convert( with pdfplumber.open(pdf_stream) as pdf: for page_num, page in enumerate(pdf.pages): # Analyze page type - page_type = analyze_page(page) - - # Choose processing method based on type - if self.force_ai or page_type != PageType.PLAIN_TEXT: - # Complex content: screenshot + AI - if ai_service: - markdown = self._convert_with_ai( - page, page_num, ai_service - ) - else: - # No AI service, 
fallback to default - markdown = self._extract_text_with_tables(page) + page_type = self._analyze_page(page) + + # Choose processing method + if self.force_ai or page_type != "plain_text": + # Complex content: use glmocr + markdown = self._convert_with_glmocr(page, page_num) else: - # Plain text: default parser + # Plain text: use pdfplumber markdown = self._extract_text_with_tables(page) if markdown.strip(): markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") - # Release page resources page.close() markdown = "\n\n".join(markdown_parts).strip() except Exception: - # Exception: fallback to pdfminer + # Fallback to pdfminer pdf_stream.seek(0) markdown = pdfminer.high_level.extract_text(pdf_stream) or "" @@ -139,51 +165,47 @@ def convert( return DocumentConverterResult(markdown=markdown) - def _convert_with_ai( - self, - page: Any, - page_num: int, - ai_service: AIService, - ) -> str: - """ - Convert page using AI. - - Args: - page: pdfplumber page object - page_num: Page number - ai_service: AI service - - Returns: - str: Markdown content - """ + def _analyze_page(self, page: Any) -> str: + """Analyze page content type.""" + # Check for images + if hasattr(page, "images") and page.images: + return "complex" + + # Check for tables + tables = page.find_tables() + if tables: + return "complex" + + # Check for graphics/curves + if hasattr(page, "curves") and page.curves: + return "complex" + + return "plain_text" + + def _convert_with_glmocr(self, page: Any, page_num: int) -> str: + """Convert page using glmocr SDK.""" try: - # Screenshot - img_stream = render_page_to_image(page, self.dpi) - - # Call AI (filename uses page number) - filename = f"page_{page_num + 1}.png" - result = ai_service.image_to_markdown(img_stream, filename=filename) - - if result.success and result.text.strip(): - return result.text - else: - # AI failed, fallback to default + # Render page to image + img = page.to_image(resolution=150) + img_bytes = io.BytesIO() + img.save(img_bytes, 
format="PNG") + img_bytes.seek(0) + + # Use glmocr to parse the image + result = self._get_glmocr().parse(img_bytes) + + # Check for errors + d = result.to_dict() + if "error" in d: return self._extract_text_with_tables(page) - + + return result.markdown_result or "" + except Exception: - # Exception, fallback to default return self._extract_text_with_tables(page) def _extract_text_with_tables(self, page: Any) -> str: - """ - Extract text and tables. - - Args: - page: pdfplumber page object - - Returns: - str: Markdown content - """ + """Extract text and tables from page.""" parts = [] # Extract text @@ -206,15 +228,7 @@ def _extract_text_with_tables(self, page: Any) -> str: return "\n\n".join(parts) def _table_to_markdown(self, table: list[list[str]]) -> str: - """ - Convert table to Markdown. - - Args: - table: 2D list - - Returns: - str: Markdown table - """ + """Convert table to Markdown.""" if not table: return "" @@ -236,16 +250,26 @@ def _table_to_markdown(self, table: list[list[str]]) -> str: # Format table lines = [] for row_idx, row in enumerate(table): - # Pad columns padded_row = row + [""] * (len(col_widths) - len(row)) line = "| " + " | ".join( str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) ) + " |" lines.append(line) - # Add separator if row_idx == 0: sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|" lines.append(sep) - return "\n".join(lines) \ No newline at end of file + return "\n".join(lines) + + def close(self): + """Close the GlmOcr instance.""" + if self._glmocr: + self._glmocr.close() + self._glmocr = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() \ No newline at end of file diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py deleted file mode 100644 index 1aa014043..000000000 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_page_analyzer.py 
+++ /dev/null @@ -1,117 +0,0 @@ -"""Page content analyzer for detecting images and tables.""" - -from enum import Enum -from typing import Any - - -class PageType(Enum): - """Page content type classification.""" - PLAIN_TEXT = "plain_text" # 纯文本,使用默认解析 - HAS_IMAGES = "has_images" # 包含图片 - HAS_TABLES = "has_tables" # 包含表格 - COMPLEX = "complex" # 复杂内容(图片+表格+混合) - - -def detect_images(page: Any) -> bool: - """ - 检测页面是否包含图片 - - Args: - page: pdfplumber 页面对象 - - Returns: - bool: 是否包含图片 - """ - # 方法1: 直接检测 page.images - if hasattr(page, 'images') and len(page.images) > 0: - return True - - # 方法2: 检测页面对象中的图像资源 - if hasattr(page, 'objects'): - objects = page.objects - if 'image' in objects and len(objects['image']) > 0: - return True - # 检测 XObject (可能包含内嵌图像) - if 'xobject' in objects and len(objects['xobject']) > 0: - for obj in objects['xobject']: - if isinstance(obj, dict) and obj.get('subtype') == 'Image': - return True - - # 方法3: 检测页面资源字典 - try: - if hasattr(page, 'page') and hasattr(page.page, 'get_resources'): - resources = page.page.get_resources() - if resources and 'XObject' in resources: - return True - except Exception: - pass - - return False - - -def detect_tables(page: Any) -> bool: - """ - 检测页面是否包含表格 - - Args: - page: pdfplumber 页面对象 - - Returns: - bool: 是否包含表格 - """ - # 方法1: 使用 pdfplumber 的 extract_tables - try: - tables = page.extract_tables() - if tables and len(tables) > 0: - # 过滤空表格 - for table in tables: - if table and any(any(cell for cell in row if cell) for row in table): - return True - except Exception: - pass - - # 方法2: 检测表格线(边框线) - try: - if hasattr(page, 'objects') and 'line' in page.objects: - lines = page.objects['line'] - if len(lines) > 10: # 大量线条可能构成表格 - # 分析线条是否形成网格结构 - h_lines = [] - v_lines = [] - for line in lines: - # 水平线:高度很小 - if abs(line.get('height', 1)) < 2: - h_lines.append(line) - # 垂直线:宽度很小 - elif abs(line.get('width', 1)) < 2: - v_lines.append(line) - - if len(h_lines) > 2 and len(v_lines) > 2: - return True - except 
Exception: - pass - - return False - - -def analyze_page(page: Any) -> PageType: - """ - 分析页面类型 - - Args: - page: pdfplumber 页面对象 - - Returns: - PageType: 页面类型 - """ - has_images = detect_images(page) - has_tables = detect_tables(page) - - if has_images and has_tables: - return PageType.COMPLEX - elif has_images: - return PageType.HAS_IMAGES - elif has_tables: - return PageType.HAS_TABLES - else: - return PageType.PLAIN_TEXT diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py deleted file mode 100644 index d517e8780..000000000 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_page_renderer.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Page renderer for converting PDF pages to images.""" - -import io -from typing import Any - - -def render_page_to_image(page: Any, dpi: int = 150) -> io.BytesIO: - """ - 将 PDF 页面渲染为图片 - - Args: - page: pdfplumber 页面对象 - dpi: 渲染分辨率,默认 150(平衡质量和速度) - - Returns: - io.BytesIO: PNG 图片流 - """ - # 使用 pdfplumber 的 to_image 方法 - page_image = page.to_image(resolution=dpi) - - # 转换为 BytesIO - img_stream = io.BytesIO() - page_image.original.save(img_stream, format="PNG") - img_stream.seek(0) - - return img_stream - - -# DPI 预设值 -DPI_LOW = 72 # 快速预览,文件小 -DPI_MEDIUM = 150 # 平衡质量和速度(默认) -DPI_HIGH = 300 # 高质量,适合复杂图表 diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py index 8abf29ccb..a940acf7d 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py @@ -3,9 +3,7 @@ from typing import Any from markitdown import MarkItDown -from ._config import GlmOcrConfig -from ._ai_service import AIService -from ._converter import GlmOcrPdfConverter +from ._converter import GlmOcrConverter __plugin_interface_version__ = 1 @@ -17,40 +15,19 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> 
None: Config sources (priority high to low): 1. kwargs parameters - 2. Environment variables - 3. Config file (pyproject.toml) - 4. Default values + 2. Environment variables (ZHIPU_API_KEY) + 3. .env file + 4. Built-in defaults """ - # Load config - config = GlmOcrConfig.load() - - # kwargs override config - api_key = kwargs.get("api_key") or config.api_key - model = kwargs.get("model", config.model) - dpi = kwargs.get("dpi", config.dpi) - force_ai = kwargs.get("force_ai", config.force_ai) - timeout = kwargs.get("timeout", config.timeout) - - # Create AI service - ai_service = None - if api_key: - try: - ai_service = AIService( - api_key=api_key, - model=model, - timeout=timeout, - ) - except Exception: - pass - # Register converter PRIORITY_GLMOCR = -1.0 markitdown.register_converter( - GlmOcrPdfConverter( - ai_service=ai_service, - dpi=dpi, - force_ai=force_ai, + GlmOcrConverter( + api_key=kwargs.get("api_key"), + timeout=kwargs.get("timeout", 1800), + enable_layout=kwargs.get("enable_layout", False), + force_ai=kwargs.get("force_ai", False), ), priority=PRIORITY_GLMOCR, ) \ No newline at end of file From 972fbeda96f597c4af5eddc40260a9b7ee74e8e3 Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 12 May 2026 18:14:12 +0800 Subject: [PATCH 06/15] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=9B=BE=E7=89=87ocr?= =?UTF-8?q?=E8=83=BD=E5=8A=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/markitdown_glmocr/_converter.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index 60a488704..cafee2ec0 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -1,4 +1,4 @@ -"""GlmOcr PDF Converter - Intelligent PDF to Markdown conversion.""" +"""GlmOcr PDF/Image Converter - Intelligent 
PDF and Image to Markdown conversion.""" import io import sys @@ -30,19 +30,22 @@ ACCEPTED_MIME_TYPE_PREFIXES = [ "application/pdf", "application/x-pdf", + "image/jpeg", + "image/png", ] -ACCEPTED_FILE_EXTENSIONS = [".pdf"] +ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"] class GlmOcrConverter(DocumentConverter): """ - Intelligent PDF converter using glmocr SDK. + Intelligent PDF/Image converter using glmocr SDK. Features: - Auto-detect page content type (plain text vs images/tables) - Plain text pages use pdfplumber/pdfminer (fast, free) - Complex pages use glmocr SDK for AI-powered OCR + - Image files (PNG, JPG) use glmocr SDK directly - One-liner: glmocr.parse("document.pdf") handles everything """ @@ -128,7 +131,36 @@ def convert( _dependency_exc_info[2] ) - # Read PDF + extension = (stream_info.extension or "").lower() + + # Image files: use glmocr directly + if extension in (".jpg", ".jpeg", ".png"): + return self._convert_image(file_stream, extension) + + # PDF files: use hybrid approach + return self._convert_pdf(file_stream) + + def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult: + """Convert image file using glmocr SDK.""" + img_bytes = file_stream.read() + + try: + result = self._get_glmocr().parse(img_bytes) + + # Check for errors + d = result.to_dict() + if "error" in d: + return DocumentConverterResult(markdown="") + + return DocumentConverterResult( + markdown=result.markdown_result or "" + ) + except Exception as e: + return DocumentConverterResult( + markdown=f"" + ) + + def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: pdf_stream = io.BytesIO(file_stream.read()) markdown_parts = [] @@ -189,10 +221,7 @@ def _convert_with_glmocr(self, page: Any, page_num: int) -> str: img = page.to_image(resolution=150) img_bytes = io.BytesIO() img.save(img_bytes, format="PNG") - img_bytes.seek(0) - - # Use glmocr to parse the image - result = self._get_glmocr().parse(img_bytes) + 
result = self._get_glmocr().parse(img_bytes.getvalue()) # Check for errors d = result.to_dict() From bc349838ca6788eaf3063b88430758fc93458ef7 Mon Sep 17 00:00:00 2001 From: hankl Date: Wed, 20 May 2026 18:08:13 +0800 Subject: [PATCH 07/15] =?UTF-8?q?=E6=96=B0=E5=A2=9Epaddleocr=E6=8F=92?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + docs/distribution-and-publishing.md | 679 ++++++++++ docs/nova-markitdown/SKILL.md | 173 +++ .../references/advanced-usage.md | 253 ++++ docs/nova-pdf-refactor-zhipu.md | 565 -------- docs/nova-pdf-technical-design.md | 1175 ----------------- docs/paddleocr-plugin-design.md | 102 ++ ...72\344\276\213\344\273\243\347\240\201.md" | 122 ++ packages/markitdown-paddleocr/README.md | 157 +++ packages/markitdown-paddleocr/pyproject.toml | 58 + .../src/markitdown_paddleocr/__about__.py | 1 + .../src/markitdown_paddleocr/__init__.py | 16 + .../src/markitdown_paddleocr/_config.py | 46 + .../src/markitdown_paddleocr/_converter.py | 304 +++++ .../markitdown_paddleocr/_dual_converter.py | 160 +++ .../markitdown_paddleocr/_paddle_client.py | 189 +++ .../src/markitdown_paddleocr/_plugin.py | 35 + .../markitdown-paddleocr/tests/__init__.py | 1 + .../tests/test_converter.py | 214 +++ .../tests/test_paddle_client.py | 241 ++++ 20 files changed, 2752 insertions(+), 1740 deletions(-) create mode 100644 docs/distribution-and-publishing.md create mode 100644 docs/nova-markitdown/SKILL.md create mode 100644 docs/nova-markitdown/references/advanced-usage.md delete mode 100644 docs/nova-pdf-refactor-zhipu.md delete mode 100644 docs/nova-pdf-technical-design.md create mode 100644 docs/paddleocr-plugin-design.md create mode 100644 "docs/panddle\347\244\272\344\276\213\344\273\243\347\240\201.md" create mode 100644 packages/markitdown-paddleocr/README.md create mode 100644 packages/markitdown-paddleocr/pyproject.toml create mode 100644 
packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/__init__.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/_paddle_client.py create mode 100644 packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py create mode 100644 packages/markitdown-paddleocr/tests/__init__.py create mode 100644 packages/markitdown-paddleocr/tests/test_converter.py create mode 100644 packages/markitdown-paddleocr/tests/test_paddle_client.py diff --git a/.gitignore b/.gitignore index 5a6b7d117..dc473bfb9 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,4 @@ src/.DS_Store .secrets.local *.secrets .env.local +test-data/ diff --git a/docs/distribution-and-publishing.md b/docs/distribution-and-publishing.md new file mode 100644 index 000000000..c2240b0db --- /dev/null +++ b/docs/distribution-and-publishing.md @@ -0,0 +1,679 @@ +# MarkItDown 分发与发布方案 + +## 背景 + +本地 fork 版本包含两个核心包: +- **markitdown** `0.1.6b2`(官方 PyPI 最新为 `0.1.5`) +- **markitdown-glmocr** `0.1.0`(PyPI 上不存在,纯本地新增插件) + +目标:让其他人能方便使用包含 glmocr 插件的 markitdown,不依赖官方是否合并 PR。 + +--- + +## 方案总览 + +| 方案 | 适用场景 | 用户体验 | 维护成本 | 分发方式 | +|------|---------|----------|---------|---------| +| **A. PyPI 独立发布** | 面向 Python 开发者 | `pip install` 即用 | 低 | PyPI | +| **B. PyInstaller 打包独立可执行文件** | 面向非技术用户 | 双击/命令行直接运行 | 中 | GitHub Releases | +| **C. Docker 镜像** | 服务端/CI 场景 | `docker run` 即用 | 低 | Docker Hub / GHCR | +| **D.
混合方案(推荐)** | 覆盖所有场景 | 按需选择 | 中 | PyPI + GitHub Releases | + +--- + +## 方案 A:PyPI 独立发布(推荐优先执行) + +### 核心思路 + +不改动 `markitdown` 主包名,仅将 `markitdown-glmocr` 发布到 PyPI。用户安装方式: + +```bash +pip install markitdown[all] markitdown-glmocr[glmocr] +``` + +使用时加 `-p` 参数启用插件: + +```bash +markitdown -p document.pdf +``` + +### 为什么不 fork 一个 `markitdown-glmocr-all` 包? + +1. `markitdown` 的插件机制(entry_points)已经设计好,`markitdown-glmocr` 作为插件包完全解耦 +2. 避免维护 markitdown 核心代码的 fork 副本 +3. 官方更新 markitdown 核心时,用户直接 `pip install -U markitdown` 即可升级 + +### 详细步骤 + +#### 1. 修改 `markitdown-glmocr` 的 pyproject.toml + +```toml +[project] +name = "markitdown-glmocr" +version = "0.1.0" # 改为静态版本,首次发布不用 dynamic +description = "Intelligent PDF/Image to Markdown converter using GLM-OCR SDK" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +authors = [ + { name = "Your Name", email = "your@email.com" }, +] + +# 关键:声明对 markitdown 的版本范围依赖 +dependencies = [ + "markitdown>=0.1.0,<1.0.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", + "Pillow>=9.0.0", +] + +[project.optional-dependencies] +glmocr = ["glmocr>=0.1.0"] +all = [ + "glmocr>=0.1.0", + "markitdown[all]", +] +dev = ["pytest>=7.0.0", "build", "twine"] + +# 插件入口点(已有,无需修改) +[project.entry-points."markitdown.plugin"] +markitdown_glmocr = "markitdown_glmocr" +``` + +#### 2. 编写 README.md + +在 `packages/markitdown-glmocr/` 下创建完善的 README: + +```markdown +# markitdown-glmocr + +Intelligent PDF/Image to Markdown converter plugin for [markitdown](https://github.com/microsoft/markitdown), +powered by [GLM-OCR](https://github.com/zai-org/glm-ocr) SDK. 
+ +## Installation + +pip install markitdown-glmocr[glmocr] + +## Usage + +# Enable plugins with -p flag +markitdown -p document.pdf +markitdown -p image.png + +# Or use programmatically +from markitdown import MarkItDown +md = MarkItDown(enable_plugins=True) +result = md.convert("document.pdf") +print(result.markdown) + +## Configuration + +Set your Zhipu API key: + +export ZHIPU_API_KEY=your_api_key_here +``` + +#### 3. 构建并发布 + +```bash +cd packages/markitdown-glmocr + +# 安装构建工具 +pip install build twine + +# 构建 wheel 和 sdist +python -m build + +# 检查包 +twine check dist/* + +# 上传到 TestPyPI 先验证 +twine upload --repository testpypi dist/* + +# 验证安装 +pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr[glmocr] + +# 正式发布到 PyPI +twine upload dist/* +``` + +#### 4. PyPI 账号准备 + +- 注册 https://pypi.org 账号 +- 配置 API Token:Account settings → API tokens → Add API token +- 配置 `~/.pypirc`: + +```ini +[pypi] +username = __token__ +password = pypi-xxxxxxxxxxxx + +[testpypi] +username = __token__ +password = pypi-test-xxxxxxxxxxxx +``` + +### 优缺点 + +| 优点 | 缺点 | +|------|------| +| 标准Python生态分发方式 | 需要用户有Python环境 | +| 插件机制天然解耦,官方更新不受影响 | glmocr SDK 依赖较多(numpy, pymupdf等) | +| 版本管理清晰 | 需要维护PyPI账号和token | +| `pip install` 一行搞定 | | + +--- + +## 方案 B:PyInstaller 打包独立可执行文件 + +### 核心思路 + +将 markitdown + markitdown-glmocr + glmocr + 所有依赖打包成单个可执行文件,用户无需安装 Python。 + +### 详细步骤 + +#### 1. 创建打包配置 + +在项目根目录创建 `build_standalone/` 目录: + +``` +build_standalone/ +├── build.py # 构建脚本 +├── markitdown.spec # PyInstaller spec 文件 +└── README.md # 使用说明 +``` + +#### 2. 
编写 PyInstaller spec 文件 + +`build_standalone/markitdown.spec`: + +```python +# -*- mode: python ; coding: utf-8 -*- +import sys +from pathlib import Path + +block_cipher = None + +# 收集所有隐式导入的模块 +hiddenimports = [ + 'markitdown', + 'markitdown.converters', + 'markitdown_glmocr', + 'glmocr', + 'pdfminer', + 'pdfminer.high_level', + 'pdfminer.layout', + 'pdfminer.utils', + 'pdfplumber', + 'PIL', + 'magika', + 'charset_normalizer', + 'markdownify', + 'beautifulsoup4', + 'bs4', + 'mammoth', + 'openpyxl', + 'pandas', + 'python_pptx', + 'lxml', + 'numpy', + 'pydantic', + 'pymupdf', + 'fitz', # pymupdf 的内部名 + 'tqdm', + 'yaml', + 'dotenv', + 'requests', + 'defusedxml', +] + +a = Analysis( + ['entry_point.py'], + pathex=[], + binaries=[], + datas=[ + # 包含 magika 的模型文件 + ('magika/models', 'magika/models'), + ], + hiddenimports=hiddenimports, + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + [], + name='markitdown', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + upx_exclude=[], + runtime_tmpdir=None, + console=True, + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, + icon=None, +) +``` + +#### 3. 编写入口文件 + +`build_standalone/entry_point.py`: + +```python +"""Entry point for PyInstaller build.""" +import sys +import os + +# 确保插件被启用 +if '-p' not in sys.argv and '--use-plugins' not in sys.argv: + # 自动启用 glmocr 插件 + sys.argv.insert(1, '-p') + +from markitdown.__main__ import main + +if __name__ == '__main__': + main() +``` + +#### 4. 
编写构建脚本 + +`build_standalone/build.py`: + +```python +#!/usr/bin/env python3 +"""Build standalone markitdown executable with PyInstaller.""" +import subprocess +import sys +import platform +import shutil +from pathlib import Path + +def main(): + project_root = Path(__file__).parent.parent + build_dir = Path(__file__).parent + + # 1. 确保依赖已安装 + print(">>> Installing dependencies...") + subprocess.run([ + sys.executable, "-m", "pip", "install", "-e", + str(project_root / "packages" / "markitdown[all]"), + ], check=True) + subprocess.run([ + sys.executable, "-m", "pip", "install", "-e", + str(project_root / "packages" / "markitdown-glmocr[glmocr]"), + ], check=True) + subprocess.run([ + sys.executable, "-m", "pip", "install", "pyinstaller", + ], check=True) + + # 2. 执行 PyInstaller + print(">>> Building executable...") + subprocess.run([ + sys.executable, "-m", "PyInstaller", + "--clean", + "--noconfirm", + str(build_dir / "markitdown.spec"), + ], cwd=str(build_dir), check=True) + + # 3. 输出结果 + dist_dir = build_dir / "dist" + exe_name = "markitdown.exe" if platform.system() == "Windows" else "markitdown" + exe_path = dist_dir / exe_name + + if exe_path.exists(): + size_mb = exe_path.stat().st_size / (1024 * 1024) + print(f"\n✅ Build successful!") + print(f" Executable: {exe_path}") + print(f" Size: {size_mb:.1f} MB") + print(f" Platform: {platform.system()} {platform.machine()}") + else: + print("\n❌ Build failed - executable not found") + sys.exit(1) + +if __name__ == "__main__": + main() +``` + +#### 5. 
GitHub Actions 自动构建多平台 + +`.github/workflows/build-standalone.yml`: + +```yaml +name: Build Standalone Executable + +on: + push: + tags: ['v*'] + workflow_dispatch: + +jobs: + build: + strategy: + matrix: + include: + - os: windows-latest + artifact: markitdown-windows-x64.exe + - os: ubuntu-latest + artifact: markitdown-linux-x64 + - os: macos-latest + artifact: markitdown-macos-x64 + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install dependencies + run: | + pip install -e ./packages/markitdown[all] + pip install -e ./packages/markitdown-glmocr[glmocr] + pip install pyinstaller + + - name: Build with PyInstaller + run: | + pyinstaller --clean --noconfirm build_standalone/markitdown.spec + working-directory: . + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.artifact }} + path: dist/markitdown* + + release: + needs: build + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + steps: + - uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + files: artifacts/** + generate_release_notes: true +``` + +### 预估产物大小 + +| 平台 | 预估大小 | 说明 | +|------|---------|------| +| Windows x64 | ~80-120 MB | 含 Python 运行时 + numpy + pymupdf 等 | +| Linux x64 | ~60-90 MB | | +| macOS x64 | ~70-100 MB | | + +### 优缺点 + +| 优点 | 缺点 | +|------|------| +| 无需Python环境,双击可用 | 产物体积大(80-120MB) | +| 非技术用户友好 | 每次更新需重新打包 | +| 可离线使用 | PyInstaller 隐式导入容易遗漏,调试成本高 | +| 可通过 GitHub Releases 分发 | 跨平台需分别构建 | +| | 杀毒软件可能误报 | + +### 替代方案:Nuitka + +如果 PyInstaller 遇到问题,可考虑 [Nuitka](https://nuitka.net/): + +```bash +pip install nuitka +python -m nuitka --standalone --onefile \ + --enable-plugin=numpy,pandas \ + --include-data-dir=magika/models=magika/models \ + entry_point.py +``` + +Nuitka 编译为真正的机器码,性能更好,但构建时间更长。 + +--- + +## 方案 C:Docker 镜像 + +### 核心思路 + +基于官方 Dockerfile 扩展,加入 
glmocr 插件。 + +### Dockerfile + +```dockerfile +FROM python:3.13-slim-bullseye + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg exiftool && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY packages/markitdown /app/packages/markitdown +COPY packages/markitdown-glmocr /app/packages/markitdown-glmocr + +RUN pip --no-cache-dir install \ + /app/packages/markitdown[all] \ + /app/packages/markitdown-glmocr[glmocr] + +ENTRYPOINT ["markitdown"] +``` + +### 使用方式 + +```bash +# 构建 +docker build -t markitdown-glmocr . + +# 使用 +docker run --rm -v $(pwd):/data markitdown-glmocr -p /data/document.pdf + +# 发布到 GHCR +docker tag markitdown-glmocr ghcr.io/yourname/markitdown-glmocr:latest +docker push ghcr.io/yourname/markitdown-glmocr:latest +``` + +### 优缺点 + +| 优点 | 缺点 | +|------|------| +| 环境完全隔离 | 需要 Docker 环境 | +| 适合 CI/CD 集成 | 镜像体积 ~500MB+ | +| 服务端部署友好 | 桌面用户不友好 | + +--- + +## 方案 D:混合方案(推荐) + +### 执行优先级 + +``` +1️⃣ 方案A:PyPI 发布 markitdown-glmocr → Python 开发者首选 +2️⃣ 方案B:PyInstaller 打包 → 非技术用户 / 离线场景 +3️⃣ 方案C:Docker 镜像 → 服务端 / CI 场景(可选) +``` + +### 具体执行计划 + +#### Phase 1:PyPI 发布(1-2 天) + +1. **完善 markitdown-glmocr 包** + - [ ] 补充 README.md(安装、使用、配置说明) + - [ ] 补充 LICENSE 文件 + - [ ] 添加 `py.typed` 标记(如需类型提示支持) + - [ ] 修复 `__about__.py` 版本号为 `0.1.0` + - [ ] 确保所有依赖版本范围合理 + +2. **本地验证** + - [ ] 在全新虚拟环境中测试安装流程 + ```bash + python -m venv /tmp/test-env + source /tmp/test-env/bin/activate + pip install markitdown[all] markitdown-glmocr[glmocr] + markitdown -p --list-plugins # 应显示 markitdown_glmocr + markitdown -p test.pdf # 功能测试 + ``` + +3. **发布到 TestPyPI 验证** + - [ ] `python -m build` + - [ ] `twine upload --repository testpypi dist/*` + - [ ] 从 TestPyPI 安装并测试 + +4. **正式发布到 PyPI** + - [ ] `twine upload dist/*` + +5. **发布后验证** + - [ ] `pip install markitdown-glmocr[glmocr]` + - [ ] 功能测试通过 + +#### Phase 2:独立可执行文件(2-3 天) + +1. 
**搭建 PyInstaller 构建流程** + - [ ] 创建 `build_standalone/` 目录和配置 + - [ ] 本地 Windows 构建测试 + - [ ] 解决隐式导入问题(最耗时) + +2. **GitHub Actions CI/CD** + - [ ] 配置多平台构建 workflow + - [ ] 打 tag 触发自动构建和 Release + +3. **分发** + - [ ] GitHub Releases 页面提供下载 + - [ ] README 中添加下载链接 + +#### Phase 3:Docker 镜像(可选,0.5 天) + +1. **编写 Dockerfile** +2. **发布到 GHCR** +3. **文档补充** + +--- + +## 关于 PR 合并的判断 + +### 官方接受 PR 的可能性分析 + +| 因素 | 评估 | +|------|------| +| markitdown 已有插件机制 | ✅ 架构上完全兼容 | +| glmocr 是第三方商业API | ⚠️ 官方可能不愿绑定特定商业服务 | +| 官方已有 azure-doc-intel 集成 | ✅ 有先例,但 Azure 是微软自家产品 | +| PR 贡献者不是微软员工 | ⚠️ 可能需要较长时间审核 | +| markitdown 版本还在 0.x (Beta) | ✅ 正是引入新功能的阶段 | + +**结论**:官方大概率不会直接接受 glmocr 插件 PR(因为绑定了非微软的商业 API),但插件机制的存在意味着**不需要官方接受 PR**,独立发布到 PyPI 是完全合理的路径。 + +### 建议策略 + +1. **先独立发布到 PyPI**(方案A),不依赖官方 +2. **同时提交 PR**,作为"贡献回社区"的姿态,即使被拒也无所谓 +3. PR 描述中强调: + - 完全通过插件机制扩展,不修改核心代码 + - 可作为"第三方插件集成"的参考实现 + - 有完整的测试和文档 + +--- + +## 快速开始:5分钟发布到 PyPI + +如果你现在就想发布,执行以下命令: + +```bash +# 1. 进入 glmocr 插件目录 +cd D:/15-AI-Coding/markitdown/packages/markitdown-glmocr + +# 2. 安装构建工具 +pip install build twine + +# 3. 构建 +python -m build + +# 4. 检查 +twine check dist/* + +# 5. 发布到 TestPyPI(先测试) +twine upload --repository testpypi dist/* + +# 6. 确认无误后发布到正式 PyPI +twine upload dist/* +``` + +发布后,其他人只需: + +```bash +pip install markitdown-glmocr[glmocr] +export ZHIPU_API_KEY=your_key +markitdown -p your-file.pdf +``` + +--- + +## 附录:常见问题 + +### Q1: 用户不装 glmocr SDK,只装 markitdown-glmocr 会怎样? + +不会报错。`_converter.py` 中 glmocr 是 lazy import,只在实际转换时才检查。 +但建议用户安装 `markitdown-glmocr[glmocr]` 以获得完整功能。 + +### Q2: 如何处理 markitdown 核心包的版本兼容性? + +`markitdown-glmocr` 的 `pyproject.toml` 中声明 `markitdown>=0.1.0,<1.0.0`。 +markitdown 的插件接口(entry_points)是稳定的,0.x 版本间不会 breaking change。 + +### Q3: PyInstaller 打包后 API Key 如何配置? 
+通过环境变量 `ZHIPU_API_KEY` 传入,或在运行时通过 `.env` 文件: +```bash +# 方式1:环境变量 +set ZHIPU_API_KEY=your_key +markitdown -p document.pdf + +# 方式2:.env 文件(glmocr SDK 自动读取) +echo ZHIPU_API_KEY=your_key > .env +markitdown -p document.pdf +``` + +### Q4: 能否做一个"一键安装包"给非技术用户? + +可以结合 PyInstaller + Inno Setup(Windows)或 create-dmg(macOS)做安装向导: + +``` +Windows: PyInstaller → .exe → Inno Setup → .exe 安装向导 +macOS: PyInstaller → binary → create-dmg → .dmg +Linux: PyInstaller → binary → AppImage → .AppImage +``` + +但这增加了维护成本,建议先只提供裸 executable,待有需求再加安装向导。 + +### Q5: uvx / pipx 支持吗? + +支持!发布到 PyPI 后: + +```bash +# 一次性运行(无需安装) +uvx --from markitdown-glmocr[glmocr] markitdown -p document.pdf + +# 或用 pipx +pipx run --spec 'markitdown-glmocr[glmocr]' markitdown -p document.pdf +``` + +这是最推荐的非技术用户使用方式——比 PyInstaller 更轻量,且始终使用最新版。 diff --git a/docs/nova-markitdown/SKILL.md b/docs/nova-markitdown/SKILL.md new file mode 100644 index 000000000..c9c53a7dc --- /dev/null +++ b/docs/nova-markitdown/SKILL.md @@ -0,0 +1,173 @@ +--- +name: nova-markitdown +description: + Convert various file formats (PDF, Word, Excel, PPT, images, HTML, audio, video) to Markdown using markitdown CLI with dual OCR fallback:glmocr (primary) → paddleocr (fallback). Activate when users need file-to-markdown conversion, OCR recognition, content extraction, structured data from documents, or batch document processing. Keywords:PDF to markdown, image OCR, document conversion, markitdown, glmocr, paddleocr, file extraction. +compatibility: + Python 3.10+, pip packages:markitdown[all], markitdown-glmocr[glmocr], markitdown-paddleocr. Requires ZHIPU_API_KEY for glmocr, BAIDU_PADDLE_TOKEN for paddleocr fallback. Network access to Zhipu AI API and Baidu PaddleOCR API.
+metadata: + author: hankl + version: "2.0.0" +--- + +# nova-markitdown + +使用 markitdown 命令行工具将各种文件格式转换为 Markdown,**双 OCR 引擎自动降级**:glmocr(主)→ paddleocr(备)。 + +## 触发条件 + +当用户需要以下操作时激活此技能: + +- 将文件(PDF、Word、Excel、PPT、图片、HTML、音频、视频等)转换为 Markdown 文本 +- 提取文件中的文本内容、表格、图片描述等 +- 对 PDF 或图片进行 OCR 识别和结构化提取 +- 批量转换多个文件为 Markdown + +## 环境设置 + +### 安装依赖 + +```bash +# 基础 markitdown(支持大部分文件格式) +pip install 'markitdown' + +# markitdown-glmocr 插件(主 OCR,智谱 GLM-OCR) +pip install 'markitdown-glmocr[glmocr]' + +# markitdown-paddleocr 插件(备 OCR,百度 PaddleOCR) +pip install 'markitdown-paddleocr' +``` + +### 环境变量 + +```bash +# 主 OCR:智谱 API Key(glmocr) +export ZHIPU_API_KEY="your-zhipu-api-key" + +# 备 OCR:百度 PaddleOCR Token(paddleocr,glmocr 失败时自动切换) +export BAIDU_PADDLE_TOKEN="your-paddle-token" + +# 可选配置 +export GLMOCR_MODEL="glm-ocr" # glmocr 模型名称 +export GLMOCR_TIMEOUT="600" # glmocr 请求超时秒数 +export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # paddleocr 模型名称 +``` + +> **重要**:`ZHIPU_API_KEY` 用于 glmocr(主),`BAIDU_PADDLE_TOKEN` 用于 paddleocr(备)。两者都设置可实现自动降级。 + +### 验证安装 + +```bash +markitdown --version +markitdown --list-plugins # 输出中应包含 markitdown_glmocr 和 markitdown_paddleocr +``` + +## 核心规则 + +1. **优先使用 markitdown 命令行**:所有文件转换优先通过 `markitdown` CLI 完成。 +2. **PDF 和图片使用双 OCR 降级策略**: + - **第一步**:使用 `markitdown -p`(glmocr 插件)尝试解析 + - **第二步**:若 glmocr 报错(API 错误、超时、Key 失效等),自动切换到 paddleocr 插件重试 + - **实现方式**:通过 Python 脚本封装,捕获异常后切换 +3. **其他文件类型不使用 `-p`**:Word、Excel、PPT、HTML、音频等使用不带 `-p` 的 markitdown 命令。 +4. 
**复杂场景回退到 Python SDK**:需要结构化 JSON 输出、按区域筛选、自定义处理流程时,使用 Python 代码。详见 [advanced-usage.md](references/advanced-usage.md)。 + +## 快速参考 + +| 文件类型 | 命令 | `-p` | 说明 | +|----------|------|:---:|------| +| PDF | `markitdown -p file.pdf -o out.md` | Yes | glmocr AI OCR | +| 图片 (.jpg/.png) | `markitdown -p image.png -o out.md` | Yes | glmocr AI OCR | +| Word (.docx) | `markitdown file.docx -o out.md` | No | 内置转换器 | +| Excel (.xlsx/.xls) | `markitdown file.xlsx -o out.md` | No | 内置转换器 | +| PPT (.pptx) | `markitdown file.pptx -o out.md` | No | 内置转换器 | +| HTML | `markitdown file.html -o out.md` | No | 内置转换器 | +| CSV/JSON/XML | `markitdown file.csv -o out.md` | No | 内置转换器 | +| 音频 | `markitdown audio.mp3 -o out.md` | No | 内置转换器 | +| ZIP | `markitdown archive.zip -o out.md` | No | 自动遍历 | +| YouTube | `markitdown "https://youtube.com/..." -o out.md` | No | 视频转录 | + +## 使用指南 + +### PDF 转换(双 OCR 降级) + +```bash +# 方式1:CLI 直接调用(仅 glmocr,无降级) +markitdown -p document.pdf -o output.md + +# 方式2:Python 双 OCR 降级(推荐,glmocr 失败自动切 paddleocr) +python -c " +from markitdown import MarkItDown +from markitdown_glmocr import GlmOcrConverter +from markitdown_paddleocr import PaddleOcrConverter + +md = MarkItDown(enable_plugins=False) +try: + md.register_converter(GlmOcrConverter(), priority=-1.0) + result = md.convert('document.pdf') + if not result.markdown.strip(): + raise Exception('Empty result') +except Exception as e: + print(f'glmocr failed: {e}, falling back to paddleocr...') + md = MarkItDown(enable_plugins=False) + md.register_converter(PaddleOcrConverter(), priority=-1.0) + result = md.convert('document.pdf') +print(result.markdown) +" +``` + +工作原理:纯文本页面使用 pdfplumber/pdfminer 快速提取;复杂页面(含图片、表格、公式)自动使用 AI OCR。glmocr 失败时自动降级到 paddleocr。 + +### 图片转换(双 OCR 降级) + +```bash +# CLI 直接调用(仅 glmocr) +markitdown -p photo.jpg -o photo.md + +# Python 双 OCR 降级(推荐) +python -c " +from markitdown import MarkItDown +from markitdown_glmocr import GlmOcrConverter +from markitdown_paddleocr import 
PaddleOcrConverter + +md = MarkItDown(enable_plugins=False) +try: + md.register_converter(GlmOcrConverter(), priority=-1.0) + result = md.convert('photo.jpg') + if not result.markdown.strip(): + raise Exception('Empty result') +except Exception as e: + print(f'glmocr failed: {e}, falling back to paddleocr...') + md = MarkItDown(enable_plugins=False) + md.register_converter(PaddleOcrConverter(), priority=-1.0) + result = md.convert('photo.jpg') +print(result.markdown) +" +``` + +### 其他文件格式 + +```bash +markitdown document.docx -o document.md # Word +markitdown spreadsheet.xlsx -o data.md # Excel +markitdown presentation.pptx -o slides.md # PPT +markitdown webpage.html -o webpage.md # HTML +markitdown data.csv -o data.md # CSV +markitdown config.json -o config.md # JSON +markitdown archive.zip -o archive.md # ZIP +``` + +## 故障排查 + +**插件未发现**:运行 `markitdown --list-plugins`,若无 glmocr 则 `pip install 'markitdown-glmocr[glmocr]'`,若无 paddleocr 则 `pip install markitdown-paddleocr`。 + +**glmocr API Key 错误**:检查 `echo $ZHIPU_API_KEY`,或在 `.env` 中设置。glmocr 失败时会自动降级到 paddleocr。 + +**paddleocr Token 错误**:检查 `echo $BAIDU_PADDLE_TOKEN`,或在 `.env` 中设置。 + +**PDF 输出为空或质量差**:确保使用 `-p` 参数,检查 API Key/Token,可设置 `GLMOCR_ENABLE_LAYOUT=true` 提升结构化输出。 + +**两个 OCR 都失败**:检查网络连接,确认两个 API Key/Token 都有效。 + +## 高级用法 + +需要结构化 JSON 输出、按区域筛选、批量处理、自定义参数、**双 OCR 降级封装**等高级场景,请参考 [advanced-usage.md](references/advanced-usage.md),包含 Python SDK 的完整示例和 `DualOcrConverter` 统一封装。 diff --git a/docs/nova-markitdown/references/advanced-usage.md b/docs/nova-markitdown/references/advanced-usage.md new file mode 100644 index 000000000..f21a3699d --- /dev/null +++ b/docs/nova-markitdown/references/advanced-usage.md @@ -0,0 +1,253 @@ +# 高级用法:Python SDK + 双 OCR 降级 + +当 markitdown 命令行无法满足需求时(如需要结构化 JSON 输出、按区域筛选、自定义处理流程、双 OCR 降级等),使用 Python 代码实现。 + +## 场景 0:DualOcrConverter — 双 OCR 自动降级(推荐) + +`DualOcrConverter` 封装了 glmocr(主)→ paddleocr(备)的自动降级逻辑,是 PDF/图片处理的推荐方式。 + +```python +from markitdown import MarkItDown +from 
markitdown_glmocr import GlmOcrConverter +from markitdown_paddleocr import PaddleOcrConverter + +class DualOcrConverter: + """双 OCR 转换器:glmocr(主)→ paddleocr(备)自动降级。""" + + def __init__(self, glmocr_kwargs=None, paddleocr_kwargs=None): + self.glmocr_kwargs = glmocr_kwargs or {} + self.paddleocr_kwargs = paddleocr_kwargs or {} + + def convert(self, file_path: str) -> str: + """转换文件,glmocr 失败自动降级到 paddleocr。""" + # 第一步:尝试 glmocr + try: + md = MarkItDown(enable_plugins=False) + md.register_converter(GlmOcrConverter(**self.glmocr_kwargs), priority=-1.0) + result = md.convert(file_path) + if result.markdown and result.markdown.strip(): + print("✓ glmocr 解析成功") + return result.markdown + raise Exception("glmocr returned empty result") + except Exception as e: + print(f"⚠ glmocr 失败: {e}") + + # 第二步:降级到 paddleocr + try: + md = MarkItDown(enable_plugins=False) + md.register_converter(PaddleOcrConverter(**self.paddleocr_kwargs), priority=-1.0) + result = md.convert(file_path) + if result.markdown and result.markdown.strip(): + print("✓ paddleocr 解析成功(降级)") + return result.markdown + raise Exception("paddleocr returned empty result") + except Exception as e: + print(f"✗ paddleocr 也失败: {e}") + raise RuntimeError(f"Both OCR engines failed. 
glmocr error preceded paddleocr fallback error.") + +# 使用 +converter = DualOcrConverter() +markdown = converter.convert("document.pdf") +``` + +### 自定义参数 + +```python +converter = DualOcrConverter( + glmocr_kwargs={ + "api_key": "sk-xxx", + "enable_layout": True, + "force_ai": True, + }, + paddleocr_kwargs={ + "token": "your-paddle-token", + "model": "PaddleOCR-VL-1.5", + "use_chart_recognition": True, + } +) +markdown = converter.convert("complex_report.pdf") +``` + +### 批量处理 + 双 OCR + +```python +from pathlib import Path + +converter = DualOcrConverter() +pdf_dir = Path("./documents") +output_dir = pdf_dir / "output" +output_dir.mkdir(exist_ok=True) + +for pdf_file in pdf_dir.glob("*.pdf"): + try: + markdown = converter.convert(str(pdf_file)) + (output_dir / f"{pdf_file.stem}.md").write_text(markdown, encoding="utf-8") + print(f"✓ {pdf_file.name}") + except RuntimeError: + print(f"✗ {pdf_file.name} — both OCR engines failed") +``` + +## 场景 1:结构化 JSON 输出(glmocr 区域标签、边界框) + +```python +import glmocr + +# 一行调用完成 OCR +result = glmocr.parse("report.pdf") + +# 获取 Markdown 文本 +print(result.markdown_result) + +# 获取结构化数据(按页分组,每页包含多个区域) +for page_idx, page_regions in enumerate(result.json_result): + print(f"Page {page_idx + 1}: {len(page_regions)} regions") + for region in page_regions: + print(f" [{region['label']}] {region['content'][:60]}") + +# 按标签筛选特定类型内容 +tables = [r for r in result.json_result[0] if r["label"] == "table"] +formulas = [r for r in result.json_result[0] if r["label"] == "formula"] +titles = [r for r in result.json_result[0] if r["label"] == "title"] + +# 保存到磁盘(Markdown + JSON 同时保存) +result.save(output_dir="./output") +``` + +### 支持的区域标签 + +| 标签 | 说明 | +|------|------| +| `title` | 标题 | +| `text` | 正文文本 | +| `table` | 表格 | +| `figure` | 图片 | +| `formula` | 公式 | +| `header` | 页眉 | +| `footer` | 页脚 | +| `page_number` | 页码 | +| `reference` | 参考文献 | +| `seal` | 印章 | + +## 场景 2:单独使用 PaddleClient(paddleocr 直接调用) + +```python +from markitdown_paddleocr import 
PaddleClient + +client = PaddleClient(token="your-paddle-token") + +# 本地文件 OCR +with open("image.png", "rb") as f: + markdown = client.ocr(file_bytes=f.read(), filename="image.png") +print(markdown) + +# URL 模式 OCR +markdown = client.ocr(file_url="https://example.com/document.pdf") +print(markdown) +``` + +## 场景 3:MarkItDown Python API + 单个 Converter + +```python +from markitdown import MarkItDown +from markitdown_glmocr import GlmOcrConverter +# 或 from markitdown_paddleocr import PaddleOcrConverter + +# glmocr +converter = GlmOcrConverter() +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +print(result.text_content) + +# paddleocr +from markitdown_paddleocr import PaddleOcrConverter +converter = PaddleOcrConverter() +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +print(result.text_content) +``` + +## 场景 4:自定义转换器参数 + +```python +from markitdown import MarkItDown +from markitdown_glmocr import GlmOcrConverter +from markitdown_paddleocr import PaddleOcrConverter + +# glmocr 自定义 +glmocr_converter = GlmOcrConverter( + api_key="sk-xxx", + timeout=600, + enable_layout=True, + force_ai=True, +) + +# paddleocr 自定义 +paddleocr_converter = PaddleOcrConverter( + token="your-token", + model="PaddleOCR-VL-1.5", + poll_interval=3.0, + poll_timeout=600.0, + force_ai=True, + use_chart_recognition=True, +) + +# 使用 DualOcrConverter 封装 +converter = DualOcrConverter( + glmocr_kwargs={"api_key": "sk-xxx", "enable_layout": True}, + paddleocr_kwargs={"token": "your-token", "use_chart_recognition": True}, +) +markdown = converter.convert("complex_document.pdf") +``` + +## 场景 5:只处理图片(不经过 PDF) + +```python +import glmocr + +# glmocr 直接对图片 OCR +result = glmocr.parse("screenshot.png") +print(result.markdown_result) + +# paddleocr 直接对图片 OCR +from markitdown_paddleocr import PaddleClient +client = PaddleClient(token="your-token") +with 
open("photo.jpg", "rb") as f: + markdown = client.ocr(file_bytes=f.read(), filename="photo.jpg") +print(markdown) +``` + +## 场景 6:批量处理多个文件 + +```python +from pathlib import Path + +# 使用 DualOcrConverter 批量处理(推荐) +converter = DualOcrConverter() + +pdf_dir = Path("./documents") +for pdf_file in pdf_dir.glob("*.pdf"): + try: + markdown = converter.convert(str(pdf_file)) + output_path = pdf_dir / "output" / f"{pdf_file.stem}.md" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(markdown, encoding="utf-8") + print(f"✓ {pdf_file.name}") + except RuntimeError: + print(f"✗ {pdf_file.name} — both OCR engines failed") +``` + +## OCR 引擎对比 + +| 维度 | glmocr | paddleocr | +|------|--------|-----------| +| API 风格 | 同步 SDK 调用 | 异步 Job 轮询(submit → poll → fetch) | +| 认证 | `ZHIPU_API_KEY` | `BAIDU_PADDLE_TOKEN` | +| 结果格式 | SDK 封装对象 | JSONL 流 | +| 结构化输出 | ✅ 区域标签 + 边界框 | ❌ 仅 Markdown | +| 表格识别 | ✅ HTML → Markdown | ✅ HTML 表格 | +| 公式识别 | ✅ LaTeX | ✅ LaTeX | +| 印章识别 | ✅ | ✅ | +| 响应速度 | 快(同步) | 较慢(需轮询,2-30s) | +| 适用场景 | 首选,结构化需求 | 降级备选,glmocr 不可用时 | diff --git a/docs/nova-pdf-refactor-zhipu.md b/docs/nova-pdf-refactor-zhipu.md deleted file mode 100644 index cf6b2b5ff..000000000 --- a/docs/nova-pdf-refactor-zhipu.md +++ /dev/null @@ -1,565 +0,0 @@ -# Nova-PDF 重构方案:使用 zai-sdk + glm-ocr - -## 1. 重构目标 - -将现有的自定义 AI 服务替换为 zai-sdk + glm-ocr,简化代码并提升 OCR 能力。 - -## 2. 技术对比 - -| 项目 | 原方案 | 新方案 | -|------|--------|--------| -| SDK | requests (手动调用) | zai-sdk (官方 SDK) | -| 模型 | 自定义 Workflow | glm-ocr | -| 接口 | 两步上传(上传+调用) | 直接调用 layout_parsing | -| 认证 | 双 token (upload + workflow) | 单 API key | -| 配置 | 环境变量 | 配置文件 + 环境变量 | - -## 3. 
接口分析 - -### 3.1 glm-ocr API - -```python -from zai import ZhipuAiClient - -client = ZhipuAiClient(api_key="your-api-key") - -# 支持图片 URL -response = client.layout_parsing.create( - model="glm-ocr", - file="https://example.com/image.png" -) - -# 支持本地文件路径 -response = client.layout_parsing.create( - model="glm-ocr", - file="/path/to/image.png" -) - -# 返回结果(包含 Markdown 格式的内容) -print(response) -``` - -### 3.2 响应结构 - -```python -# response 包含解析后的结构化内容 -# 具体字段需查看实际返回,通常包括: -# - 文本内容 -# - 布局信息 -# - 表格识别结果 -# - Markdown 格式输出 -``` - -## 4. 架构设计 - -### 4.1 组件变更 - -``` -原架构: -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Page Renderer │────►│ File Uploader │────►│ Workflow API │ -│ (截图) │ │ (上传获取URL) │ │ (自定义接口) │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ - -新架构: -┌─────────────────┐ ┌─────────────────┐ -│ Page Renderer │────►│ glm-ocr API │ -│ (截图→临时文件) │ │ (layout_parsing)│ -└─────────────────┘ └─────────────────┘ -``` - -### 4.2 文件变更清单 - -| 文件 | 变更类型 | 说明 | -|------|----------|------| -| `_ai_service.py` | **重写** | 使用 zai-sdk + glm-ocr | -| `_converter.py` | 微调 | 适配新 AIService 接口 | -| `_plugin.py` | 微调 | 简化配置参数 | -| `pyproject.toml` | 更新 | 添加 zai-sdk 依赖 | -| `_config.py` | **新增** | 配置文件读取 | -| `README.md` | 更新 | 新的使用说明 | - -## 5. 详细设计 - -### 5.1 配置模块 (_config.py) - -```python -"""Configuration management for nova-pdf.""" - -import os -from pathlib import Path -from typing import Optional -from dataclasses import dataclass - -try: - import tomllib # Python 3.11+ -except ImportError: - import tomli as tomllib - - -@dataclass -class NovaPdfConfig: - """nova-pdf configuration.""" - - # API 配置 - zhipu_api_key: str = "" - - # OCR 配置 - model: str = "glm-ocr" - dpi: int = 150 - timeout: int = 120 - - # 处理策略 - force_ai: bool = False - - @classmethod - def load(cls, config_path: Optional[str] = None) -> "NovaPdfConfig": - """ - 从多个来源加载配置(优先级从高到低): - 1. 环境变量 - 2. 配置文件 (pyproject.toml 或 nova-pdf.toml) - 3. 默认值 - """ - config = cls() - - # 1. 
从配置文件加载 - config._load_from_file(config_path) - - # 2. 环境变量覆盖 - config._load_from_env() - - return config - - def _load_from_file(self, config_path: Optional[str] = None): - """从配置文件加载""" - # 查找配置文件 - search_paths = [] - - if config_path: - search_paths.append(Path(config_path)) - - # 当前目录的 pyproject.toml - search_paths.append(Path("pyproject.toml")) - - # 当前目录的 nova-pdf.toml - search_paths.append(Path("nova-pdf.toml")) - - # 用户目录 - search_paths.append(Path.home() / ".config" / "nova-pdf" / "config.toml") - - for path in search_paths: - if path.exists(): - try: - with open(path, "rb") as f: - data = tomllib.load(f) - - # 读取 [tool.nova-pdf] 配置段 - if "tool" in data and "nova-pdf" in data["tool"]: - self._apply_config(data["tool"]["nova-pdf"]) - elif "nova-pdf" in data: - self._apply_config(data["nova-pdf"]) - - break - except Exception: - pass - - def _apply_config(self, data: dict): - """应用配置""" - if "api_key" in data: - self.zhipu_api_key = data["api_key"] - if "model" in data: - self.model = data["model"] - if "dpi" in data: - self.dpi = data["dpi"] - if "timeout" in data: - self.timeout = data["timeout"] - if "force_ai" in data: - self.force_ai = data["force_ai"] - - def _load_from_env(self): - """从环境变量加载(优先级最高)""" - if os.environ.get("NOVA_ZHIPU_API_KEY"): - self.zhipu_api_key = os.environ["NOVA_ZHIPU_API_KEY"] - if os.environ.get("NOVA_MODEL"): - self.model = os.environ["NOVA_MODEL"] - if os.environ.get("NOVA_DPI"): - self.dpi = int(os.environ["NOVA_DPI"]) - if os.environ.get("NOVA_TIMEOUT"): - self.timeout = int(os.environ["NOVA_TIMEOUT"]) - if os.environ.get("NOVA_FORCE_AI"): - self.force_ai = os.environ["NOVA_FORCE_AI"].lower() in ("true", "1", "yes") -``` - -### 5.2 AI 服务模块 (_ai_service.py) - -```python -"""AI service using zai-sdk and glm-ocr.""" - -import io -import os -import tempfile -from dataclasses import dataclass -from typing import BinaryIO, Optional - -try: - from zai import ZhipuAiClient -except ImportError: - ZhipuAiClient = None - -from 
._config import NovaPdfConfig - - -@dataclass -class AIResult: - """Result from AI conversion.""" - text: str - success: bool = True - error: Optional[str] = None - - -class AIService: - """ - AI 服务 - 使用 zai-sdk + glm-ocr - - 特点: - - 直接调用 glm-ocr 的 layout_parsing API - - 支持本地文件路径或图片 URL - - 自动处理图片格式转换 - """ - - def __init__( - self, - api_key: Optional[str] = None, - model: str = "glm-ocr", - timeout: int = 120, - config: Optional[NovaPdfConfig] = None, - ): - """ - 初始化 AI 服务 - - Args: - api_key: 智谱 API Key,默认从配置读取 - model: 模型名称,默认 glm-ocr - timeout: 请求超时时间(秒) - config: 配置对象 - """ - if ZhipuAiClient is None: - raise ImportError( - "zai-sdk is required for AIService. " - "Install with: pip install nova-pdf[zhipu]" - ) - - # 从配置加载 - if config: - self.api_key = api_key or config.zhipu_api_key - self.model = model or config.model - self.timeout = timeout or config.timeout - else: - config = NovaPdfConfig.load() - self.api_key = api_key or config.zhipu_api_key - self.model = model - self.timeout = timeout - - if not self.api_key: - raise ValueError( - "API key is required. 
Set NOVA_ZHIPU_API_KEY environment variable " - "or add 'api_key' to [tool.nova-pdf] in pyproject.toml" - ) - - # 初始化客户端 - self.client = ZhipuAiClient(api_key=self.api_key) - - def image_to_markdown( - self, - image_stream: BinaryIO, - filename: str = "page.png", - ) -> AIResult: - """ - 将图片转换为 Markdown - - Args: - image_stream: 图片流 - filename: 文件名(用于临时文件) - - Returns: - AIResult: 转换结果 - """ - try: - # 方案1:保存为临时文件,传文件路径 - with tempfile.NamedTemporaryFile( - suffix=".png", - delete=False - ) as tmp: - tmp.write(image_stream.read()) - tmp_path = tmp.name - - image_stream.seek(0) - - # 调用 glm-ocr API - response = self.client.layout_parsing.create( - model=self.model, - file=tmp_path - ) - - # 清理临时文件 - try: - os.unlink(tmp_path) - except Exception: - pass - - # 解析响应 - # 响应格式可能是字符串或对象,需要适配 - if hasattr(response, 'content'): - text = response.content - elif hasattr(response, 'text'): - text = response.text - elif isinstance(response, str): - text = response - else: - text = str(response) - - return AIResult( - text=text.strip() if text else "", - success=True, - ) - - except Exception as e: - return AIResult( - text="", - success=False, - error=str(e), - ) -``` - -### 5.3 插件注册 (_plugin.py) - -```python -"""Plugin registration for nova-pdf.""" - -from typing import Any -from markitdown import MarkItDown - -from ._config import NovaPdfConfig -from ._ai_service import AIService -from ._converter import NovaPdfConverter - - -__plugin_interface_version__ = 1 - - -def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: - """ - 注册 nova-pdf 转换器 - - 配置来源(优先级从高到低): - 1. kwargs 参数 - 2. 环境变量 - 3. 配置文件 (pyproject.toml) - 4. 
默认值 - """ - # 加载配置 - config = NovaPdfConfig.load() - - # kwargs 覆盖配置 - api_key = kwargs.get("api_key") or kwargs.get("zhipu_api_key") or config.zhipu_api_key - model = kwargs.get("model", config.model) - dpi = kwargs.get("dpi", config.dpi) - force_ai = kwargs.get("force_ai", config.force_ai) - timeout = kwargs.get("timeout", config.timeout) - - # 创建 AI 服务 - ai_service = None - if api_key: - try: - ai_service = AIService( - api_key=api_key, - model=model, - timeout=timeout, - ) - except Exception: - pass - - # 注册转换器 - PRIORITY_NOVA_PDF = -1.0 - - markitdown.register_converter( - NovaPdfConverter( - ai_service=ai_service, - dpi=dpi, - force_ai=force_ai, - ), - priority=PRIORITY_NOVA_PDF, - ) -``` - -### 5.4 pyproject.toml 更新 - -```toml -[project] -name = "nova-pdf" -dependencies = [ - "markitdown>=0.1.0", - "pdfminer.six>=20251230", - "pdfplumber>=0.11.9", - "Pillow>=9.0.0", - "tomli>=2.0.0;python_version<'3.11'", -] - -[project.optional-dependencies] -zhipu = [ - "zai-sdk>=0.2.2", -] -dev = [ - "pytest>=7.0.0", -] - -[project.entry-points."markitdown.plugin"] -nova_pdf = "nova_pdf" - -[tool.nova-pdf] -# API 配置 -api_key = "" -model = "glm-ocr" -dpi = 150 -timeout = 120 -force_ai = false -``` - -## 6. 
配置方式 - -### 6.1 本地敏感配置文件(推荐) - -项目根目录下的 `.secrets.local` 文件存储敏感信息,此文件不会被提交到 Git: - -```bash -# .secrets.local -NOVA_ZHIPU_API_KEY="your-api-key-here" -``` - -使用方式: -```bash -# 加载敏感配置 -source .secrets.local - -# 或使用脚本 -source scripts/load_secrets.sh - -# 然后运行 -markitdown -p document.pdf -``` - -### 6.2 配置文件 (pyproject.toml) - -```toml -[tool.nova-pdf] -# API key 请通过环境变量或 .secrets.local 文件设置,不要硬编码 -api_key = "" -model = "glm-ocr" -dpi = 150 -timeout = 120 -``` - -### 6.3 环境变量(推荐) - -```bash -export NOVA_ZHIPU_API_KEY="your-api-key-here" -export NOVA_MODEL="glm-ocr" -export NOVA_DPI="150" -``` - -### 6.3 Python API - -```python -from markitdown import MarkItDown - -md = MarkItDown( - enable_plugins=True, - api_key="your-api-key", -) -``` - -### 6.4 命令行 - -```bash -export NOVA_ZHIPU_API_KEY="your-api-key" -markitdown -p document.pdf -``` - -## 7. 使用示例 - -```python -from markitdown import MarkItDown -from nova_pdf import AIService, NovaPdfConverter - -# 方式1:自动加载配置 -md = MarkItDown(enable_plugins=True) -result = md.convert("document.pdf") - -# 方式2:手动配置 -from nova_pdf import NovaPdfConfig, AIService - -config = NovaPdfConfig.load() -ai_service = AIService( - api_key="your-api-key", - model="glm-ocr", -) - -converter = NovaPdfConverter( - ai_service=ai_service, - dpi=150, -) - -md = MarkItDown(enable_plugins=False) -md.register_converter(converter, priority=-1.0) -result = md.convert("document.pdf") -``` - -## 8. 迁移路径 - -### 8.1 从旧版本迁移 - -| 旧配置 | 新配置 | -|--------|--------| -| `NOVA_UPLOAD_TOKEN` | `NOVA_ZHIPU_API_KEY` | -| `NOVA_WORKFLOW_TOKEN` | (删除) | -| `NOVA_BASE_URL` | (删除) | -| `NOVA_APP_ID` | (删除) | - -### 8.2 API 兼容性 - -- 旧版 `AIService(upload_token, workflow_token, ...)` → 废弃 -- 新版 `AIService(api_key, ...)` → 推荐 - -## 9. 
实施计划 - -### ✅ Phase 1: 核心实现(已完成) -- [x] 设计配置模块 -- [x] 实现 `_config.py` -- [x] 重写 `_ai_service.py`(使用 zai-sdk + glm-ocr) -- [x] 更新 `_plugin.py` - -### ✅ Phase 2: 集成测试(已完成) -- [x] 更新 `pyproject.toml` -- [x] 测试 glm-ocr API -- [x] 测试插件集成 - -### Phase 3: 文档更新(进行中) -- [x] 更新 README.md -- [ ] 更新技术方案文档 -- [ ] 添加迁移指南 - -## 10. 风险与缓解 - -| 风险 | 缓解措施 | -|------|----------| -| zai-sdk 接口变化 | 封装适配层,隔离 SDK 细节 | -| glm-ocr 返回格式不确定 | 做多种格式兼容处理 | -| 临时文件清理失败 | 使用 try-finally 确保清理 | -| API key 泄露 | 支持环境变量,避免硬编码 | - -## 11. 待确认事项 - -- [ ] glm-ocr 返回的具体数据结构 -- [ ] 是否支持直接传图片字节流(不保存临时文件) -- [ ] 超时和重试策略 -- [ ] 并发请求限制 diff --git a/docs/nova-pdf-technical-design.md b/docs/nova-pdf-technical-design.md deleted file mode 100644 index 25128e33a..000000000 --- a/docs/nova-pdf-technical-design.md +++ /dev/null @@ -1,1175 +0,0 @@ -# Nova-PDF 插件技术方案 - -## 1. 概述 - -### 1.1 目标 -开发一个智能 PDF 解析插件 `nova-pdf`,实现: -- 自动检测 PDF 每页内容类型(纯文本 vs 包含图片/表格) -- 对纯文本页面使用默认解析能力(pdfminer/pdfplumber) -- 对包含图片/表格的页面截图后调用 AI 接口转 Markdown - -### 1.2 核心价值 -- **提升复杂 PDF 解析质量**:图表、扫描件等传统方法效果差的内容 -- **降低成本**:纯文本页面不调用 AI,节省 API 费用 -- **灵活配置**:支持自定义 AI 模型、分辨率、提示词等 - ---- - -## 2. 
架构设计 - -### 2.1 插件结构 -``` -packages/nova-pdf/ -├── src/ -│ └── nova_pdf/ -│ ├── __init__.py # 导出和版本信息 -│ ├── __about__.py # 版本号 -│ ├── _plugin.py # 插件注册入口 -│ ├── _converter.py # PDF 转换器核心实现 -│ ├── _page_analyzer.py # 页面内容分析器 -│ ├── _page_renderer.py # 页面截图渲染器 -│ └── _ai_service.py # AI 接口封装 -├── tests/ -│ ├── __init__.py -│ ├── test_converter.py -│ ├── test_analyzer.py -│ └── fixtures/ -│ ├── text_only.pdf -│ ├── with_images.pdf -│ └── mixed_content.pdf -├── pyproject.toml -└── README.md -``` - -### 2.2 组件职责 - -| 组件 | 职责 | -|------|------| -| `_plugin.py` | 实现 `register_converters` 入口,注册转换器 | -| `_converter.py` | 继承 `DocumentConverter`,协调整体流程 | -| `_page_analyzer.py` | 分析页面是否包含图片/表格 | -| `_page_renderer.py` | 将 PDF 页面渲染为图片 | -| `_ai_service.py` | 调用 AI Vision API 转换图片为 Markdown | - -### 2.3 流程图 - -``` -┌──────────────────────────────────────────────────────────────────┐ -│ PDF 文件输入 │ -└──────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌──────────────────────────────────────────────────────────────────┐ -│ 逐页分析 (PageAnalyzer) │ -│ ┌────────────────────────────────────────────────────────────┐ │ -│ │ 对每一页: │ │ -│ │ 1. 检测是否包含图片 (images) │ │ -│ │ 2. 检测是否包含表格 (tables) │ │ -│ │ 3. 标记页面类型: PLAIN_TEXT / COMPLEX │ │ -│ └────────────────────────────────────────────────────────────┘ │ -└──────────────────────────────────────────────────────────────────┘ - │ - ┌─────────────────────┴─────────────────────┐ - ▼ ▼ -┌─────────────────────┐ ┌─────────────────────┐ -│ PLAIN_TEXT 页面 │ │ COMPLEX 页面 │ -│ │ │ │ -│ 使用默认解析: │ │ 1. 截图渲染 │ -│ - pdfplumber 提取 │ │ 2. 调用 AI 接口 │ -│ - pdfminer 备用 │ │ 3. 转换为 Markdown │ -└─────────────────────┘ └─────────────────────┘ - │ │ - └─────────────────────┬─────────────────────┘ - │ - ▼ -┌──────────────────────────────────────────────────────────────────┐ -│ 合并所有页面结果 │ -│ 输出完整 Markdown │ -└──────────────────────────────────────────────────────────────────┘ -``` - ---- - -## 3. 
核心算法设计 - -### 3.1 页面内容检测 (PageAnalyzer) - -#### 检测策略 -```python -class PageType(Enum): - PLAIN_TEXT = "plain_text" # 纯文本,使用默认解析 - HAS_IMAGES = "has_images" # 包含图片 - HAS_TABLES = "has_tables" # 包含表格 - COMPLEX = "complex" # 复杂内容(图片+表格+混合) -``` - -#### 图片检测方法 -使用 **pdfplumber** 的页面对象检测: - -```python -def detect_images(page) -> bool: - """检测页面是否包含图片""" - # 方法1: 直接检测 page.images - if hasattr(page, 'images') and len(page.images) > 0: - return True - - # 方法2: 检测页面对象中的图像资源 - if hasattr(page, 'objects'): - if 'image' in page.objects and len(page.objects['image']) > 0: - return True - # 检测 XObject (可能包含内嵌图像) - if 'xobject' in page.objects and len(page.objects['xobject']) > 0: - for obj in page.objects['xobject']: - if obj.get('subtype') == 'Image': - return True - - # 方法3: 检测页面资源字典 - try: - if hasattr(page.page, 'get_resources'): - resources = page.page.get_resources() - if resources and 'XObject' in resources: - return True - except Exception: - pass - - return False -``` - -#### 表格检测方法 -```python -def detect_tables(page) -> bool: - """检测页面是否包含表格""" - # 方法1: 使用 pdfplumber 的 extract_tables - tables = page.extract_tables() - if tables and len(tables) > 0: - # 过滤空表格 - for table in tables: - if table and any(any(cell for cell in row) for row in table): - return True - - # 方法2: 检测表格线(边框线) - if hasattr(page, 'objects') and 'line' in page.objects: - lines = page.objects['line'] - if len(lines) > 10: # 大量线条可能构成表格 - # 分析线条是否形成网格结构 - h_lines = [l for l in lines if l.get('height', 1) < 2] - v_lines = [l for l in lines if l.get('width', 1) < 2] - if len(h_lines) > 2 and len(v_lines) > 2: - return True - - return False -``` - -#### 综合判断 -```python -def analyze_page(page) -> PageType: - """分析页面类型""" - has_images = detect_images(page) - has_tables = detect_tables(page) - - if has_images and has_tables: - return PageType.COMPLEX - elif has_images: - return PageType.HAS_IMAGES - elif has_tables: - return PageType.HAS_TABLES - else: - return PageType.PLAIN_TEXT -``` - -### 3.2 页面截图渲染 
(PageRenderer) - -#### 技术选型 - -使用 **pdfplumber.to_image**,理由: -- 已是项目依赖,无需额外安装 -- 实现简单,代码量少 -- 底层使用 PIL,满足需求 - -#### 实现方案 -```python -import io - -def render_page_to_image(page, dpi: int = 150) -> io.BytesIO: - """ - 将 PDF 页面渲染为图片 - - Args: - page: pdfplumber 页面对象 - dpi: 渲染分辨率,默认 150(平衡质量和速度) - - Returns: - BytesIO: PNG 图片流 - """ - # 使用 pdfplumber 的 to_image 方法 - page_image = page.to_image(resolution=dpi) - - # 转换为 BytesIO - img_stream = io.BytesIO() - page_image.original.save(img_stream, format="PNG") - img_stream.seek(0) - - return img_stream -``` - -#### DPI 推荐值 -```python -DPI_SETTINGS = { - "low": 72, # 快速预览,文件小 - "medium": 150, # 平衡质量和速度(默认) - "high": 300, # 高质量,适合复杂图表 -} -``` - -### 3.3 AI 接口调用 (AIService) - -#### 复用 markitdown 的 LLM 客户端机制 -```python -from markitdown.converters._llm_caption import llm_caption - -class AIService: - """AI Vision 服务封装""" - - def __init__( - self, - client, # OpenAI 兼容客户端 - model: str = "gpt-4o", # 模型名称 - prompt: str | None = None, # 自定义提示词 - ): - self.client = client - self.model = model - self.prompt = prompt or self._default_prompt() - - def _default_prompt(self) -> str: - return """请将这张图片的内容转换为 Markdown 格式。 - -要求: -1. 保持原有的文档结构(标题、段落、列表等) -2. 表格使用 Markdown 表格语法 -3. 图片中的文字清晰转写 -4. 数学公式使用 LaTeX 语法 -5. 如有图表,用文字描述其内容 -6. 不要添加任何额外的解释或评论""" - - def image_to_markdown( - self, - image_stream: io.BytesIO, - stream_info: StreamInfo, - ) -> str: - """调用 AI 将图片转为 Markdown""" - result = llm_caption( - image_stream, - stream_info, - client=self.client, - model=self.model, - prompt=self.prompt, - ) - return result or "" -``` - ---- - -## 4. 
转换器实现 (_converter.py) - -### 4.1 核心流程 -```python -class NovaPdfConverter(DocumentConverter): - """智能 PDF 转换器""" - - def __init__( - self, - ai_service: AIService | None = None, - dpi: int = 150, - force_ai: bool = False, # 强制所有页面使用 AI - ): - self.ai_service = ai_service - self.dpi = dpi - self.force_ai = force_ai - - def convert( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, - ) -> DocumentConverterResult: - # 读取 PDF - pdf_stream = io.BytesIO(file_stream.read()) - - markdown_parts = [] - - with pdfplumber.open(pdf_stream) as pdf: - for page_num, page in enumerate(pdf.pages): - # 分析页面类型 - page_type = analyze_page(page) - - # 根据类型选择处理方式 - if self.force_ai or page_type != PageType.PLAIN_TEXT: - # 复杂内容:截图 + AI - if self.ai_service: - img = render_page_to_image(page, self.dpi) - md = self.ai_service.image_to_markdown(img, StreamInfo()) - else: - # 无 AI 服务,回退到默认解析 - md = page.extract_text() or "" - else: - # 纯文本:默认解析 - md = page.extract_text() or "" - - if md.strip(): - markdown_parts.append(f"## Page {page_num + 1}\n\n{md}") - - return DocumentConverterResult( - markdown="\n\n".join(markdown_parts), - ) -``` - ---- - -## 5. 
配置选项 - -### 5.1 初始化参数 -```python -class NovaPdfConfig: - """nova-pdf 配置""" - - # AI 服务配置 - llm_client: Any = None # OpenAI 兼容客户端(必需) - llm_model: str = "gpt-4o" # 模型名称 - llm_prompt: str | None = None # 自定义提示词 - - # 渲染配置 - dpi: int = 150 # 截图分辨率 - image_format: str = "png" # 图片格式 - - # 处理策略 - force_ai: bool = False # 强制所有页面使用 AI - skip_tables: bool = False # 跳过表格检测(表格用默认解析) - skip_images: bool = False # 跳过图片检测(图片用默认解析) - - # 性能配置 - max_concurrent: int = 5 # 并发请求数 - timeout: int = 60 # 单页 AI 调用超时(秒) -``` - -### 5.2 使用示例 -```python -from openai import OpenAI -from markitdown import MarkItDown - -# 初始化 LLM 客户端 -client = OpenAI(api_key="your-api-key") - -# 创建 MarkItDown 实例并启用 nova-pdf 插件 -md = MarkItDown( - enable_plugins=True, - llm_client=client, - llm_model="gpt-4o", -) - -# 转换 PDF -result = md.convert("complex_document.pdf") -print(result.markdown) -``` - ---- - -## 6. 依赖管理 - -### 6.1 pyproject.toml -```toml -[project] -name = "nova-pdf" -dependencies = [ - "markitdown>=0.1.0", - "pdfminer.six>=20251230", - "pdfplumber>=0.11.9", # 页面解析和截图渲染 - "Pillow>=9.0.0", # 图像处理(pdfplumber.to_image 底层依赖) -] - -[project.optional-dependencies] -dev = [ - "pytest>=7.0.0", - "pytest-asyncio>=0.21.0", -] - -# 插件入口点 -[project.entry-points."markitdown.plugin"] -nova_pdf = "nova_pdf" -``` - ---- - -## 7. 
错误处理 - -### 7.1 降级策略 -```python -def convert_with_fallback( - self, - pdf_bytes: bytes, - page_num: int, - page_type: PageType, -) -> str: - """带降级的转换""" - - # 尝试 AI 转换 - if self.ai_service and page_type != PageType.PLAIN_TEXT: - try: - img = render_page_to_image(pdf_bytes, page_num, self.dpi) - result = self.ai_service.image_to_markdown(img, StreamInfo()) - if result.strip(): - return result - except AIServiceError as e: - logger.warning(f"AI 转换失败,降级到默认解析: {e}") - - # 降级到默认解析 - with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: - page = pdf.pages[page_num] - text = page.extract_text() or "" - - # 尝试提取表格 - tables = page.extract_tables() - if tables: - for table in tables: - text += "\n\n" + self._table_to_markdown(table) - - return text -``` - ---- - -## 8. 性能优化 - -### 8.1 异步处理 -```python -import asyncio -from typing import List - -async def convert_pages_async( - self, - pdf_bytes: bytes, - pages: List[PageInfo], -) -> List[str]: - """异步并发处理多页""" - - async def process_page(page_info: PageInfo) -> str: - if page_info.type == PageType.PLAIN_TEXT: - return self._extract_text(pdf_bytes, page_info.num) - else: - return await self._ai_convert_async(pdf_bytes, page_info.num) - - # 使用信号量限制并发 - semaphore = asyncio.Semaphore(self.max_concurrent) - - async def limited_process(page_info): - async with semaphore: - return await process_page(page_info) - - tasks = [limited_process(p) for p in pages] - return await asyncio.gather(*tasks) -``` - -### 8.2 缓存机制 -```python -from functools import lru_cache -import hashlib - -class CachedAIService(AIService): - """带缓存的 AI 服务""" - - @lru_cache(maxsize=100) - def _get_cache_key(self, image_hash: str) -> str | None: - """获取缓存结果""" - # 可接入 Redis 等 - pass - - def image_to_markdown(self, image_stream: io.BytesIO, ...) 
-> str: - # 计算图片哈希 - image_hash = hashlib.md5(image_stream.read()).hexdigest() - image_stream.seek(0) - - # 检查缓存 - cached = self._get_cache_key(image_hash) - if cached: - return cached - - # 调用 AI - result = super().image_to_markdown(image_stream, ...) - - # 存入缓存 - self._cache_result(image_hash, result) - return result -``` - ---- - -## 9. 测试策略 - -### 9.1 测试用例设计 -```python -class TestNovaPdfConverter: - """nova-pdf 转换器测试""" - - def test_plain_text_pdf(self): - """纯文本 PDF 应使用默认解析""" - pass - - def test_pdf_with_images(self): - """包含图片的 PDF 应调用 AI""" - pass - - def test_pdf_with_tables(self): - """包含表格的 PDF 应调用 AI""" - pass - - def test_mixed_content_pdf(self): - """混合内容应正确区分处理""" - pass - - def test_ai_service_fallback(self): - """AI 服务失败时应降级""" - pass - - def test_dpi_settings(self): - """不同 DPI 设置的渲染质量""" - pass - - def test_concurrent_processing(self): - """并发处理性能测试""" - pass -``` - ---- - -## 10. 扩展性设计 - -### 10.1 自定义页面分析器 -```python -class PageAnalyzerPlugin(ABC): - """页面分析器插件接口""" - - @abstractmethod - def analyze(self, page) -> PageType: - """分析页面类型""" - pass - -# 允许用户注入自定义分析器 -class NovaPdfConverter(DocumentConverter): - def __init__( - self, - page_analyzer: PageAnalyzerPlugin | None = None, - ... - ): - self.page_analyzer = page_analyzer or DefaultPageAnalyzer() -``` - -### 10.2 自定义 AI 提示词模板 -```python -PROMPT_TEMPLATES = { - "default": "...", - "academic": "学术论文模板...", - "financial": "财务报表模板...", - "legal": "法律文档模板...", -} - -class AIService: - def __init__(self, prompt_template: str = "default", ...): - self.prompt = PROMPT_TEMPLATES.get(prompt_template, PROMPT_TEMPLATES["default"]) -``` - ---- - -## 11. 风险与缓解措施 - -| 风险 | 影响 | 缓解措施 | -|------|------|----------| -| AI API 调用失败 | 转换中断 | 实现降级策略,回退到默认解析 | -| 大文件内存溢出 | 程序崩溃 | 分页处理,控制内存占用 | -| AI 响应慢 | 用户体验差 | 异步处理、进度反馈、超时控制 | -| 解析质量不稳定 | 输出错误 | 多模型对比、人工审核机制 | -| API 费用过高 | 成本失控 | 智能跳过纯文本页面、缓存机制 | - ---- - -## 12. 
实施计划 - -### ✅ Phase 1: 基础框架(已完成) -- [x] 创建项目结构 -- [x] 实现插件注册入口 -- [x] 实现基础转换器框架 - -### ✅ Phase 2: 核心功能(已完成) -- [x] 实现页面内容检测 (`_page_analyzer.py`) -- [x] 实现页面截图渲染 (`_page_renderer.py`) -- [x] 实现 AI 服务接口 (`_ai_service.py`) -- [x] 实现完整转换流程 (`_converter.py`) - -### ⏳ Phase 3: 测试与优化(待进行) -- [ ] 运行单元测试 -- [ ] 添加测试 PDF 样本 -- [ ] 性能测试和优化 - -### ⏳ Phase 4: 文档与发布(待进行) -- [x] 编写 README 和使用文档 -- [x] 准备示例代码 -- [ ] 打包发布 - ---- - -## 代码结构 - -``` -packages/nova-pdf/ -├── src/nova_pdf/ -│ ├── __about__.py # 版本号 (0.1.0) -│ ├── __init__.py # 导出 register_converters -│ ├── _plugin.py # 插件注册入口 -│ ├── _converter.py # PDF 转换器核心 -│ ├── _page_analyzer.py # 图片/表格检测 -│ ├── _page_renderer.py # 页面截图 (pdfplumber.to_image) -│ └── _ai_service.py # AI 接口封装(两步上传) -├── tests/ -│ ├── test_analyzer.py # 分析器测试 -│ ├── test_converter.py # 转换器测试 -│ └── test_ai_service.py # AI 服务测试 -├── pyproject.toml # 项目配置 + nova-pdf 配置 -└── README.md # 使用文档 -``` - -**语法验证**: ✓ 所有 Python 文件通过语法检查 - ---- - -## 15. 改造完成总结 - -### 15.1 主要变更 - -| 文件 | 变更内容 | -|------|----------| -| `_ai_service.py` | 重写为两步调用:上传 → Workflow | -| `_plugin.py` | 适配新 AIService 初始化参数 | -| `_converter.py` | 传递文件名给 AI 服务 | -| `pyproject.toml` | 添加 `[tool.nova-pdf]` 配置段 | -| `README.md` | 更新环境变量和配置说明 | -| `tests/test_ai_service.py` | 新增 AI 服务测试(13 个用例)| - -### 15.2 环境变量 - -```bash -export NOVA_UPLOAD_TOKEN="your-fastgpt-token" # 必需 -export NOVA_WORKFLOW_TOKEN="your-workflow-token" # 必需 -export NOVA_BASE_URL="https://xny-test.glodon.com/jsf-ai" # 可选 -export NOVA_APP_ID="69fc37113fedac1eaaf65c82" # 可选 -``` - -### 15.3 快速开始 - -```python -from markitdown import MarkItDown - -# 启用插件 -md = MarkItDown(enable_plugins=True) - -# 转换 PDF(复杂页面自动调用 AI) -result = md.convert("document.pdf") -print(result.markdown) -``` - -### 15.4 实测结果 - -**测试图片**: `数位顺序表.png` (22KB) - -**测试结果**: ✓ 成功转换 - -```markdown -| | 整数部分 | | | | | | | 小数部分 | | | | | -|:---:|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---| -| 数位 | ...... | 万位 | 千位 | 百位 | 十位 | 个位 | . 
| 十分位 | 百分位 | 千分位 | 万分位 | ...... | -| 单位 | ...... | 万 | 千 | 百 | 十 | 个 | | 十分之一 0.1 | 百分之一 0.01 | 千分之一 0.001 | 万分之一 0.0001 | ...... | -``` - -**关键修正**: -1. 上传接口返回 `code: 200`(不是 0) -2. Workflow 接口需要 `messages` 字段(OpenAI 兼容格式) -3. SSL 验证跳过(`verify=False`)以适配内部 API - ---- - -## 13. 附录 - -### 13.1 参考实现 -- `markitdown-ocr`: 已有的 OCR 插件,可参考架构 -- `markitdown-sample-plugin`: 官方插件示例 -- `_pdf_converter.py`: 默认 PDF 转换器实现 - -### 13.2 关键代码参考 -```python -# 参考 markitdown-ocr 的插件注册方式 -def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: - PRIORITY_NOVA_PDF = -1.0 # 优先于默认 PDF 转换器 - - llm_client = kwargs.get("llm_client") - llm_model = kwargs.get("llm_model", "gpt-4o") - - ai_service = None - if llm_client: - ai_service = AIService(client=llm_client, model=llm_model) - - markitdown.register_converter( - NovaPdfConverter(ai_service=ai_service), - priority=PRIORITY_NOVA_PDF, - ) - -# 页面截图渲染(简化版) -def render_page_to_image(page, dpi: int = 150) -> io.BytesIO: - """使用 pdfplumber.to_image 渲染页面""" - page_image = page.to_image(resolution=dpi) - img_stream = io.BytesIO() - page_image.original.save(img_stream, format="PNG") - img_stream.seek(0) - return img_stream -``` - ---- - -## 14. AI 接口改造方案(自定义两步调用) - -### 14.1 背景 - -原方案使用 OpenAI 兼容的 base64 图片上传方式,现需改造为自定义两步流程: -1. 上传图片到文件服务,获取 URL -2. 
调用 Workflow 接口处理图片 - -### 14.2 接口分析 - -#### Step 1: 文件上传接口 - -**请求** -``` -POST https://xny-test.glodon.com/jsf-ai/api/common/file/upload -Content-Type: multipart/form-data -Cookie: fastgpt_token= -``` - -**表单参数** -| 字段 | 类型 | 必填 | 说明 | -|------|------|------|------| -| metadata | string | ✓ | JSON 字符串,如 `{"chatId":""}`,每次动态生成 | -| bucketName | string | ✓ | 固定值 `chat` | -| file | binary | ✓ | 图片文件(PNG/JPEG) | -| data | string | ✓ | JSON 字符串,如 `{"appId":"69fc37113fedac1eaaf65c82"}` | - -**响应示例** -```json -{ - "code": 200, - "data": { - "previewUrl": "https://xny-test.glodon.com/jsf-ai/api/common/file/read/xxx.png?token=...", - "fileId": "69fc42e024457b47b7e22b4a" - } -} -``` - -> 注意:接口返回 `code: 200` 表示成功(不是 0) - -#### Step 2: Workflow 调用接口 - -**请求** -``` -POST https://xny-test.glodon.com/jsf-ai/api/v1/chat/completions -Content-Type: application/json -Authorization: Bearer -``` - -**请求体**(OpenAI 兼容格式) -```json -{ - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "请将这张图片的内容转换为 Markdown 格式。"}, - {"type": "image_url", "image_url": {"url": "https://...previewUrl..."}} - ] - } - ] -} -``` - -**响应示例**(OpenAI 兼容格式) -```json -{ - "id": "", - "model": "", - "choices": [ - { - "message": { - "role": "assistant", - "content": "| 数位顺序表 |\n|---|" - } - } - ] -} -``` - -> 注意:Workflow 接口使用 OpenAI 兼容的消息格式,需要 `messages` 字段 - -### 14.3 改造后的 AIService - -```python -"""AI service with custom two-step API calls.""" - -import io -import json -import requests -from dataclasses import dataclass -from typing import Any, BinaryIO, Optional - - -@dataclass -class AIResult: - """Result from AI conversion.""" - text: str - success: bool = True - error: Optional[str] = None - - -class AIService: - """ - AI 服务 - 自定义两步调用方式 - - 流程: - 1. 上传图片到文件服务,获取 previewUrl - 2. 
调用 Workflow 接口,传入 fileUrls 参数 - """ - - def __init__( - self, - base_url: str = "https://xny-test.glodon.com/jsf-ai", - upload_token: str = "", # fastgpt_token (Cookie) - workflow_token: str = "", # workflow_image2markdown_key (Authorization) - chat_id: str = "", # 用于上传接口的 chatId - app_id: str = "", # 用于上传接口的 appId - timeout: int = 60, - ): - """ - 初始化 AI 服务 - - Args: - base_url: API 基础地址 - upload_token: 文件上传认证 token(fastgpt_token) - workflow_token: Workflow 接口认证 token - chat_id: 会话 ID - app_id: 应用 ID - timeout: 请求超时时间(秒) - """ - self.base_url = base_url.rstrip("/") - self.upload_token = upload_token - self.workflow_token = workflow_token - self.chat_id = chat_id - self.app_id = app_id - self.timeout = timeout - - def image_to_markdown( - self, - image_stream: BinaryIO, - filename: str = "page.png", - ) -> AIResult: - """ - 将图片转换为 Markdown(两步调用) - - Args: - image_stream: 图片流 - filename: 文件名 - - Returns: - AIResult: 转换结果 - """ - try: - # Step 1: 上传图片 - upload_result = self._upload_file(image_stream, filename) - if not upload_result["success"]: - return AIResult( - text="", - success=False, - error=f"Upload failed: {upload_result.get('error')}" - ) - - file_url = upload_result["preview_url"] - - # Step 2: 调用 Workflow - workflow_result = self._call_workflow(file_url) - if not workflow_result["success"]: - return AIResult( - text="", - success=False, - error=f"Workflow failed: {workflow_result.get('error')}" - ) - - return AIResult( - text=workflow_result["text"], - success=True, - ) - - except Exception as e: - return AIResult( - text="", - success=False, - error=str(e), - ) - - def _upload_file( - self, - image_stream: BinaryIO, - filename: str, - ) -> dict: - """ - 上传文件到文件服务 - - Args: - image_stream: 图片流 - filename: 文件名 - - Returns: - dict: {"success": bool, "preview_url": str, "error": str} - """ - url = f"{self.base_url}/api/common/file/upload" - - # 准备 multipart/form-data - files = { - "file": (filename, image_stream, "image/png") - } - - data = { - "metadata": 
json.dumps({"chatId": self.chat_id}), - "bucketName": "chat", - "data": json.dumps({"appId": self.app_id}), - } - - headers = { - "Cookie": f"fastgpt_token={self.upload_token}", - } - - try: - response = requests.post( - url, - files=files, - data=data, - headers=headers, - timeout=self.timeout, - ) - response.raise_for_status() - - result = response.json() - - if result.get("code") == 0 and result.get("data", {}).get("previewUrl"): - return { - "success": True, - "preview_url": result["data"]["previewUrl"], - } - else: - return { - "success": False, - "error": result.get("message", "Unknown error"), - } - - except requests.RequestException as e: - return { - "success": False, - "error": str(e), - } - - def _call_workflow(self, file_url: str) -> dict: - """ - 调用 Workflow 接口处理图片 - - Args: - file_url: 文件 URL - - Returns: - dict: {"success": bool, "text": str, "error": str} - """ - url = f"{self.base_url}/api/v1/chat/completions" - - headers = { - "Authorization": f"Bearer {self.workflow_token}", - "Content-Type": "application/json", - } - - payload = { - "fileUrls": [file_url], - } - - try: - response = requests.post( - url, - json=payload, - headers=headers, - timeout=self.timeout, - ) - response.raise_for_status() - - result = response.json() - - # 解析 OpenAI 兼容响应格式 - choices = result.get("choices", []) - if choices: - content = choices[0].get("message", {}).get("content", "") - return { - "success": True, - "text": content.strip(), - } - else: - return { - "success": False, - "error": "No response content", - } - - except requests.RequestException as e: - return { - "success": False, - "error": str(e), - } -``` - -### 14.4 使用示例 - -```python -from markitdown import MarkItDown -from nova_pdf import AIService, NovaPdfConverter - -# 创建自定义 AI 服务 -ai_service = AIService( - base_url="https://xny-test.glodon.com/jsf-ai", - upload_token="", # fastgpt_token - workflow_token="your-workflow-token", - chat_id="tv1cyJFTt4wEKLqTKEx1KPEN", - app_id="69fc37113fedac1eaaf65c82", - 
timeout=120, -) - -# 创建转换器 -converter = NovaPdfConverter( - ai_service=ai_service, - dpi=150, -) - -# 手动注册 -md = MarkItDown(enable_plugins=False) -md.register_converter(converter, priority=-1.0) - -# 转换 PDF -result = md.convert("document.pdf") -print(result.markdown) -``` - -### 14.5 配置参数说明 - -| 参数 | 类型 | 必填 | 说明 | -|------|------|------|------| -| `base_url` | str | ✓ | API 基础地址 | -| `upload_token` | str | ✓ | 文件上传认证 token(fastgpt_token) | -| `workflow_token` | str | ✓ | Workflow 接口认证 token | -| `chat_id` | str | ✓ | 会话 ID(用于上传接口) | -| `app_id` | str | ✓ | 应用 ID(用于上传接口) | -| `timeout` | int | | 超时时间,默认 60 秒 | - -### 14.6 错误处理 - -```python -def image_to_markdown(self, image_stream, filename="page.png") -> AIResult: - """带完善错误处理的转换""" - try: - # Step 1: 上传 - upload_result = self._upload_file(image_stream, filename) - if not upload_result["success"]: - # 上传失败,返回详细错误 - return AIResult( - text="", - success=False, - error=f"上传失败: {upload_result.get('error')}" - ) - - # Step 2: Workflow - workflow_result = self._call_workflow(upload_result["preview_url"]) - if not workflow_result["success"]: - # Workflow 失败,返回详细错误 - return AIResult( - text="", - success=False, - error=f"AI 处理失败: {workflow_result.get('error')}" - ) - - return AIResult( - text=workflow_result["text"], - success=True, - ) - - except requests.Timeout: - return AIResult( - text="", - success=False, - error="请求超时,请检查网络或增加 timeout 设置" - ) - except requests.ConnectionError: - return AIResult( - text="", - success=False, - error="网络连接失败,请检查网络设置" - ) - except json.JSONDecodeError: - return AIResult( - text="", - success=False, - error="响应解析失败,接口返回非 JSON 格式" - ) - except Exception as e: - return AIResult( - text="", - success=False, - error=f"未知错误: {str(e)}" - ) -``` - -### 14.7 与原方案的对比 - -| 对比项 | 原方案(base64) | 新方案(两步上传) | -|--------|-----------------|-------------------| -| 图片传输 | base64 内嵌 | URL 引用 | -| 请求大小 | 大(含图片数据) | 小(仅 URL) | -| 适用场景 | 小图片 | 大图片、多图片 | -| 依赖 | OpenAI SDK | requests | -| 认证方式 | API Key | 
Token + Cookie | -| 接口格式 | OpenAI 标准 | 自定义 | - -### 14.8 配置确认 - -- [x] ~~`chat_id` 是否需要每次动态生成?~~ **是的,每次生成 UUID** -- [x] ~~`app_id` 是否固定?~~ **是的,固定值** -- [x] ~~`workflow_image2markdown_key` 如何获取?~~ **在 pyproject.toml 中配置** -- [x] ~~是否需要支持并发上传?~~ **否** - -### 14.9 配置文件设计 - -**pyproject.toml 新增配置项** -```toml -[project.optional-dependencies] -nova-api = [ - "requests>=2.28.0", -] - -[tool.nova-pdf] -# AI 服务配置 -base_url = "https://xny-test.glodon.com/jsf-ai" -app_id = "69fc37113fedac1eaaf65c82" -timeout = 120 - -# 认证配置(建议通过环境变量覆盖) -# upload_token = "" # 环境变量: NOVA_UPLOAD_TOKEN -# workflow_token = "" # 环境变量: NOVA_WORKFLOW_TOKEN -``` - -**环境变量** -- `NOVA_UPLOAD_TOKEN`: 上传接口认证 token (fastgpt_token) -- `NOVA_WORKFLOW_TOKEN`: Workflow 接口认证 token -- `NOVA_BASE_URL`: API 基础地址(可选,覆盖配置文件) -- `NOVA_APP_ID`: 应用 ID(可选,覆盖配置文件) diff --git a/docs/paddleocr-plugin-design.md b/docs/paddleocr-plugin-design.md new file mode 100644 index 000000000..8adeb8cfa --- /dev/null +++ b/docs/paddleocr-plugin-design.md @@ -0,0 +1,102 @@ +# markitdown-paddleocr 方案设计 + +## 概述 + +基于百度 PaddleOCR 云端 API 实现的 markitdown OCR 插件,参考 markitdown-glmocr 架构。 + +## 与 glmocr 的核心差异 + +| 维度 | glmocr | paddleocr | +|------|--------|-----------| +| API 风格 | 同步 SDK 调用 | 异步 Job 轮询(submit → poll → fetch result) | +| 认证 | `ZHIPU_API_KEY` | `BAIDU_PADDLE_TOKEN` (bearer token) | +| 结果格式 | SDK 封装对象 | JSONL 流(逐行 JSON,含 layoutParsingResults) | +| 图片处理 | SDK 内置 base64 编码 | 需手动上传文件或传 fileUrl | +| 模型 | glm-ocr | PaddleOCR-VL-1.5 | + +## 架构 + +``` +markitdown-paddleocr/ +├── pyproject.toml +├── README.md +└── src/markitdown_paddleocr/ + ├── __init__.py # 导出 + __plugin_interface_version__ + ├── __about__.py # __version__ + ├── _config.py # PaddleOcrConfig dataclass + ├── _paddle_client.py # PaddleOCR API 客户端(submit/poll/fetch) + ├── _converter.py # PaddleOcrConverter(DocumentConverter) + └── _plugin.py # register_converters 入口 +``` + +## 核心流程 + +``` +文件输入 (PDF/图片) + │ + ▼ +PaddleOcrConverter.convert() + │ + ├─ 图片文件 ──► 
_convert_image() ──► PaddleClient.ocr() ──► markdown + │ + └─ PDF 文件 ──► _convert_pdf() + │ + ├─ 逐页分析 (pdfplumber) + ├─ 纯文本页 ──► pdfplumber 提取 + └─ 复杂页 ──► 渲染为图片 ──► PaddleClient.ocr() ──► markdown +``` + +## PaddleClient 核心逻辑 + +```python +class PaddleClient: + JOB_URL = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs" + + def ocr(self, file_bytes, filename=None, file_url=None) -> str: + # 1. 提交 Job(本地文件用 multipart,URL 用 JSON) + job_id = self._submit(file_bytes, filename, file_url) + # 2. 轮询 Job 状态(pending → running → done) + result_url = self._poll(job_id) + # 3. 获取 JSONL 结果,拼接 markdown + return self._fetch_markdown(result_url) +``` + +## 关键设计决策 + +1. **异步轮询间隔**: 默认 2s,可配置,最大等待 300s +2. **PDF 处理策略**: 与 glmocr 一致,纯文本页用 pdfplumber,复杂页用 OCR +3. **图片上传**: 使用 multipart/form-data 上传本地文件;支持 fileUrl 模式 +4. **结果解析**: 从 JSONL 的 `layoutParsingResults[].markdown.text` 提取 markdown +5. **环境变量**: `BAIDU_PADDLE_TOKEN`(必需),`PADDLE_OCR_MODEL`(默认 PaddleOCR-VL-1.5) +6. **可选参数**: `useDocOrientationClassify`, `useDocUnwarping`, `useChartRecognition` + +## 依赖 + +``` +markitdown>=0.1.0 +pdfminer.six>=20251230 +pdfplumber>=0.11.9 +Pillow>=9.0.0 +requests>=2.28.0 +``` + +## 入口点 + +```toml +[project.entry-points."markitdown.plugin"] +markitdown_paddleocr = "markitdown_paddleocr" +``` + +## 使用方式 + +```bash +# 环境变量 +export BAIDU_PADDLE_TOKEN="your-token" + +# CLI +markitdown -p document.pdf + +# Python +from markitdown_paddleocr import PaddleOcrConverter +converter = PaddleOcrConverter(token="your-token") +``` diff --git "a/docs/panddle\347\244\272\344\276\213\344\273\243\347\240\201.md" "b/docs/panddle\347\244\272\344\276\213\344\273\243\347\240\201.md" new file mode 100644 index 000000000..b1d68059a --- /dev/null +++ "b/docs/panddle\347\244\272\344\276\213\344\273\243\347\240\201.md" @@ -0,0 +1,122 @@ +# Please make sure the requests library is installed +# pip install requests +import json +import os +import requests +import sys +import time + +JOB_URL = 
"https://paddleocr.aistudio-app.com/api/v2/ocr/jobs" +TOKEN = os.environ.get("BAIDU_PADDLE_TOKEN", "") +MODEL = "PaddleOCR-VL-1.5" + +file_path = "" + +headers = { + "Authorization": f"bearer {TOKEN}", +} + +optional_payload = { + "useDocOrientationClassify": False, + "useDocUnwarping": False, + "useChartRecognition": False, +} + +print(f"Processing file: {file_path}") + +if file_path.startswith("http"): + # URL Mode + headers["Content-Type"] = "application/json" + payload = { + "fileUrl": file_path, + "model": MODEL, + "optionalPayload": optional_payload + } + job_response = requests.post(JOB_URL, json=payload, headers=headers) +else: + # Local File Mode + if not os.path.exists(file_path): + print(f"Error: File not found at {file_path}") + sys.exit(1) + + data = { + "model": MODEL, + "optionalPayload": json.dumps(optional_payload) + } + + with open(file_path, "rb") as f: + files = {"file": f} + job_response = requests.post(JOB_URL, headers=headers, data=data, files=files) + +print(f"Response status: {job_response.status_code}") +if job_response.status_code != 200: + print(f"Response content: {job_response.text}") + +assert job_response.status_code == 200 +jobId = job_response.json()["data"]["jobId"] +print(f"Job submitted successfully. 
job id: {jobId}") +print("Start polling for results") + +jsonl_url = "" +while True: + job_result_response = requests.get(f"{JOB_URL}/{jobId}", headers=headers) + assert job_result_response.status_code == 200 + state = job_result_response.json()["data"]["state"] + if state == 'pending': + print("The current status of the job is pending") + elif state == 'running': + try: + total_pages = job_result_response.json()['data']['extractProgress']['totalPages'] + extracted_pages = job_result_response.json()['data']['extractProgress']['extractedPages'] + print(f"The current status of the job is running, total pages: {total_pages}, extracted pages: {extracted_pages}") + except KeyError: + print("The current status of the job is running...") + elif state == 'done': + extracted_pages = job_result_response.json()['data']['extractProgress']['extractedPages'] + start_time = job_result_response.json()['data']['extractProgress']['startTime'] + end_time = job_result_response.json()['data']['extractProgress']['endTime'] + print(f"Job completed, successfully extracted pages: {extracted_pages}, start time: {start_time}, end time: {end_time}") + jsonl_url = job_result_response.json()['data']['resultUrl']['jsonUrl'] + break + elif state == "failed": + error_msg = job_result_response.json()['data']['errorMsg'] + print(f"Job failed, failure reason:{error_msg}") + sys.exit() + + time.sleep(5) + +if jsonl_url: + jsonl_response = requests.get(jsonl_url) + jsonl_response.raise_for_status() + lines = jsonl_response.text.strip().split('\n') + output_dir = "output" + os.makedirs(output_dir, exist_ok=True) + page_num = 0 + for line_num, line in enumerate(lines, start=1): + line = line.strip() + if not line: + continue + result = json.loads(line)["result"] + for i, res in enumerate(result["layoutParsingResults"]): + md_filename = os.path.join(output_dir, f"doc_{page_num}.md") + with open(md_filename, "w", encoding="utf-8") as md_file: + md_file.write(res["markdown"]["text"]) + print(f"Markdown 
document saved at {md_filename}") + for img_path, img in res["markdown"]["images"].items(): + full_img_path = os.path.join(output_dir, img_path) + os.makedirs(os.path.dirname(full_img_path), exist_ok=True) + img_bytes = requests.get(img).content + with open(full_img_path, "wb") as img_file: + img_file.write(img_bytes) + print(f"Image saved to: {full_img_path}") + for img_name, img in res["outputImages"].items(): + img_response = requests.get(img) + if img_response.status_code == 200: + # Save image to local + filename = os.path.join(output_dir, f"{img_name}_{page_num}.jpg") + with open(filename, "wb") as f: + f.write(img_response.content) + print(f"Image saved to: {filename}") + else: + print(f"Failed to download image, status code: {img_response.status_code}") + page_num += 1 diff --git a/packages/markitdown-paddleocr/README.md b/packages/markitdown-paddleocr/README.md new file mode 100644 index 000000000..4685a343f --- /dev/null +++ b/packages/markitdown-paddleocr/README.md @@ -0,0 +1,157 @@ +# markitdown-paddleocr + +智能 PDF/图片转 Markdown 插件,使用百度 PaddleOCR 云端 API 驱动的 OCR 识别。 + +## 特性 + +- 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格) +- 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低 +- 🤖 **AI 增强**:复杂页面(图片、表格)使用 PaddleOCR API 转换为 Markdown +- 🔄 **异步 Job 模型**:提交 OCR 任务 → 轮询状态 → 获取结果 +- 📊 **结构化输出**:返回 Markdown(含表格、公式、图表等) + +## 安装 + +```bash +pip install markitdown-paddleocr +``` + +## 配置 + +### 环境变量(推荐) + +```bash +# 必需:百度 PaddleOCR Token +export BAIDU_PADDLE_TOKEN="your-paddle-token" + +# 可选 +export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称 +``` + +### 配置优先级 + +``` +构造函数参数 > 环境变量 > 内置默认值 +``` + +## 使用方法 + +### 命令行(推荐) + +```bash +# 1. 设置 Token +export BAIDU_PADDLE_TOKEN="your-token" + +# 2. 查看已安装插件 +markitdown --list-plugins + +# 3. 使用插件转换 PDF +markitdown -p document.pdf + +# 4. 
保存到文件 +markitdown -p document.pdf -o output.md +``` + +### Python API + +```python +from markitdown import MarkItDown +from markitdown_paddleocr import PaddleOcrConverter + +# 方式1:自动从环境变量读取 BAIDU_PADDLE_TOKEN +converter = PaddleOcrConverter() +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +print(result.markdown) + +# 方式2:手动传入 Token +converter = PaddleOcrConverter(token="your-token") +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +print(result.markdown) + +# 方式3:强制所有页面使用 OCR +converter = PaddleOcrConverter(token="your-token", force_ai=True) +md = MarkItDown(enable_plugins=False) +md.register_converter(converter, priority=-1.0) +result = md.convert("document.pdf") +print(result.markdown) +``` + +### 直接使用 PaddleClient + +```python +from markitdown_paddleocr import PaddleClient + +client = PaddleClient(token="your-token") + +# 本地文件 +markdown = client.ocr(file_bytes=open("image.png", "rb").read(), filename="image.png") +print(markdown) + +# URL 模式 +markdown = client.ocr(file_url="https://example.com/document.pdf") +print(markdown) +``` + +## 配置选项 + +### PaddleOcrConverter 参数 + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token | +| `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 | +| `poll_interval` | float | 2.0 | 轮询间隔(秒) | +| `poll_timeout` | float | 300.0 | 轮询超时(秒) | +| `force_ai` | bool | False | 强制所有页面使用 OCR | +| `use_doc_orientation_classify` | bool | False | 文档方向分类 | +| `use_doc_unwarping` | bool | False | 文档去扭曲 | +| `use_chart_recognition` | bool | False | 图表识别 | + +### 环境变量 + +| 变量 | 说明 | 示例 | +|------|------|------| +| `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` | +| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` | + +## 工作原理 + +``` +PDF/图片 输入 + │ + ▼ +PaddleOcrConverter.convert() + │ + ├─ 图片文件 ──► PaddleClient.ocr() ──► markdown + │ + 
└─ PDF 文件 ──► 逐页分析内容类型 + │ + ├─ 纯文本页 ──► pdfplumber 提取文本 + │ + └─ 复杂页(图片/表格) + │ + └─► 渲染为图片 ──► PaddleClient.ocr() + │ + ├─ POST /api/v2/ocr/jobs (提交 Job) + ├─ GET /api/v2/ocr/jobs/{id} (轮询状态) + └─ GET jsonUrl (获取 JSONL 结果) + │ + ▼ +合并输出完整 Markdown +``` + +## 依赖 + +- `markitdown>=0.1.0` - 基础框架 +- `pdfplumber>=0.11.9` - PDF 解析和截图 +- `pdfminer.six>=20251230` - 文本提取备用 +- `Pillow>=9.0.0` - 图像处理 +- `requests>=2.28.0` - HTTP 请求 + +## 许可证 + +MIT diff --git a/packages/markitdown-paddleocr/pyproject.toml b/packages/markitdown-paddleocr/pyproject.toml new file mode 100644 index 000000000..f3326cd04 --- /dev/null +++ b/packages/markitdown-paddleocr/pyproject.toml @@ -0,0 +1,58 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "markitdown-paddleocr" +dynamic = ["version"] +description = "Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +keywords = ["markitdown", "pdf", "ocr", "paddleocr", "baidu", "vision"] +authors = [ + { name = "Contributors", email = "noreply@github.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +dependencies = [ + "markitdown>=0.1.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", + "Pillow>=9.0.0", + "requests>=2.28.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", +] + +[project.urls] +Documentation = "https://github.com/microsoft/markitdown#readme" +Issues = "https://github.com/microsoft/markitdown/issues" +Source = "https://github.com/microsoft/markitdown" + +[tool.hatch.version] +path = "src/markitdown_paddleocr/__about__.py" + +# Plugin entry point - MarkItDown will discover this plugin +[project.entry-points."markitdown.plugin"] +markitdown_paddleocr = 
"markitdown_paddleocr" + +[tool.hatch.build.targets.sdist] +only-include = ["src/markitdown_paddleocr"] + +[tool.hatch.build.targets.wheel] +packages = ["src/markitdown_paddleocr"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__init__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__init__.py new file mode 100644 index 000000000..00b431621 --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__init__.py @@ -0,0 +1,16 @@ +"""markitdown-paddleocr: PDF/Image to Markdown converter using PaddleOCR cloud API.""" + +from ._plugin import register_converters +from ._config import PaddleOcrConfig +from ._converter import PaddleOcrConverter +from ._paddle_client import PaddleClient +from ._dual_converter import DualOcrConverter + +__plugin_interface_version__ = 1 +__all__ = [ + "register_converters", + "PaddleOcrConfig", + "PaddleOcrConverter", + "PaddleClient", + "DualOcrConverter", +] diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py new file mode 100644 index 000000000..51fc00d60 --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py @@ -0,0 +1,46 @@ +"""Configuration for markitdown-paddleocr.""" + +import os +from dataclasses import dataclass + + +@dataclass +class PaddleOcrConfig: + """markitdown-paddleocr configuration. + + Configuration priority (high to low): + 1. Constructor kwargs + 2. Environment variables + 3. 
Built-in defaults + """ + + # API configuration + token: str = "" # Reads from BAIDU_PADDLE_TOKEN by default + + # OCR model + model: str = "PaddleOCR-VL-1.5" + + # API endpoint + job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs" + + # Polling configuration + poll_interval: float = 2.0 # seconds between polls + poll_timeout: float = 300.0 # max seconds to wait for job completion + + # Optional OCR features + use_doc_orientation_classify: bool = False + use_doc_unwarping: bool = False + use_chart_recognition: bool = False + + # Processing strategy + force_ai: bool = False + + @classmethod + def from_env(cls, **overrides) -> "PaddleOcrConfig": + """Create config from environment variables with optional overrides.""" + defaults = { + "token": os.environ.get("BAIDU_PADDLE_TOKEN", ""), + "model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.5"), + } + defaults.update(overrides) + return cls(**defaults) diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py new file mode 100644 index 000000000..6d8ae5e63 --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py @@ -0,0 +1,304 @@ +"""PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API.""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo +from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +from ._config import PaddleOcrConfig +from ._paddle_client import PaddleClient + +# Import PDF dependencies +_dependency_exc_info = None +try: + import pdfminer + import pdfminer.high_level + import pdfplumber +except ImportError: + _dependency_exc_info = sys.exc_info() + + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/pdf", + "application/x-pdf", + "image/jpeg", + "image/png", +] + +ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", 
".jpeg", ".png"] + + +class PaddleOcrConverter(DocumentConverter): + """Intelligent PDF/Image converter using PaddleOCR cloud API. + + Features: + - Auto-detect page content type (plain text vs images/tables) + - Plain text pages use pdfplumber/pdfminer (fast, free) + - Complex pages use PaddleOCR API for AI-powered OCR + - Image files (PNG, JPG) use PaddleOCR API directly + - Asynchronous job model: submit → poll → fetch result + """ + + def __init__( + self, + token: Optional[str] = None, + model: str = "PaddleOCR-VL-1.5", + poll_interval: float = 2.0, + poll_timeout: float = 300.0, + force_ai: bool = False, + use_doc_orientation_classify: bool = False, + use_doc_unwarping: bool = False, + use_chart_recognition: bool = False, + config: Optional[PaddleOcrConfig] = None, + ): + """Initialize converter. + + Args: + token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided) + model: OCR model name (default: PaddleOCR-VL-1.5) + poll_interval: Seconds between status polls (default: 2.0) + poll_timeout: Max seconds to wait for job completion (default: 300.0) + force_ai: Force all pages to use OCR (default: False) + use_doc_orientation_classify: Enable document orientation classification + use_doc_unwarping: Enable document unwarping + use_chart_recognition: Enable chart recognition + config: Optional PaddleOcrConfig instance + """ + # Build config from explicit params or provided config + if config: + self.token = token or config.token + self.model = model if model != "PaddleOCR-VL-1.5" else config.model + self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval + self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout + self.force_ai = force_ai or config.force_ai + self.use_doc_orientation_classify = use_doc_orientation_classify or config.use_doc_orientation_classify + self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping + self.use_chart_recognition = 
use_chart_recognition or config.use_chart_recognition + else: + self.token = token + self.model = model + self.poll_interval = poll_interval + self.poll_timeout = poll_timeout + self.force_ai = force_ai + self.use_doc_orientation_classify = use_doc_orientation_classify + self.use_doc_unwarping = use_doc_unwarping + self.use_chart_recognition = use_chart_recognition + + # Lazy init client + self._client: Optional[PaddleClient] = None + + def _get_client(self) -> PaddleClient: + """Get or create PaddleClient instance.""" + if self._client is None: + config = PaddleOcrConfig( + token=self.token or "", + model=self.model, + poll_interval=self.poll_interval, + poll_timeout=self.poll_timeout, + force_ai=self.force_ai, + use_doc_orientation_classify=self.use_doc_orientation_classify, + use_doc_unwarping=self.use_doc_unwarping, + use_chart_recognition=self.use_chart_recognition, + ) + self._client = PaddleClient(config=config) + return self._client + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pdf", + feature="pdf", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) + + extension = (stream_info.extension or "").lower() + + # Image files: use PaddleOCR directly + if extension in (".jpg", ".jpeg", ".png"): + return self._convert_image(file_stream, extension) + + # PDF files: use hybrid approach + return self._convert_pdf(file_stream) + 
+ def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult: + """Convert image file using PaddleOCR API.""" + img_bytes = file_stream.read() + filename = f"image{extension}" + + try: + markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename) + return DocumentConverterResult(markdown=markdown) + except Exception as e: + return DocumentConverterResult( + markdown=f"" + ) + + def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: + """Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages).""" + pdf_stream = io.BytesIO(file_stream.read()) + markdown_parts = [] + + try: + with pdfplumber.open(pdf_stream) as pdf: + for page_num, page in enumerate(pdf.pages): + # Analyze page type + page_type = self._analyze_page(page) + + # Choose processing method + if self.force_ai or page_type != "plain_text": + # Complex content: use PaddleOCR + markdown = self._convert_with_paddleocr(page, page_num) + else: + # Plain text: use pdfplumber + markdown = self._extract_text_with_tables(page) + + if markdown.strip(): + markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") + + page.close() + + markdown = "\n\n".join(markdown_parts).strip() + + except Exception: + # Fallback to pdfminer + pdf_stream.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_stream) or "" + + # Final fallback + if not markdown: + pdf_stream.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_stream) or "" + + return DocumentConverterResult(markdown=markdown) + + def _analyze_page(self, page: Any) -> str: + """Analyze page content type.""" + # Check for images + if hasattr(page, "images") and page.images: + return "complex" + + # Check for tables + tables = page.find_tables() + if tables: + return "complex" + + # Check for graphics/curves + if hasattr(page, "curves") and page.curves: + return "complex" + + return "plain_text" + + def _convert_with_paddleocr(self, page: Any, page_num: 
int) -> str: + """Convert page using PaddleOCR API.""" + try: + # Render page to image + img = page.to_image(resolution=150) + img_bytes = io.BytesIO() + img.save(img_bytes, format="PNG") + + markdown = self._get_client().ocr( + file_bytes=img_bytes.getvalue(), + filename=f"page_{page_num + 1}.png", + ) + return markdown + + except Exception: + # Fallback to pdfplumber text extraction + return self._extract_text_with_tables(page) + + def _extract_text_with_tables(self, page: Any) -> str: + """Extract text and tables from page.""" + parts = [] + + # Extract text + text = page.extract_text() or "" + if text.strip(): + parts.append(text.strip()) + + # Extract tables + try: + tables = page.extract_tables() + if tables: + for table in tables: + if table: + md_table = self._table_to_markdown(table) + if md_table.strip(): + parts.append(md_table) + except Exception: + pass + + return "\n\n".join(parts) + + def _table_to_markdown(self, table: list[list[str]]) -> str: + """Convert table to Markdown.""" + if not table: + return "" + + # Filter None values + table = [[cell if cell is not None else "" for cell in row] for row in table] + + # Filter empty rows + table = [row for row in table if any(cell.strip() for cell in row)] + + if not table: + return "" + + # Calculate column widths + col_widths = [ + max(len(str(row[i])) if i < len(row) else 0 for row in table) + for i in range(max(len(row) for row in table)) + ] + + # Format table + lines = [] + for row_idx, row in enumerate(table): + padded_row = row + [""] * (len(col_widths) - len(row)) + line = "| " + " | ".join( + str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) + ) + " |" + lines.append(line) + + if row_idx == 0: + sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|" + lines.append(sep) + + return "\n".join(lines) + + def close(self): + """Close the client.""" + self._client = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() 
diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py new file mode 100644 index 000000000..e27395c4d --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py @@ -0,0 +1,160 @@ +"""DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation.""" + +import logging +from typing import Optional + +from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo +from typing import BinaryIO, Any + +logger = logging.getLogger(__name__) + + +class DualOcrConverter(DocumentConverter): + """Dual OCR converter with automatic fallback: glmocr → paddleocr. + + Usage: + converter = DualOcrConverter() + md = MarkItDown(enable_plugins=False) + md.register_converter(converter, priority=-1.0) + result = md.convert("document.pdf") + """ + + def __init__( + self, + # glmocr kwargs + glmocr_api_key: Optional[str] = None, + glmocr_timeout: int = 1800, + glmocr_enable_layout: bool = False, + glmocr_force_ai: bool = False, + # paddleocr kwargs + paddleocr_token: Optional[str] = None, + paddleocr_model: str = "PaddleOCR-VL-1.5", + paddleocr_poll_interval: float = 2.0, + paddleocr_poll_timeout: float = 300.0, + paddleocr_force_ai: bool = False, + paddleocr_use_doc_orientation_classify: bool = False, + paddleocr_use_doc_unwarping: bool = False, + paddleocr_use_chart_recognition: bool = False, + ): + self.glmocr_kwargs = { + "api_key": glmocr_api_key, + "timeout": glmocr_timeout, + "enable_layout": glmocr_enable_layout, + "force_ai": glmocr_force_ai, + } + self.paddleocr_kwargs = { + "token": paddleocr_token, + "model": paddleocr_model, + "poll_interval": paddleocr_poll_interval, + "poll_timeout": paddleocr_poll_timeout, + "force_ai": paddleocr_force_ai, + "use_doc_orientation_classify": paddleocr_use_doc_orientation_classify, + "use_doc_unwarping": paddleocr_use_doc_unwarping, + 
"use_chart_recognition": paddleocr_use_chart_recognition, + } + + self._primary = None + self._fallback = None + self._init_converters() + + def _init_converters(self): + """Lazily init both converters.""" + try: + from markitdown_glmocr import GlmOcrConverter + # Filter out None values + kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None} + self._primary = GlmOcrConverter(**kwargs) + logger.info("glmocr converter initialized (primary)") + except Exception as e: + logger.warning("glmocr init failed: %s", e) + self._primary = None + + try: + from markitdown_paddleocr import PaddleOcrConverter + kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None} + self._fallback = PaddleOcrConverter(**kwargs) + logger.info("paddleocr converter initialized (fallback)") + except Exception as e: + logger.warning("paddleocr init failed: %s", e) + self._fallback = None + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + """Accept if either converter accepts.""" + if self._primary: + try: + file_stream.seek(0) + if self._primary.accepts(file_stream, stream_info, **kwargs): + return True + except Exception: + pass + + if self._fallback: + try: + file_stream.seek(0) + if self._fallback.accepts(file_stream, stream_info, **kwargs): + return True + except Exception: + pass + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + """Convert with primary, fallback on failure.""" + data = file_stream.read() + + # Try primary (glmocr) + if self._primary: + try: + result = self._primary.convert(io_bytes(data), stream_info, **kwargs) + if result.markdown and result.markdown.strip(): + logger.info("✓ glmocr succeeded") + return result + logger.warning("glmocr returned empty result, falling back") + except Exception as e: + logger.warning("glmocr failed: %s, falling back to paddleocr", e) + + # Fallback (paddleocr) + 
if self._fallback: + try: + result = self._fallback.convert(io_bytes(data), stream_info, **kwargs) + if result.markdown and result.markdown.strip(): + logger.info("✓ paddleocr succeeded (fallback)") + return result + logger.warning("paddleocr returned empty result") + except Exception as e: + logger.error("paddleocr also failed: %s", e) + + # Both failed + return DocumentConverterResult( + markdown="" + ) + + def close(self): + if self._primary and hasattr(self._primary, "close"): + self._primary.close() + if self._fallback and hasattr(self._fallback, "close"): + self._fallback.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +def io_bytes(data: bytes): + """Create a seekable BytesIO from bytes.""" + import io + buf = io.BytesIO(data) + buf.seek(0) + return buf diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_paddle_client.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_paddle_client.py new file mode 100644 index 000000000..ba12e51c9 --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_paddle_client.py @@ -0,0 +1,189 @@ +"""PaddleOCR API Client - handles job submission, polling, and result fetching.""" + +import json +import logging +import time +from typing import Optional + +import requests + +from ._config import PaddleOcrConfig + +logger = logging.getLogger(__name__) + + +class PaddleOcrError(Exception): + """PaddleOCR API error.""" + + pass + + +class PaddleClient: + """Client for PaddleOCR cloud API. + + Workflow: submit job → poll status → fetch JSONL result → extract markdown. 
+ """ + + def __init__(self, config: Optional[PaddleOcrConfig] = None, **kwargs): + if config is None: + config = PaddleOcrConfig(**kwargs) + self.config = config + + # Token from config or env + self.token = config.token + if not self.token: + import os + self.token = os.environ.get("BAIDU_PADDLE_TOKEN", "") + + def _headers(self) -> dict: + """Build authorization headers.""" + return {"Authorization": f"bearer {self.token}"} + + def _optional_payload(self) -> dict: + """Build optional payload flags.""" + return { + "useDocOrientationClassify": self.config.use_doc_orientation_classify, + "useDocUnwarping": self.config.use_doc_unwarping, + "useChartRecognition": self.config.use_chart_recognition, + } + + def ocr( + self, + file_bytes: Optional[bytes] = None, + filename: Optional[str] = None, + file_url: Optional[str] = None, + ) -> str: + """Run OCR on a file or URL, return concatenated markdown. + + Args: + file_bytes: File content bytes (for local file upload). + filename: Filename for multipart upload (e.g. "page.png"). + file_url: File URL (for URL mode, alternative to file_bytes). + + Returns: + Markdown text extracted from all pages. + + Raises: + PaddleOcrError: On API errors or timeout. + """ + # 1. Submit job + job_id = self._submit(file_bytes=file_bytes, filename=filename, file_url=file_url) + logger.info("Job submitted: %s", job_id) + + # 2. Poll until done + result_url = self._poll(job_id) + logger.info("Job completed, result URL obtained") + + # 3. 
Fetch and parse results + return self._fetch_markdown(result_url) + + def _submit( + self, + file_bytes: Optional[bytes] = None, + filename: Optional[str] = None, + file_url: Optional[str] = None, + ) -> str: + """Submit an OCR job, return job ID.""" + headers = self._headers() + + if file_url: + # URL mode + headers["Content-Type"] = "application/json" + payload = { + "fileUrl": file_url, + "model": self.config.model, + "optionalPayload": self._optional_payload(), + } + resp = requests.post(self.config.job_url, json=payload, headers=headers) + elif file_bytes is not None: + # Local file mode - multipart upload + data = { + "model": self.config.model, + "optionalPayload": json.dumps(self._optional_payload()), + } + fname = filename or "document" + files = {"file": (fname, file_bytes)} + resp = requests.post(self.config.job_url, headers=headers, data=data, files=files) + else: + raise PaddleOcrError("Either file_bytes or file_url must be provided") + + if resp.status_code != 200: + raise PaddleOcrError(f"Submit failed (HTTP {resp.status_code}): {resp.text}") + + result = resp.json() + job_id = result.get("data", {}).get("jobId") + if not job_id: + raise PaddleOcrError(f"No jobId in response: {result}") + + return job_id + + def _poll(self, job_id: str) -> str: + """Poll job status until done, return JSONL result URL.""" + headers = self._headers() + url = f"{self.config.job_url}/{job_id}" + start = time.time() + + while True: + resp = requests.get(url, headers=headers) + if resp.status_code != 200: + raise PaddleOcrError(f"Poll failed (HTTP {resp.status_code}): {resp.text}") + + data = resp.json().get("data", {}) + state = data.get("state", "") + + if state == "done": + result_url = data.get("resultUrl", {}).get("jsonUrl", "") + if not result_url: + raise PaddleOcrError("Job done but no resultUrl in response") + return result_url + + if state == "failed": + error_msg = data.get("errorMsg", "Unknown error") + raise PaddleOcrError(f"Job failed: {error_msg}") + + # 
Still pending or running + if state == "running": + progress = data.get("extractProgress", {}) + total = progress.get("totalPages", "?") + extracted = progress.get("extractedPages", "?") + logger.debug("Running: %s/%s pages", extracted, total) + else: + logger.debug("State: %s", state) + + # Check timeout + elapsed = time.time() - start + if elapsed > self.config.poll_timeout: + raise PaddleOcrError( + f"Job polling timed out after {self.config.poll_timeout}s (state={state})" + ) + + time.sleep(self.config.poll_interval) + + def _fetch_markdown(self, jsonl_url: str) -> str: + """Fetch JSONL result and extract markdown from all pages.""" + resp = requests.get(jsonl_url) + resp.raise_for_status() + + markdown_parts = [] + lines = resp.text.strip().split("\n") + + for line in lines: + line = line.strip() + if not line: + continue + + try: + page_data = json.loads(line) + except json.JSONDecodeError: + logger.warning("Skipping invalid JSONL line") + continue + + result = page_data.get("result", {}) + layout_results = result.get("layoutParsingResults", []) + + for layout in layout_results: + md_text = layout.get("markdown", {}).get("text", "") + if md_text.strip(): + markdown_parts.append(md_text.strip()) + + return "\n\n".join(markdown_parts) diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py new file mode 100644 index 000000000..09ae96e6d --- /dev/null +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py @@ -0,0 +1,35 @@ +"""Plugin registration for markitdown-paddleocr.""" + +from typing import Any +from markitdown import MarkItDown + +from ._converter import PaddleOcrConverter + + +__plugin_interface_version__ = 1 + + +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + """Register markitdown-paddleocr converter. + + Config sources (priority high to low): + 1. kwargs parameters + 2. Environment variables (BAIDU_PADDLE_TOKEN) + 3. 
Built-in defaults + """ + # Register converter with higher priority than default PDF converter + PRIORITY_PADDLEOCR = -1.0 + + markitdown.register_converter( + PaddleOcrConverter( + token=kwargs.get("token"), + model=kwargs.get("model", "PaddleOCR-VL-1.5"), + poll_interval=kwargs.get("poll_interval", 2.0), + poll_timeout=kwargs.get("poll_timeout", 300.0), + force_ai=kwargs.get("force_ai", False), + use_doc_orientation_classify=kwargs.get("use_doc_orientation_classify", False), + use_doc_unwarping=kwargs.get("use_doc_unwarping", False), + use_chart_recognition=kwargs.get("use_chart_recognition", False), + ), + priority=PRIORITY_PADDLEOCR, + ) diff --git a/packages/markitdown-paddleocr/tests/__init__.py b/packages/markitdown-paddleocr/tests/__init__.py new file mode 100644 index 000000000..4be5c24f3 --- /dev/null +++ b/packages/markitdown-paddleocr/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for markitdown-paddleocr.""" diff --git a/packages/markitdown-paddleocr/tests/test_converter.py b/packages/markitdown-paddleocr/tests/test_converter.py new file mode 100644 index 000000000..aaca74c8c --- /dev/null +++ b/packages/markitdown-paddleocr/tests/test_converter.py @@ -0,0 +1,214 @@ +"""Tests for PaddleOcrConverter.""" + +import io +import pytest +from unittest.mock import MagicMock, patch + +from markitdown_paddleocr._converter import PaddleOcrConverter + + +class TestPaddleOcrConverterAccepts: + """Accepts method tests.""" + + def test_accepts_pdf_extension(self): + """Accept .pdf extension.""" + converter = PaddleOcrConverter() + stream = io.BytesIO(b"%PDF-1.4") + stream_info = MagicMock(extension=".pdf", mimetype=None) + assert converter.accepts(stream, stream_info) is True + + def test_accepts_pdf_mimetype(self): + """Accept PDF MIME type.""" + converter = PaddleOcrConverter() + stream = io.BytesIO(b"%PDF-1.4") + stream_info = MagicMock(extension=None, mimetype="application/pdf") + assert converter.accepts(stream, stream_info) is True + + def 
test_accepts_image_extensions(self): + """Accept image extensions.""" + converter = PaddleOcrConverter() + for ext in [".jpg", ".jpeg", ".png"]: + stream = io.BytesIO(b"fake") + stream_info = MagicMock(extension=ext, mimetype=None) + assert converter.accepts(stream, stream_info) is True + + def test_rejects_non_supported(self): + """Reject non-supported files.""" + converter = PaddleOcrConverter() + stream = io.BytesIO(b"not a pdf") + stream_info = MagicMock(extension=".txt", mimetype="text/plain") + assert converter.accepts(stream, stream_info) is False + + +class TestPaddleOcrConverterTable: + """Table to Markdown conversion tests.""" + + def test_table_to_markdown(self): + """Table to Markdown conversion.""" + converter = PaddleOcrConverter() + table = [ + ["Name", "Age", "City"], + ["Alice", "25", "Beijing"], + ["Bob", "30", "Shanghai"], + ] + result = converter._table_to_markdown(table) + assert "|" in result + assert "Name" in result + assert "Alice" in result + assert "---" in result + + def test_empty_table(self): + """Empty table returns empty string.""" + converter = PaddleOcrConverter() + assert converter._table_to_markdown([]) == "" + + def test_table_with_none_values(self): + """Table with None values.""" + converter = PaddleOcrConverter() + table = [ + ["A", None, "C"], + ["1", "2", None], + ] + result = converter._table_to_markdown(table) + assert "|" in result + assert "A" in result + + +class TestPaddleOcrConverterImage: + """Image conversion tests.""" + + def test_convert_image_success(self): + """Convert image with PaddleOCR success.""" + converter = PaddleOcrConverter(token="test-token") + + mock_client = MagicMock() + mock_client.ocr.return_value = "# Image Title\n\nContent" + converter._client = mock_client + + stream = io.BytesIO(b"fake-image") + stream_info = MagicMock(extension=".png", mimetype="image/png") + result = converter.convert(stream, stream_info) + + assert "# Image Title" in result.markdown + mock_client.ocr.assert_called_once() 
+ + def test_convert_image_error(self): + """Convert image with PaddleOCR error returns comment.""" + converter = PaddleOcrConverter(token="test-token") + + mock_client = MagicMock() + mock_client.ocr.side_effect = Exception("API Error") + converter._client = mock_client + + stream = io.BytesIO(b"fake-image") + stream_info = MagicMock(extension=".png", mimetype="image/png") + result = converter.convert(stream, stream_info) + + assert "Error converting image" in result.markdown + + +class TestPaddleOcrConverterPdf: + """PDF conversion tests.""" + + def test_plain_text_page(self): + """Plain text page uses pdfplumber.""" + converter = PaddleOcrConverter() + + page = MagicMock() + page.images = [] + page.find_tables.return_value = [] + page.extract_tables.return_value = [] + page.extract_text.return_value = "Hello World" + page.close = MagicMock() + + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock(extension=".pdf", mimetype=None)) + + assert "Hello World" in result.markdown + + def test_complex_page_uses_paddleocr(self): + """Complex page uses PaddleOCR.""" + converter = PaddleOcrConverter(token="test-token") + + mock_client = MagicMock() + mock_client.ocr.return_value = "OCR result for complex page" + converter._client = mock_client + + page = MagicMock() + page.images = [MagicMock()] + page.find_tables.return_value = [] + page.to_image.return_value.save = MagicMock( + side_effect=lambda buf, format: buf.write(b"fake-png") + ) + page.close = MagicMock() + + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock(extension=".pdf", 
mimetype=None)) + + mock_client.ocr.assert_called_once() + assert "OCR result" in result.markdown + + def test_force_ai_mode(self): + """Force AI mode uses PaddleOCR for all pages.""" + converter = PaddleOcrConverter(token="test-token", force_ai=True) + + mock_client = MagicMock() + mock_client.ocr.return_value = "AI result" + converter._client = mock_client + + page = MagicMock() + page.images = [] + page.find_tables.return_value = [] + page.to_image.return_value.save = MagicMock( + side_effect=lambda buf, format: buf.write(b"fake-png") + ) + page.close = MagicMock() + + mock_pdf = MagicMock() + mock_pdf.pages = [page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = mock_pdf + stream = io.BytesIO(b"%PDF-1.4") + result = converter.convert(stream, MagicMock(extension=".pdf", mimetype=None)) + + mock_client.ocr.assert_called_once() + + +class TestPaddleOcrConverterConfig: + """Config initialization tests.""" + + def test_default_config(self): + """Default configuration values.""" + converter = PaddleOcrConverter() + assert converter.model == "PaddleOCR-VL-1.5" + assert converter.poll_interval == 2.0 + assert converter.poll_timeout == 300.0 + assert converter.force_ai is False + + def test_custom_config(self): + """Custom configuration values.""" + converter = PaddleOcrConverter( + token="my-token", + model="custom-model", + poll_interval=5.0, + poll_timeout=600.0, + force_ai=True, + use_chart_recognition=True, + ) + assert converter.token == "my-token" + assert converter.model == "custom-model" + assert converter.poll_interval == 5.0 + assert converter.poll_timeout == 600.0 + assert converter.force_ai is True + assert converter.use_chart_recognition is True diff --git a/packages/markitdown-paddleocr/tests/test_paddle_client.py b/packages/markitdown-paddleocr/tests/test_paddle_client.py new file mode 100644 index 000000000..361a329b6 --- /dev/null +++ 
b/packages/markitdown-paddleocr/tests/test_paddle_client.py @@ -0,0 +1,241 @@ +"""Tests for PaddleClient.""" + +import json +import pytest +from unittest.mock import MagicMock, patch + +from markitdown_paddleocr._paddle_client import PaddleClient, PaddleOcrError +from markitdown_paddleocr._config import PaddleOcrConfig + + +class TestPaddleClientInit: + """Client initialization tests.""" + + def test_init_with_token(self): + """Init with explicit token.""" + client = PaddleClient(token="test-token") + assert client.token == "test-token" + + @patch.dict("os.environ", {"BAIDU_PADDLE_TOKEN": "env-token"}) + def test_init_from_env(self): + """Init from environment variable.""" + client = PaddleClient() + assert client.token == "env-token" + + def test_init_with_config(self): + """Init with PaddleOcrConfig.""" + config = PaddleOcrConfig(token="config-token", model="custom-model") + client = PaddleClient(config=config) + assert client.token == "config-token" + assert client.config.model == "custom-model" + + +class TestPaddleClientSubmit: + """Job submission tests.""" + + def test_submit_local_file(self): + """Submit local file via multipart upload.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"jobId": "job-123"}} + + with patch("requests.post", return_value=mock_response) as mock_post: + job_id = client._submit(file_bytes=b"fake-image", filename="test.png") + + assert job_id == "job-123" + # Verify multipart upload was used (files parameter) + call_kwargs = mock_post.call_args + assert "files" in call_kwargs.kwargs or len(call_kwargs.args) > 0 + + def test_submit_url_mode(self): + """Submit file URL via JSON.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"jobId": "job-456"}} + + with patch("requests.post", return_value=mock_response) as mock_post: 
+ job_id = client._submit(file_url="https://example.com/doc.pdf") + + assert job_id == "job-456" + + def test_submit_error(self): + """Submit with API error.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + + with patch("requests.post", return_value=mock_response): + with pytest.raises(PaddleOcrError, match="Submit failed"): + client._submit(file_bytes=b"fake", filename="test.png") + + def test_submit_no_input(self): + """Submit without file or URL raises error.""" + client = PaddleClient(token="test-token") + with pytest.raises(PaddleOcrError, match="Either file_bytes or file_url"): + client._submit() + + +class TestPaddleClientPoll: + """Job polling tests.""" + + def test_poll_done_immediately(self): + """Job is done on first poll.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "state": "done", + "resultUrl": {"jsonUrl": "https://result.url/data.jsonl"}, + } + } + + with patch("requests.get", return_value=mock_response): + result_url = client._poll("job-123") + + assert result_url == "https://result.url/data.jsonl" + + def test_poll_failed(self): + """Job fails.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": {"state": "failed", "errorMsg": "Processing error"} + } + + with patch("requests.get", return_value=mock_response): + with pytest.raises(PaddleOcrError, match="Job failed"): + client._poll("job-123") + + def test_poll_timeout(self): + """Polling timeout.""" + config = PaddleOcrConfig(token="test-token", poll_interval=0.01, poll_timeout=0.05) + client = PaddleClient(config=config) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"state": "pending"}} + + with 
patch("requests.get", return_value=mock_response): + with pytest.raises(PaddleOcrError, match="timed out"): + client._poll("job-123") + + +class TestPaddleClientFetchMarkdown: + """Result fetching tests.""" + + def test_fetch_single_page(self): + """Fetch single page result.""" + client = PaddleClient(token="test-token") + + jsonl_content = json.dumps({ + "result": { + "layoutParsingResults": [ + {"markdown": {"text": "# Title\n\nHello world"}} + ] + } + }) + + mock_response = MagicMock() + mock_response.text = jsonl_content + mock_response.raise_for_status = MagicMock() + + with patch("requests.get", return_value=mock_response): + markdown = client._fetch_markdown("https://result.url/data.jsonl") + + assert "# Title" in markdown + assert "Hello world" in markdown + + def test_fetch_multi_page(self): + """Fetch multi-page result.""" + client = PaddleClient(token="test-token") + + page1 = json.dumps({ + "result": { + "layoutParsingResults": [ + {"markdown": {"text": "Page 1 content"}} + ] + } + }) + page2 = json.dumps({ + "result": { + "layoutParsingResults": [ + {"markdown": {"text": "Page 2 content"}} + ] + } + }) + jsonl_content = f"{page1}\n{page2}" + + mock_response = MagicMock() + mock_response.text = jsonl_content + mock_response.raise_for_status = MagicMock() + + with patch("requests.get", return_value=mock_response): + markdown = client._fetch_markdown("https://result.url/data.jsonl") + + assert "Page 1 content" in markdown + assert "Page 2 content" in markdown + + def test_fetch_empty_result(self): + """Fetch empty result.""" + client = PaddleClient(token="test-token") + + mock_response = MagicMock() + mock_response.text = "" + mock_response.raise_for_status = MagicMock() + + with patch("requests.get", return_value=mock_response): + markdown = client._fetch_markdown("https://result.url/data.jsonl") + + assert markdown == "" + + +class TestPaddleClientOcr: + """Full OCR workflow tests.""" + + def test_ocr_workflow(self): + """Complete OCR workflow: submit → 
poll → fetch.""" + client = PaddleClient(token="test-token") + + # Mock submit + submit_resp = MagicMock() + submit_resp.status_code = 200 + submit_resp.json.return_value = {"data": {"jobId": "job-789"}} + + # Mock poll + poll_resp = MagicMock() + poll_resp.status_code = 200 + poll_resp.json.return_value = { + "data": { + "state": "done", + "resultUrl": {"jsonUrl": "https://result.url/data.jsonl"}, + } + } + + # Mock fetch + jsonl_content = json.dumps({ + "result": { + "layoutParsingResults": [ + {"markdown": {"text": "# OCR Result\n\nExtracted text."}} + ] + } + }) + fetch_resp = MagicMock() + fetch_resp.text = jsonl_content + fetch_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=submit_resp), \ + patch("requests.get", side_effect=[poll_resp, fetch_resp]): + markdown = client.ocr(file_bytes=b"fake-image", filename="test.png") + + assert "# OCR Result" in markdown + assert "Extracted text." in markdown From 8420af6bb2996fc5f483cca2022a77d32e9ca724 Mon Sep 17 00:00:00 2001 From: hankl Date: Thu, 21 May 2026 11:13:38 +0800 Subject: [PATCH 08/15] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=8F=91=E5=B8=83?= =?UTF-8?q?=E5=88=B0Pypi=E7=9A=84=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/markitdown-glmocr/README.md | 48 +++++++++++++++++++++++++ packages/markitdown-paddleocr/README.md | 48 +++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/packages/markitdown-glmocr/README.md b/packages/markitdown-glmocr/README.md index d0193d2ed..15c2b819e 100644 --- a/packages/markitdown-glmocr/README.md +++ b/packages/markitdown-glmocr/README.md @@ -191,6 +191,54 @@ glmocr SDK 返回的结构化数据支持以下标签: - `Pillow>=9.0.0` - 图像处理 - `glmocr` - 智谱 OCR SDK(可选,AI 功能需要) +## 发布到 PyPI + +### 前置条件 + +- 确保已安装 `build` 和 `twine`: + +```bash +pip install build twine +``` + +- 确保环境变量 `PyPI_API_Token` 已设置为你的 PyPI API Token: + +```bash +export PyPI_API_Token="pypi-..." 
+``` + +### 发布步骤 + +```bash +# 1. 进入项目根目录(包含 pyproject.toml) +cd packages/markitdown-glmocr + +# 2. 构建分发包(生成 dist/ 目录下的 .tar.gz 和 .whl 文件) +python -m build + +# 3. 检查包的元数据和内容 +twine check dist/* + +# 4. 上传到 PyPI(使用环境变量中的 Token 认证) +twine upload dist/* -u __token__ -p "$PyPI_API_Token" +``` + +### 发布到 TestPyPI(测试) + +```bash +# 先上传到 TestPyPI 验证包是否正确 +twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token" + +# 从 TestPyPI 安装验证 +pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr +``` + +### 注意事项 + +- 发布前确保 `pyproject.toml` 中的版本号已更新 +- 同一版本号不能重复上传,如需修正必须 bump 版本号 +- `PyPI_API_Token` 环境变量切勿硬编码到脚本或提交到代码仓库 + ## 许可证 MIT \ No newline at end of file diff --git a/packages/markitdown-paddleocr/README.md b/packages/markitdown-paddleocr/README.md index 4685a343f..2f4c49a13 100644 --- a/packages/markitdown-paddleocr/README.md +++ b/packages/markitdown-paddleocr/README.md @@ -152,6 +152,54 @@ PaddleOcrConverter.convert() - `Pillow>=9.0.0` - 图像处理 - `requests>=2.28.0` - HTTP 请求 +## 发布到 PyPI + +### 前置条件 + +- 确保已安装 `build` 和 `twine`: + +```bash +pip install build twine +``` + +- 确保环境变量 `PyPI_API_Token` 已设置为你的 PyPI API Token: + +```bash +export PyPI_API_Token="pypi-..." +``` + +### 发布步骤 + +```bash +# 1. 进入项目根目录(包含 pyproject.toml) +cd packages/markitdown-paddleocr + +# 2. 构建分发包(生成 dist/ 目录下的 .tar.gz 和 .whl 文件) +python -m build + +# 3. 检查包的元数据和内容 +twine check dist/* + +# 4. 
上传到 PyPI(使用环境变量中的 Token 认证) +twine upload dist/* -u __token__ -p "$PyPI_API_Token" +``` + +### 发布到 TestPyPI(测试) + +```bash +# 先上传到 TestPyPI 验证包是否正确 +twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token" + +# 从 TestPyPI 安装验证 +pip install --index-url https://test.pypi.org/simple/ markitdown-paddleocr +``` + +### 注意事项 + +- 发布前确保 `pyproject.toml` 中的版本号已更新 +- 同一版本号不能重复上传,如需修正必须 bump 版本号 +- `PyPI_API_Token` 环境变量切勿硬编码到脚本或提交到代码仓库 + ## 许可证 MIT From 7ba7e585c543d4f1ec7becd21348c5bb4a2a32ad Mon Sep 17 00:00:00 2001 From: hankl Date: Thu, 21 May 2026 13:34:35 +0800 Subject: [PATCH 09/15] =?UTF-8?q?=E4=BC=98=E5=85=88=E9=80=89=E6=8B=A9glmoc?= =?UTF-8?q?r=EF=BC=8Cglmocr=20=E5=A4=B1=E8=B4=A5=EF=BC=88=E6=8A=9B?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=EF=BC=89=E2=86=92=20PaddleOcrConverter=20?= =?UTF-8?q?=E5=B0=9D=E8=AF=95=20=E2=86=92=20=E5=86=85=E7=BD=AE=20PdfConver?= =?UTF-8?q?ter=20=E5=85=9C=E5=BA=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/markitdown_glmocr/_converter.py | 202 +++++++++++------- .../src/markitdown_glmocr/_plugin.py | 31 ++- .../src/markitdown_paddleocr/_converter.py | 141 +++++++++--- .../src/markitdown_paddleocr/_plugin.py | 29 ++- .../tests/test_converter.py | 34 +-- .../markitdown/src/markitdown/__main__.py | 23 +- 6 files changed, 324 insertions(+), 136 deletions(-) diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index cafee2ec0..afa7500d4 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -1,11 +1,15 @@ """GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion.""" import io +import logging import sys from typing import Any, BinaryIO, Optional from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo -from markitdown._exceptions import 
MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from markitdown._exceptions import ( + MISSING_DEPENDENCY_MESSAGE, + MissingDependencyException, +) from ._config import GlmOcrConfig @@ -37,10 +41,13 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"] +logger = logging.getLogger(__name__) + + class GlmOcrConverter(DocumentConverter): """ Intelligent PDF/Image converter using glmocr SDK. - + Features: - Auto-detect page content type (plain text vs images/tables) - Plain text pages use pdfplumber/pdfminer (fast, free) @@ -71,19 +78,21 @@ def __init__( raise ImportError( "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]" ) - + # Use config if provided if config: self.api_key = api_key or config.api_key self.timeout = timeout if timeout != 1800 else config.timeout - self.enable_layout = enable_layout if enable_layout else config.enable_layout + self.enable_layout = ( + enable_layout if enable_layout else config.enable_layout + ) self.force_ai = force_ai or config.force_ai else: self.api_key = api_key self.timeout = timeout self.enable_layout = enable_layout self.force_ai = force_ai - + # Lazy init GlmOcr instance self._glmocr: Optional[GlmOcr] = None @@ -127,12 +136,12 @@ def convert( extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[1].with_traceback( - _dependency_exc_info[2] - ) + ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) extension = (stream_info.extension or "").lower() + logger.info("GlmOcrConverter: 开始转换, 文件类型=%s", extension) + # Image files: use glmocr directly if extension in (".jpg", ".jpeg", ".png"): return self._convert_image(file_stream, extension) @@ -140,61 +149,77 @@ def convert( # PDF files: use hybrid approach return self._convert_pdf(file_stream) - def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult: + def _convert_image( + self, file_stream: BinaryIO, extension: str = ".png" + ) -> DocumentConverterResult: """Convert 
image file using glmocr SDK.""" img_bytes = file_stream.read() + logger.info("GlmOcrConverter: 开始 OCR 识别图片, 格式=%s", extension) try: result = self._get_glmocr().parse(img_bytes) - - # Check for errors - d = result.to_dict() - if "error" in d: - return DocumentConverterResult(markdown="") - - return DocumentConverterResult( - markdown=result.markdown_result or "" - ) except Exception as e: - return DocumentConverterResult( - markdown=f"" + logger.error( + "GlmOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, e + ) + raise + + # Check for errors + d = result.to_dict() + if "error" in d: + logger.error( + "GlmOcrConverter: 图片 OCR 返回错误, 格式=%s, 错误=%s", + extension, + d["error"], + ) + raise RuntimeError( + f"GlmOcrConverter: glmocr SDK returned error: {d['error']}" ) + markdown = result.markdown_result or "" + logger.info("GlmOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(markdown)) + return DocumentConverterResult(markdown=markdown) + def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: pdf_stream = io.BytesIO(file_stream.read()) markdown_parts = [] - try: - with pdfplumber.open(pdf_stream) as pdf: - for page_num, page in enumerate(pdf.pages): - # Analyze page type - page_type = self._analyze_page(page) - - # Choose processing method - if self.force_ai or page_type != "plain_text": - # Complex content: use glmocr - markdown = self._convert_with_glmocr(page, page_num) - else: - # Plain text: use pdfplumber - markdown = self._extract_text_with_tables(page) - - if markdown.strip(): - markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") - - page.close() - - markdown = "\n\n".join(markdown_parts).strip() - - except Exception: - # Fallback to pdfminer - pdf_stream.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_stream) or "" - - # Final fallback - if not markdown: - pdf_stream.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_stream) or "" - + with pdfplumber.open(pdf_stream) as pdf: + total_pages = len(pdf.pages) + 
logger.info("GlmOcrConverter: 开始处理 PDF, 总页数=%d", total_pages) + + for page_num, page in enumerate(pdf.pages): + # Analyze page type + page_type = self._analyze_page(page) + + # Choose processing method + if self.force_ai or page_type != "plain_text": + # Complex content: use glmocr + # Let exceptions propagate so the framework can try the next converter + logger.info( + "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 glmocr OCR", + page_num + 1, + total_pages, + page_type, + ) + markdown = self._convert_with_glmocr(page, page_num) + else: + # Plain text: use pdfplumber + logger.info( + "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber", + page_num + 1, + total_pages, + page_type, + ) + markdown = self._extract_text_with_tables(page) + + if markdown.strip(): + markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") + + page.close() + + markdown = "\n\n".join(markdown_parts).strip() + logger.info("GlmOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown)) return DocumentConverterResult(markdown=markdown) def _analyze_page(self, page: Any) -> str: @@ -202,36 +227,56 @@ def _analyze_page(self, page: Any) -> str: # Check for images if hasattr(page, "images") and page.images: return "complex" - + # Check for tables tables = page.find_tables() if tables: return "complex" - + # Check for graphics/curves if hasattr(page, "curves") and page.curves: return "complex" - + return "plain_text" def _convert_with_glmocr(self, page: Any, page_num: int) -> str: - """Convert page using glmocr SDK.""" + """Convert page using glmocr SDK. + + Raises RuntimeError on OCR failure so the framework can try the next converter. 
+ """ + # Render page to image + img = page.to_image(resolution=150) + img_bytes = io.BytesIO() + img.save(img_bytes, format="PNG") + + logger.info("GlmOcrConverter: glmocr SDK 开始识别第 %d 页", page_num + 1) try: - # Render page to image - img = page.to_image(resolution=150) - img_bytes = io.BytesIO() - img.save(img_bytes, format="PNG") result = self._get_glmocr().parse(img_bytes.getvalue()) - - # Check for errors - d = result.to_dict() - if "error" in d: - return self._extract_text_with_tables(page) - - return result.markdown_result or "" - - except Exception: - return self._extract_text_with_tables(page) + except Exception as e: + logger.error( + "GlmOcrConverter: glmocr SDK 第 %d 页识别异常, 错误=%s", page_num + 1, e + ) + raise + + # Check for errors + d = result.to_dict() + if "error" in d: + logger.error( + "GlmOcrConverter: glmocr SDK 第 %d 页返回错误, 错误=%s", + page_num + 1, + d["error"], + ) + raise RuntimeError( + f"GlmOcrConverter: glmocr SDK returned error on page {page_num + 1}: {d['error']}" + ) + + markdown = result.markdown_result or "" + logger.info( + "GlmOcrConverter: glmocr SDK 第 %d 页识别完成, 输出长度=%d", + page_num + 1, + len(markdown), + ) + return markdown def _extract_text_with_tables(self, page: Any) -> str: """Extract text and tables from page.""" @@ -280,9 +325,14 @@ def _table_to_markdown(self, table: list[list[str]]) -> str: lines = [] for row_idx, row in enumerate(table): padded_row = row + [""] * (len(col_widths) - len(row)) - line = "| " + " | ".join( - str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) - ) + " |" + line = ( + "| " + + " | ".join( + str(cell).ljust(width) + for cell, width in zip(padded_row, col_widths) + ) + + " |" + ) lines.append(line) if row_idx == 0: @@ -290,15 +340,15 @@ def _table_to_markdown(self, table: list[list[str]]) -> str: lines.append(sep) return "\n".join(lines) - + def close(self): """Close the GlmOcr instance.""" if self._glmocr: self._glmocr.close() self._glmocr = None - + def __enter__(self): return 
self - + def __exit__(self, exc_type, exc_val, exc_tb): - self.close() \ No newline at end of file + self.close() diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py index a940acf7d..5963dd43c 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py @@ -1,33 +1,46 @@ """Plugin registration for markitdown-glmocr.""" +import logging from typing import Any + from markitdown import MarkItDown from ._converter import GlmOcrConverter - __plugin_interface_version__ = 1 +logger = logging.getLogger(__name__) + def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: """ Register markitdown-glmocr converter. - + Config sources (priority high to low): 1. kwargs parameters 2. Environment variables (ZHIPU_API_KEY) 3. .env file 4. Built-in defaults """ + logger.info("markitdown-glmocr: 开始注册插件") + # Register converter - PRIORITY_GLMOCR = -1.0 - - markitdown.register_converter( - GlmOcrConverter( + # Priority -2.0: higher priority than PaddleOcrConverter (-1.0), + # so glmocr is tried first and paddleocr serves as fallback. 
+ PRIORITY_GLMOCR = -2.0 + + try: + converter = GlmOcrConverter( api_key=kwargs.get("api_key"), timeout=kwargs.get("timeout", 1800), enable_layout=kwargs.get("enable_layout", False), force_ai=kwargs.get("force_ai", False), - ), - priority=PRIORITY_GLMOCR, - ) \ No newline at end of file + ) + markitdown.register_converter( + converter, + priority=PRIORITY_GLMOCR, + ) + logger.info("markitdown-glmocr: 插件注册成功, priority=%.1f", PRIORITY_GLMOCR) + except Exception as e: + logger.error("markitdown-glmocr: 插件注册失败, 错误=%s", e) + raise diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py index 6d8ae5e63..627b2dfd5 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py @@ -1,11 +1,15 @@ """PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API.""" import io +import logging import sys from typing import Any, BinaryIO, Optional from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo -from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from markitdown._exceptions import ( + MISSING_DEPENDENCY_MESSAGE, + MissingDependencyException, +) from ._config import PaddleOcrConfig from ._paddle_client import PaddleClient @@ -30,6 +34,9 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"] +logger = logging.getLogger(__name__) + + class PaddleOcrConverter(DocumentConverter): """Intelligent PDF/Image converter using PaddleOCR cloud API. 
@@ -70,12 +77,20 @@ def __init__( if config: self.token = token or config.token self.model = model if model != "PaddleOCR-VL-1.5" else config.model - self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval - self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout + self.poll_interval = ( + poll_interval if poll_interval != 2.0 else config.poll_interval + ) + self.poll_timeout = ( + poll_timeout if poll_timeout != 300.0 else config.poll_timeout + ) self.force_ai = force_ai or config.force_ai - self.use_doc_orientation_classify = use_doc_orientation_classify or config.use_doc_orientation_classify + self.use_doc_orientation_classify = ( + use_doc_orientation_classify or config.use_doc_orientation_classify + ) self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping - self.use_chart_recognition = use_chart_recognition or config.use_chart_recognition + self.use_chart_recognition = ( + use_chart_recognition or config.use_chart_recognition + ) else: self.token = token self.model = model @@ -105,12 +120,25 @@ def _get_client(self) -> PaddleClient: self._client = PaddleClient(config=config) return self._client + def _has_token(self) -> bool: + """Check if a valid token is available.""" + if self.token: + return True + import os + + return bool(os.environ.get("BAIDU_PADDLE_TOKEN", "")) + def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, ) -> bool: + # Without a token, PaddleOCR API cannot work — decline so other + # converters (e.g. GlmOcrConverter) get a chance. 
+ if not self._has_token(): + return False + mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() @@ -136,12 +164,12 @@ def convert( extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[1].with_traceback( - _dependency_exc_info[2] - ) + ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) extension = (stream_info.extension or "").lower() + logger.info("PaddleOcrConverter: 开始转换, 文件类型=%s", extension) + # Image files: use PaddleOCR directly if extension in (".jpg", ".jpeg", ".png"): return self._convert_image(file_stream, extension) @@ -149,36 +177,68 @@ def convert( # PDF files: use hybrid approach return self._convert_pdf(file_stream) - def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult: + def _convert_image( + self, file_stream: BinaryIO, extension: str = ".png" + ) -> DocumentConverterResult: """Convert image file using PaddleOCR API.""" img_bytes = file_stream.read() filename = f"image{extension}" + logger.info("PaddleOcrConverter: 开始 OCR 识别图片, 格式=%s", extension) try: markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename) - return DocumentConverterResult(markdown=markdown) except Exception as e: - return DocumentConverterResult( - markdown=f"" + logger.error( + "PaddleOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, e ) + raise + + logger.info("PaddleOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(markdown)) + return DocumentConverterResult(markdown=markdown) def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: """Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages).""" pdf_stream = io.BytesIO(file_stream.read()) markdown_parts = [] + ocr_failed = False try: with pdfplumber.open(pdf_stream) as pdf: + total_pages = len(pdf.pages) + logger.info("PaddleOcrConverter: 开始处理 PDF, 总页数=%d", total_pages) + for page_num, page in enumerate(pdf.pages): # Analyze page type page_type = 
self._analyze_page(page) # Choose processing method if self.force_ai or page_type != "plain_text": - # Complex content: use PaddleOCR - markdown = self._convert_with_paddleocr(page, page_num) + # Complex content: try PaddleOCR, fallback to pdfplumber on failure + logger.info( + "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 PaddleOCR", + page_num + 1, + total_pages, + page_type, + ) + try: + markdown = self._convert_with_paddleocr(page, page_num) + except Exception as e: + logger.warning( + "PaddleOcrConverter: 第 %d/%d 页 OCR 失败, 降级为 pdfplumber, 错误=%s", + page_num + 1, + total_pages, + e, + ) + ocr_failed = True + markdown = self._extract_text_with_tables(page) else: # Plain text: use pdfplumber + logger.info( + "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber", + page_num + 1, + total_pages, + page_type, + ) markdown = self._extract_text_with_tables(page) if markdown.strip(): @@ -188,7 +248,10 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: markdown = "\n\n".join(markdown_parts).strip() - except Exception: + except Exception as e: + logger.error( + "PaddleOcrConverter: PDF 处理异常, 降级为 pdfminer, 错误=%s", e + ) # Fallback to pdfminer pdf_stream.seek(0) markdown = pdfminer.high_level.extract_text(pdf_stream) or "" @@ -198,6 +261,15 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: pdf_stream.seek(0) markdown = pdfminer.high_level.extract_text(pdf_stream) or "" + # If OCR failed and result is empty, raise so the framework can try + # the next converter (e.g. GlmOcrConverter) instead of returning empty. 
+ if ocr_failed and not markdown.strip(): + logger.error("PaddleOcrConverter: OCR 失败且所有兜底结果为空, 抛出异常") + raise RuntimeError( + "PaddleOcrConverter: OCR failed and all fallbacks returned empty" + ) + + logger.info("PaddleOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown)) return DocumentConverterResult(markdown=markdown) def _analyze_page(self, page: Any) -> str: @@ -219,21 +291,31 @@ def _analyze_page(self, page: Any) -> str: def _convert_with_paddleocr(self, page: Any, page_num: int) -> str: """Convert page using PaddleOCR API.""" - try: - # Render page to image - img = page.to_image(resolution=150) - img_bytes = io.BytesIO() - img.save(img_bytes, format="PNG") + # Render page to image + img = page.to_image(resolution=150) + img_bytes = io.BytesIO() + img.save(img_bytes, format="PNG") + logger.info("PaddleOcrConverter: PaddleOCR API 开始识别第 %d 页", page_num + 1) + try: markdown = self._get_client().ocr( file_bytes=img_bytes.getvalue(), filename=f"page_{page_num + 1}.png", ) - return markdown + except Exception as e: + logger.error( + "PaddleOcrConverter: PaddleOCR API 第 %d 页识别异常, 错误=%s", + page_num + 1, + e, + ) + raise - except Exception: - # Fallback to pdfplumber text extraction - return self._extract_text_with_tables(page) + logger.info( + "PaddleOcrConverter: PaddleOCR API 第 %d 页识别完成, 输出长度=%d", + page_num + 1, + len(markdown), + ) + return markdown def _extract_text_with_tables(self, page: Any) -> str: """Extract text and tables from page.""" @@ -282,9 +364,14 @@ def _table_to_markdown(self, table: list[list[str]]) -> str: lines = [] for row_idx, row in enumerate(table): padded_row = row + [""] * (len(col_widths) - len(row)) - line = "| " + " | ".join( - str(cell).ljust(width) for cell, width in zip(padded_row, col_widths) - ) + " |" + line = ( + "| " + + " | ".join( + str(cell).ljust(width) + for cell, width in zip(padded_row, col_widths) + ) + + " |" + ) lines.append(line) if row_idx == 0: diff --git 
a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py index 09ae96e6d..d24916ac5 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py @@ -1,13 +1,16 @@ """Plugin registration for markitdown-paddleocr.""" +import logging from typing import Any + from markitdown import MarkItDown from ._converter import PaddleOcrConverter - __plugin_interface_version__ = 1 +logger = logging.getLogger(__name__) + def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: """Register markitdown-paddleocr converter. @@ -17,19 +20,31 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: 2. Environment variables (BAIDU_PADDLE_TOKEN) 3. Built-in defaults """ + logger.info("markitdown-paddleocr: 开始注册插件") + # Register converter with higher priority than default PDF converter PRIORITY_PADDLEOCR = -1.0 - markitdown.register_converter( - PaddleOcrConverter( + try: + converter = PaddleOcrConverter( token=kwargs.get("token"), model=kwargs.get("model", "PaddleOCR-VL-1.5"), poll_interval=kwargs.get("poll_interval", 2.0), poll_timeout=kwargs.get("poll_timeout", 300.0), force_ai=kwargs.get("force_ai", False), - use_doc_orientation_classify=kwargs.get("use_doc_orientation_classify", False), + use_doc_orientation_classify=kwargs.get( + "use_doc_orientation_classify", False + ), use_doc_unwarping=kwargs.get("use_doc_unwarping", False), use_chart_recognition=kwargs.get("use_chart_recognition", False), - ), - priority=PRIORITY_PADDLEOCR, - ) + ) + markitdown.register_converter( + converter, + priority=PRIORITY_PADDLEOCR, + ) + logger.info( + "markitdown-paddleocr: 插件注册成功, priority=%.1f", PRIORITY_PADDLEOCR + ) + except Exception as e: + logger.error("markitdown-paddleocr: 插件注册失败, 错误=%s", e) + raise diff --git a/packages/markitdown-paddleocr/tests/test_converter.py 
b/packages/markitdown-paddleocr/tests/test_converter.py index aaca74c8c..0e569dc94 100644 --- a/packages/markitdown-paddleocr/tests/test_converter.py +++ b/packages/markitdown-paddleocr/tests/test_converter.py @@ -10,28 +10,35 @@ class TestPaddleOcrConverterAccepts: """Accepts method tests.""" - def test_accepts_pdf_extension(self): - """Accept .pdf extension.""" - converter = PaddleOcrConverter() + def test_accepts_pdf_extension_with_token(self): + """Accept .pdf extension when token is available.""" + converter = PaddleOcrConverter(token="test-token") stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=".pdf", mimetype=None) assert converter.accepts(stream, stream_info) is True - def test_accepts_pdf_mimetype(self): - """Accept PDF MIME type.""" - converter = PaddleOcrConverter() + def test_accepts_pdf_mimetype_with_token(self): + """Accept PDF MIME type when token is available.""" + converter = PaddleOcrConverter(token="test-token") stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=None, mimetype="application/pdf") assert converter.accepts(stream, stream_info) is True - def test_accepts_image_extensions(self): - """Accept image extensions.""" - converter = PaddleOcrConverter() + def test_accepts_image_extensions_with_token(self): + """Accept image extensions when token is available.""" + converter = PaddleOcrConverter(token="test-token") for ext in [".jpg", ".jpeg", ".png"]: stream = io.BytesIO(b"fake") stream_info = MagicMock(extension=ext, mimetype=None) assert converter.accepts(stream, stream_info) is True + def test_rejects_without_token(self): + """Reject all files when no token is available.""" + converter = PaddleOcrConverter() # no token + stream = io.BytesIO(b"%PDF-1.4") + stream_info = MagicMock(extension=".pdf", mimetype="application/pdf") + assert converter.accepts(stream, stream_info) is False + def test_rejects_non_supported(self): """Reject non-supported files.""" converter = PaddleOcrConverter() @@ -92,8 +99,8 @@ 
def test_convert_image_success(self): assert "# Image Title" in result.markdown mock_client.ocr.assert_called_once() - def test_convert_image_error(self): - """Convert image with PaddleOCR error returns comment.""" + def test_convert_image_error_raises(self): + """Convert image with PaddleOCR error raises exception (for framework fallback).""" converter = PaddleOcrConverter(token="test-token") mock_client = MagicMock() @@ -102,9 +109,8 @@ def test_convert_image_error(self): stream = io.BytesIO(b"fake-image") stream_info = MagicMock(extension=".png", mimetype="image/png") - result = converter.convert(stream, stream_info) - - assert "Error converting image" in result.markdown + with pytest.raises(Exception, match="API Error"): + converter.convert(stream, stream_info) class TestPaddleOcrConverterPdf: diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..934b3df72 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -2,12 +2,14 @@ # # SPDX-License-Identifier: MIT import argparse -import sys import codecs -from textwrap import dedent +import logging +import sys from importlib.metadata import entry_points +from textwrap import dedent + from .__about__ import __version__ -from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult +from ._markitdown import DocumentConverterResult, MarkItDown, StreamInfo def main(): @@ -104,6 +106,14 @@ def main(): help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", ) + parser.add_argument( + "--log-level", + type=str, + default="WARNING", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Set the logging level (default: WARNING). 
Use INFO or DEBUG to see plugin logs.", + ) + parser.add_argument( "--keep-data-uris", action="store_true", @@ -113,6 +123,13 @@ def main(): parser.add_argument("filename", nargs="?") args = parser.parse_args() + # Configure logging + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s %(levelname)-8s %(name)s: %(message)s", + datefmt="%H:%M:%S", + ) + # Parse the extension hint extension_hint = args.extension if extension_hint is not None: From e88628cd64915376e832e31cb07472ecdfdec126 Mon Sep 17 00:00:00 2001 From: hankl Date: Thu, 21 May 2026 14:59:08 +0800 Subject: [PATCH 10/15] =?UTF-8?q?=E6=89=B9=E9=87=8Focr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/markitdown_glmocr/_config.py | 20 +- .../src/markitdown_glmocr/_converter.py | 229 ++++++++- .../tests/test_ai_service.py | 103 ----- .../markitdown-glmocr/tests/test_analyzer.py | 131 ------ .../markitdown-glmocr/tests/test_converter.py | 146 ++---- .../tests/test_scan_detection.py | 437 ++++++++++++++++++ .../src/markitdown_paddleocr/_config.py | 18 + .../src/markitdown_paddleocr/_converter.py | 211 ++++++++- .../tests/test_scan_detection.py | 430 +++++++++++++++++ 9 files changed, 1355 insertions(+), 370 deletions(-) delete mode 100644 packages/markitdown-glmocr/tests/test_ai_service.py delete mode 100644 packages/markitdown-glmocr/tests/test_analyzer.py create mode 100644 packages/markitdown-glmocr/tests/test_scan_detection.py create mode 100644 packages/markitdown-paddleocr/tests/test_scan_detection.py diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py index d1122524b..6f2531fb8 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_config.py @@ -1,6 +1,19 @@ """Configuration for markitdown-glmocr.""" from dataclasses import dataclass, field +from enum import 
Enum + + +class ScanDetectionMode(str, Enum): + """扫描检测模式。 + + - PAGE_BY_PAGE: 逐页分析,当前默认行为 + - FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR + - SAMPLING: 抽样前N页,多数是扫描件则全部OCR + """ + PAGE_BY_PAGE = "page_by_page" + FIRST_PAGE_HINT = "first_page_hint" + SAMPLING = "sampling" @dataclass @@ -22,4 +35,9 @@ class GlmOcrConfig: enable_layout: bool = False # Processing strategy - force_ai: bool = False \ No newline at end of file + force_ai: bool = False + + # Scan detection mode for optimization + scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING + scan_sample_pages: int = 3 # Number of pages to sample in SAMPLING mode + scan_text_threshold: int = 50 # Min text length to consider page as non-scanned \ No newline at end of file diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index afa7500d4..19fa0b90f 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -11,7 +11,7 @@ MissingDependencyException, ) -from ._config import GlmOcrConfig +from ._config import GlmOcrConfig, ScanDetectionMode # Import dependencies _dependency_exc_info = None @@ -62,6 +62,9 @@ def __init__( timeout: int = 1800, enable_layout: bool = False, force_ai: bool = False, + scan_detection_mode: Optional[ScanDetectionMode] = None, + scan_sample_pages: Optional[int] = None, + scan_text_threshold: Optional[int] = None, config: Optional[GlmOcrConfig] = None, ): """ @@ -72,6 +75,9 @@ def __init__( timeout: Request timeout in seconds (default: 1800) enable_layout: Enable layout detection (default: False) force_ai: Force all pages to use AI (default: False) + scan_detection_mode: 扫描检测模式,优化扫描PDF处理 + scan_sample_pages: SAMPLING模式下抽样页数 (default: 3) + scan_text_threshold: 判定为扫描件的最小文本长度阈值 (default: 50) config: Optional GlmOcrConfig instance """ if glmocr is None: @@ -87,11 +93,35 @@ def __init__( enable_layout if enable_layout 
else config.enable_layout ) self.force_ai = force_ai or config.force_ai + self.scan_detection_mode = ( + scan_detection_mode + if scan_detection_mode is not None + else config.scan_detection_mode + ) + self.scan_sample_pages = ( + scan_sample_pages + if scan_sample_pages is not None + else config.scan_sample_pages + ) + self.scan_text_threshold = ( + scan_text_threshold + if scan_text_threshold is not None + else config.scan_text_threshold + ) else: self.api_key = api_key self.timeout = timeout self.enable_layout = enable_layout self.force_ai = force_ai + self.scan_detection_mode = ( + scan_detection_mode + if scan_detection_mode is not None + else ScanDetectionMode.SAMPLING + ) + self.scan_sample_pages = scan_sample_pages if scan_sample_pages is not None else 3 + self.scan_text_threshold = ( + scan_text_threshold if scan_text_threshold is not None else 50 + ) # Lazy init GlmOcr instance self._glmocr: Optional[GlmOcr] = None @@ -182,36 +212,84 @@ def _convert_image( def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: pdf_stream = io.BytesIO(file_stream.read()) + pdf_bytes = pdf_stream.getvalue() # Keep original bytes for batch OCR markdown_parts = [] with pdfplumber.open(pdf_stream) as pdf: total_pages = len(pdf.pages) logger.info("GlmOcrConverter: 开始处理 PDF, 总页数=%d", total_pages) - for page_num, page in enumerate(pdf.pages): - # Analyze page type - page_type = self._analyze_page(page) + # Optimization: detect if entire PDF is scanned + all_scanned = self._detect_all_scanned(pdf) + + if all_scanned and not self.force_ai: + # Batch mode: upload entire PDF to glmocr SDK (single API call) + logger.info( + "GlmOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d", + total_pages, + ) + try: + markdown = self._convert_pdf_batch(pdf_bytes) + if markdown.strip(): + logger.info( + "GlmOcrConverter: 批量OCR完成, 输出长度=%d", + len(markdown), + ) + return DocumentConverterResult(markdown=markdown) + except Exception as e: + logger.warning( + "GlmOcrConverter: 批量OCR失败, 降级为逐页处理, 
错误=%s", + e, + ) + # Fall through to per-page processing + # Per-page processing (PAGE_BY_PAGE mode or batch failed) + for page_num, page in enumerate(pdf.pages): # Choose processing method - if self.force_ai or page_type != "plain_text": - # Complex content: use glmocr - # Let exceptions propagate so the framework can try the next converter + if self.force_ai or all_scanned: + # All scanned (after batch failed) or force_ai logger.info( - "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 glmocr OCR", + "GlmOcrConverter: 第 %d/%d 页, 使用 glmocr OCR", page_num + 1, total_pages, - page_type, ) - markdown = self._convert_with_glmocr(page, page_num) + try: + markdown = self._convert_with_glmocr(page, page_num) + except Exception as e: + logger.error( + "GlmOcrConverter: 第 %d/%d 页识别异常, 错误=%s", + page_num + 1, + e, + ) + raise else: - # Plain text: use pdfplumber - logger.info( - "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber", - page_num + 1, - total_pages, - page_type, - ) - markdown = self._extract_text_with_tables(page) + # Per-page analysis (PAGE_BY_PAGE mode or non-scanned doc) + page_type = self._analyze_page(page) + + if page_type != "plain_text": + logger.info( + "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 glmocr OCR", + page_num + 1, + total_pages, + page_type, + ) + try: + markdown = self._convert_with_glmocr(page, page_num) + except Exception as e: + logger.error( + "GlmOcrConverter: 第 %d/%d 页识别异常, 错误=%s", + page_num + 1, + e, + ) + raise + else: + logger.info( + "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber", + page_num + 1, + total_pages, + page_type, + ) + markdown = self._extract_text_with_tables(page) if markdown.strip(): markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") @@ -222,6 +300,34 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: logger.info("GlmOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown)) return DocumentConverterResult(markdown=markdown) + def _convert_pdf_batch(self, pdf_bytes: bytes) -> str: + """Convert entire 
PDF in a single API call. + + More efficient for scanned PDFs: one API call instead of N calls for N pages. + + Args: + pdf_bytes: Raw PDF file content. + + Returns: + Markdown text from all pages. + """ + logger.info("GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes)) + result = self._get_glmocr().parse(pdf_bytes) + + # Check for errors + d = result.to_dict() + if "error" in d: + logger.error( + "GlmOcrConverter: 批量OCR返回错误, 错误=%s", + d["error"], + ) + raise RuntimeError( + f"GlmOcrConverter: glmocr SDK batch OCR error: {d['error']}" + ) + + markdown = result.markdown_result or "" + return markdown + def _analyze_page(self, page: Any) -> str: """Analyze page content type.""" # Check for images @@ -239,6 +345,93 @@ def _analyze_page(self, page: Any) -> str: return "plain_text" + def _is_scanned_page(self, page: Any) -> bool: + """Check if a page is likely a scanned image. + + A page is considered scanned if: + 1. It contains images, AND + 2. It has very little extractable text (below threshold) + + Args: + page: pdfplumber page object + + Returns: + True if the page appears to be a scanned image + """ + # Must have images to be a scan + has_images = hasattr(page, "images") and bool(page.images) + if not has_images: + return False + + # Check extractable text length + try: + text = page.extract_text() or "" + text_len = len(text.strip()) + # If there's substantial text, it might be a mixed page or + # a digital PDF with embedded images + if text_len >= self.scan_text_threshold: + return False + except Exception: + # If text extraction fails, assume it's a scan + return True + + return True + + def _detect_all_scanned(self, pdf: Any) -> bool: + """Detect if entire PDF is scanned based on scan_detection_mode. + + Optimization: When first few pages are scanned, we can assume + all pages are scanned and skip per-page analysis. 
+ + Args: + pdf: pdfplumber PDF object + + Returns: + True if entire PDF should be treated as scanned + """ + if self.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE: + return False + + total_pages = len(pdf.pages) + if total_pages == 0: + return False + + if self.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT: + # Check only first page + first_page = pdf.pages[0] + is_scanned = self._is_scanned_page(first_page) + first_page.close() + if is_scanned: + logger.info( + "GlmOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR" + ) + return is_scanned + + if self.scan_detection_mode == ScanDetectionMode.SAMPLING: + # Sample first N pages + sample_count = min(self.scan_sample_pages, total_pages) + scanned_count = 0 + + for i in range(sample_count): + page = pdf.pages[i] + if self._is_scanned_page(page): + scanned_count += 1 + + # If majority of sampled pages are scanned, treat all as scanned + majority_threshold = sample_count // 2 + 1 + all_scanned = scanned_count >= majority_threshold + + if all_scanned: + logger.info( + "GlmOcrConverter: 抽样检测 %d/%d 页为扫描件, 模式=SAMPLING, 全文档使用OCR", + scanned_count, + sample_count, + ) + + return all_scanned + + return False + def _convert_with_glmocr(self, page: Any, page_num: int) -> str: """Convert page using glmocr SDK. 
diff --git a/packages/markitdown-glmocr/tests/test_ai_service.py b/packages/markitdown-glmocr/tests/test_ai_service.py deleted file mode 100644 index dbbe06d50..000000000 --- a/packages/markitdown-glmocr/tests/test_ai_service.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Tests for AI service with zai-sdk.""" - -import io -import pytest -from unittest.mock import MagicMock, patch - -from markitdown_glmocr._ai_service import AIService, AIResult -from markitdown_glmocr._config import GlmOcrConfig - - -class TestAIService: - """AI Service tests with zai-sdk.""" - - def test_missing_zai_sdk_raises_error(self): - """Missing zai-sdk raises error.""" - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", None): - with pytest.raises(ImportError, match="zai-sdk is required"): - AIService(api_key="test") - - def test_missing_api_key_raises_error(self): - """Missing API key raises error.""" - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", MagicMock()): - with pytest.raises(ValueError, match="API key is required"): - AIService(api_key="") - - def test_successful_conversion(self): - """Successful conversion.""" - # Mock ZhipuAiClient - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.md_results = "
Test
" - mock_response.layout_details = [] - mock_client.layout_parsing.create.return_value = mock_response - - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): - service = AIService(api_key="test-api-key") - result = service.image_to_markdown(io.BytesIO(b"fake-image")) - - assert result.success is True - assert "Test" in result.text - - def test_html_table_conversion(self): - """HTML table to Markdown conversion.""" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.md_results = '
AB
12
' - mock_response.layout_details = [] - mock_client.layout_parsing.create.return_value = mock_response - - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): - service = AIService(api_key="test-api-key") - result = service.image_to_markdown(io.BytesIO(b"fake-image")) - - assert result.success is True - assert "| A | B |" in result.text - assert "|---|---|" in result.text - assert "| 1 | 2 |" in result.text - - def test_empty_result(self): - """Empty result handling.""" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.md_results = "" - mock_response.layout_details = [] - mock_client.layout_parsing.create.return_value = mock_response - - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): - service = AIService(api_key="test-api-key") - result = service.image_to_markdown(io.BytesIO(b"fake-image")) - - assert result.success is True - assert result.text == "" - - def test_error_handling(self): - """Error handling.""" - mock_client = MagicMock() - mock_client.layout_parsing.create.side_effect = Exception("API Error") - - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): - service = AIService(api_key="test-api-key") - result = service.image_to_markdown(io.BytesIO(b"fake-image")) - - assert result.success is False - assert "API Error" in result.error - - def test_base64_encoding(self): - """Test base64 encoding of image.""" - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.md_results = "test" - mock_response.layout_details = [] - mock_client.layout_parsing.create.return_value = mock_response - - with patch("markitdown_glmocr._ai_service.ZhipuAiClient", return_value=mock_client): - service = AIService(api_key="test-api-key") - result = service.image_to_markdown(io.BytesIO(b"fake-image"), "test.png") - - assert result.success is True - - # Verify data URI was used - call_args = mock_client.layout_parsing.create.call_args - 
file_arg = call_args.kwargs['file'] - assert file_arg.startswith("data:image/png;base64,") \ No newline at end of file diff --git a/packages/markitdown-glmocr/tests/test_analyzer.py b/packages/markitdown-glmocr/tests/test_analyzer.py deleted file mode 100644 index 6841f0b44..000000000 --- a/packages/markitdown-glmocr/tests/test_analyzer.py +++ /dev/null @@ -1,131 +0,0 @@ -"""Tests for page analyzer.""" - -import pytest -from unittest.mock import MagicMock - -from markitdown_glmocr._page_analyzer import ( - PageType, - detect_images, - detect_tables, - analyze_page, -) - - -class TestDetectImages: - """图片检测测试""" - - def test_no_images(self): - """无图片页面""" - page = MagicMock() - page.images = [] - page.objects = {} - - assert detect_images(page) is False - - def test_has_images_via_images_attr(self): - """通过 page.images 检测图片""" - page = MagicMock() - page.images = [MagicMock(x0=0, y0=0, x1=100, y1=100)] - - assert detect_images(page) is True - - def test_has_images_via_objects(self): - """通过 page.objects 检测图片""" - page = MagicMock() - page.images = [] - page.objects = {"image": [MagicMock()]} - - assert detect_images(page) is True - - def test_has_xobject_image(self): - """通过 XObject 检测图片""" - page = MagicMock() - page.images = [] - page.objects = { - "xobject": [{"subtype": "Image"}] - } - - assert detect_images(page) is True - - -class TestDetectTables: - """表格检测测试""" - - def test_no_tables(self): - """无表格页面""" - page = MagicMock() - page.extract_tables.return_value = [] - - assert detect_tables(page) is False - - def test_has_tables_via_extract_tables(self): - """通过 extract_tables 检测表格""" - page = MagicMock() - page.extract_tables.return_value = [ - [["A", "B", "C"], ["1", "2", "3"]] - ] - - assert detect_tables(page) is True - - def test_empty_table_not_detected(self): - """空表格不应被检测""" - page = MagicMock() - page.extract_tables.return_value = [ - [["", "", ""], ["", "", ""]] - ] - - assert detect_tables(page) is False - - def test_has_table_lines(self): - 
"""通过线条检测表格""" - page = MagicMock() - page.extract_tables.return_value = [] - - # 模拟网格线条 - lines = [] - for i in range(5): - # 水平线 - lines.append({"height": 0.5, "width": 100}) - # 垂直线 - lines.append({"height": 100, "width": 0.5}) - - page.objects = {"line": lines} - - assert detect_tables(page) is True - - -class TestAnalyzePage: - """页面分析测试""" - - def test_plain_text_page(self): - """纯文本页面""" - page = MagicMock() - page.images = [] - page.objects = {} - page.extract_tables.return_value = [] - - assert analyze_page(page) == PageType.PLAIN_TEXT - - def test_page_with_images(self): - """仅包含图片""" - page = MagicMock() - page.images = [MagicMock()] - page.extract_tables.return_value = [] - - assert analyze_page(page) == PageType.HAS_IMAGES - - def test_page_with_tables(self): - """仅包含表格""" - page = MagicMock() - page.images = [] - page.extract_tables.return_value = [[["A", "B"]]] - - assert analyze_page(page) == PageType.HAS_TABLES - - def test_complex_page(self): - """同时包含图片和表格""" - page = MagicMock() - page.images = [MagicMock()] - page.extract_tables.return_value = [[["A", "B"]]] - - assert analyze_page(page) == PageType.COMPLEX \ No newline at end of file diff --git a/packages/markitdown-glmocr/tests/test_converter.py b/packages/markitdown-glmocr/tests/test_converter.py index d48c75f2d..d91c7d995 100644 --- a/packages/markitdown-glmocr/tests/test_converter.py +++ b/packages/markitdown-glmocr/tests/test_converter.py @@ -2,43 +2,46 @@ import io import pytest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, patch, PropertyMock -from markitdown_glmocr._converter import GlmOcrPdfConverter -from markitdown_glmocr._ai_service import AIService, AIResult -from markitdown_glmocr._page_analyzer import PageType +from markitdown_glmocr._converter import GlmOcrConverter +from markitdown_glmocr._config import ScanDetectionMode -class TestGlmOcrPdfConverter: +class TestGlmOcrConverter: """Converter tests.""" - def test_accepts_pdf_extension(self): 
+ @patch("markitdown_glmocr._converter.glmocr") + def test_accepts_pdf_extension(self, mock_glmocr): """Accept .pdf extension.""" - converter = GlmOcrPdfConverter() + converter = GlmOcrConverter() stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=".pdf", mimetype=None) assert converter.accepts(stream, stream_info) is True - def test_accepts_pdf_mimetype(self): + @patch("markitdown_glmocr._converter.glmocr") + def test_accepts_pdf_mimetype(self, mock_glmocr): """Accept PDF MIME type.""" - converter = GlmOcrPdfConverter() + converter = GlmOcrConverter() stream = io.BytesIO(b"%PDF-1.4") stream_info = MagicMock(extension=None, mimetype="application/pdf") assert converter.accepts(stream, stream_info) is True - def test_rejects_non_pdf(self): + @patch("markitdown_glmocr._converter.glmocr") + def test_rejects_non_pdf(self, mock_glmocr): """Reject non-PDF files.""" - converter = GlmOcrPdfConverter() + converter = GlmOcrConverter() stream = io.BytesIO(b"not a pdf") stream_info = MagicMock(extension=".txt", mimetype="text/plain") assert converter.accepts(stream, stream_info) is False - def test_table_to_markdown(self): + @patch("markitdown_glmocr._converter.glmocr") + def test_table_to_markdown(self, mock_glmocr): """Table to Markdown conversion.""" - converter = GlmOcrPdfConverter() + converter = GlmOcrConverter() table = [ ["Name", "Age", "City"], ["Alice", "25", "Beijing"], @@ -46,59 +49,28 @@ def test_table_to_markdown(self): ] result = converter._table_to_markdown(table) - + assert "|" in result assert "Name" in result assert "Alice" in result assert "---" in result # Separator - def test_plain_text_page_without_ai(self): + @patch("markitdown_glmocr._converter.glmocr") + def test_plain_text_page_without_ai(self, mock_glmocr): """Plain text page without AI.""" - converter = GlmOcrPdfConverter() + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) # Mock page page = MagicMock() page.images = [] - page.objects = {} - 
page.extract_tables.return_value = [] + page.find_tables.return_value = [] + page.curves = [] page.extract_text.return_value = "Hello World" - page.close = MagicMock() - - # Mock PDF - mock_pdf = MagicMock() - mock_pdf.pages = [page] - - with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: - mock_open.return_value.__enter__.return_value = mock_pdf - - stream = io.BytesIO(b"%PDF-1.4") - result = converter.convert(stream, MagicMock()) - - assert "Hello World" in result.markdown - - def test_complex_page_with_ai(self): - """Complex page with AI.""" - # Mock AI service - ai_service = MagicMock(spec=AIService) - ai_service.image_to_markdown.return_value = AIResult( - success=True, - text="# AI Generated\n\nThis is from AI." - ) - - converter = GlmOcrPdfConverter(ai_service=ai_service) - - # Mock page - page = MagicMock() - page.images = [MagicMock()] page.extract_tables.return_value = [] - page.extract_text.return_value = "Plain text" - page.to_image.return_value.original = MagicMock() page.close = MagicMock() - # Mock image save - img_stream = io.BytesIO() - page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") - # Mock PDF mock_pdf = MagicMock() mock_pdf.pages = [page] @@ -109,64 +81,36 @@ def test_complex_page_with_ai(self): stream = io.BytesIO(b"%PDF-1.4") result = converter.convert(stream, MagicMock()) - # Should call AI - ai_service.image_to_markdown.assert_called_once() - assert "AI Generated" in result.markdown + assert "Hello World" in result.markdown - def test_force_ai_mode(self): + @patch("markitdown_glmocr._converter.glmocr") + def test_force_ai_mode(self, mock_glmocr): """Force AI mode.""" - ai_service = MagicMock(spec=AIService) - ai_service.image_to_markdown.return_value = AIResult( - success=True, - text="AI result" - ) + # Mock glmocr instance + mock_result = MagicMock() + mock_result.markdown_result = "AI result" + mock_result.to_dict.return_value = {} - converter = GlmOcrPdfConverter(ai_service=ai_service, 
force_ai=True) + mock_glmocr_instance = MagicMock() + mock_glmocr_instance.parse.return_value = mock_result + mock_glmocr.GlmOcr.return_value = mock_glmocr_instance + + converter = GlmOcrConverter(force_ai=True) + # Force initialization of the mocked glmocr + converter._get_glmocr = lambda: mock_glmocr_instance # Even plain text page page = MagicMock() page.images = [] - page.objects = {} - page.extract_tables.return_value = [] + page.find_tables.return_value = [] + page.curves = [] page.extract_text.return_value = "Plain text" - page.to_image.return_value.original = MagicMock() - page.close = MagicMock() - - img_stream = io.BytesIO() - page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") - - mock_pdf = MagicMock() - mock_pdf.pages = [page] - - with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: - mock_open.return_value.__enter__.return_value = mock_pdf - - stream = io.BytesIO(b"%PDF-1.4") - result = converter.convert(stream, MagicMock()) - - # Should call AI (because force_ai=True) - ai_service.image_to_markdown.assert_called_once() - - def test_fallback_on_ai_failure(self): - """Fallback on AI failure.""" - ai_service = MagicMock(spec=AIService) - ai_service.image_to_markdown.return_value = AIResult( - success=False, - text="", - error="API error" - ) - - converter = GlmOcrPdfConverter(ai_service=ai_service) - - page = MagicMock() - page.images = [MagicMock()] page.extract_tables.return_value = [] - page.extract_text.return_value = "Fallback text" - page.to_image.return_value.original = MagicMock() page.close = MagicMock() - img_stream = io.BytesIO() - page.to_image.return_value.original.save = lambda s, format: s.write(b"fake") + # Mock to_image + mock_img = MagicMock() + page.to_image.return_value = mock_img mock_pdf = MagicMock() mock_pdf.pages = [page] @@ -177,5 +121,5 @@ def test_fallback_on_ai_failure(self): stream = io.BytesIO(b"%PDF-1.4") result = converter.convert(stream, MagicMock()) - # Should fallback to 
default text - assert "Fallback text" in result.markdown \ No newline at end of file + # Should call AI (because force_ai=True) + mock_glmocr_instance.parse.assert_called_once() diff --git a/packages/markitdown-glmocr/tests/test_scan_detection.py b/packages/markitdown-glmocr/tests/test_scan_detection.py new file mode 100644 index 000000000..01b2442a6 --- /dev/null +++ b/packages/markitdown-glmocr/tests/test_scan_detection.py @@ -0,0 +1,437 @@ +"""Tests for scan detection optimization in GlmOcrConverter.""" + +import pytest +from unittest.mock import MagicMock, patch + +from markitdown_glmocr._config import GlmOcrConfig, ScanDetectionMode +from markitdown_glmocr._converter import GlmOcrConverter + + +class TestScanDetectionMode: + """扫描检测模式配置测试""" + + def test_default_mode_is_sampling(self): + """默认模式应为 SAMPLING""" + config = GlmOcrConfig() + assert config.scan_detection_mode == ScanDetectionMode.SAMPLING + + def test_custom_mode_from_config(self): + """从配置对象读取自定义模式""" + with patch("markitdown_glmocr._converter.glmocr"): + config = GlmOcrConfig(scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT) + converter = GlmOcrConverter(config=config) + assert converter.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT + + def test_custom_mode_from_constructor(self): + """从构造函数传入自定义模式""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + assert converter.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE + + def test_constructor_overrides_config(self): + """构造函数参数优先于配置对象""" + with patch("markitdown_glmocr._converter.glmocr"): + config = GlmOcrConfig(scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT) + converter = GlmOcrConverter( + config=config, + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + assert converter.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE + + +class TestIsScannedPage: + """扫描页面检测测试""" + + def 
test_page_without_images_not_scanned(self): + """无图片的页面不是扫描件""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter() + + page = MagicMock() + page.images = [] + page.extract_text.return_value = "Some text content here" + + assert converter._is_scanned_page(page) is False + + def test_page_with_images_and_text_not_scanned(self): + """有图片但有足够文本的页面不是扫描件""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter(scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "This is more than 50 characters of text content that should be extracted" + + assert converter._is_scanned_page(page) is False + + def test_page_with_images_no_text_is_scanned(self): + """有图片但无文本的页面是扫描件""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter(scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "" + + assert converter._is_scanned_page(page) is True + + def test_page_with_images_little_text_is_scanned(self): + """有图片但文本少于阈值的页面是扫描件""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter(scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "Short text" # Only 10 chars + + assert converter._is_scanned_page(page) is True + + def test_text_extraction_error_assumes_scanned(self): + """文本提取失败时假定是扫描件""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter() + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.side_effect = Exception("Extraction failed") + + assert converter._is_scanned_page(page) is True + + def test_custom_threshold(self): + """自定义阈值生效""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter(scan_text_threshold=100) + + # Text below threshold + page1 = MagicMock() + page1.images = [MagicMock()] + 
page1.extract_text.return_value = "This is exactly 50 characters" # ~30 chars + + assert converter._is_scanned_page(page1) is True + + # Text above threshold + page2 = MagicMock() + page2.images = [MagicMock()] + page2.extract_text.return_value = "This is definitely more than 100 characters of text content here for testing and verification purposes" # 106 chars + + assert converter._is_scanned_page(page2) is False + + +class TestDetectAllScanned: + """全文档扫描检测测试""" + + def test_page_by_page_mode_returns_false(self): + """PAGE_BY_PAGE 模式永远返回 False""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + + # Even with all scanned pages + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + pdf.pages = [scanned_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def test_first_page_hint_first_page_scanned(self): + """FIRST_PAGE_HINT 模式,首页扫描则全文档扫描""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT, + ) + + # First page scanned + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, normal_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_first_page_hint_first_page_not_scanned(self): + """FIRST_PAGE_HINT 模式,首页非扫描则不判定全扫描""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT, + ) + + # First page not scanned + pdf = MagicMock() + normal_page = MagicMock() + 
normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + pdf.pages = [normal_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def test_sampling_mode_majority_scanned(self): + """SAMPLING 模式,多数页面扫描则全文档扫描""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # 3 pages, 2 scanned, 1 normal -> majority scanned + pdf = MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, scanned_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_sampling_mode_minority_scanned(self): + """SAMPLING 模式,少数页面扫描则不判定全扫描""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # 3 pages, 1 scanned, 2 normal -> minority scanned + pdf = MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [normal_page, normal_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def test_sampling_mode_all_scanned(self): + """SAMPLING 模式,所有抽样页扫描则全文档扫描""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = 
"" + + pdf.pages = [scanned_page, scanned_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_sampling_mode_custom_sample_count(self): + """SAMPLING 模式,自定义抽样页数""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=5, + ) + + # 5 pages sampled, 3 scanned -> majority + pdf = MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, scanned_page, scanned_page, normal_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_empty_pdf_returns_false(self): + """空 PDF 返回 False""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter() + + pdf = MagicMock() + pdf.pages = [] + + assert converter._detect_all_scanned(pdf) is False + + def test_pdf_with_less_pages_than_sample_count(self): + """PDF 页数少于抽样数时使用实际页数""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=5, + ) + + # Only 2 pages, both scanned -> majority + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + pdf.pages = [scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is True + + +class TestConvertPdfWithScanDetection: + """PDF 转换中的扫描检测集成测试""" + + def test_all_scanned_uses_batch_mode(self): + """全扫描模式优先使用批量上传""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + 
converter._convert_pdf_batch = MagicMock(return_value="Batch OCR result") + converter._convert_with_glmocr = MagicMock(return_value="Page OCR result") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should call batch mode (1 API call) + converter._convert_pdf_batch.assert_called_once() + # Should NOT call per-page OCR + converter._convert_with_glmocr.assert_not_called() + assert "Batch OCR result" in result.markdown + + def test_batch_failure_fallback_to_per_page(self): + """批量OCR失败后降级为逐页处理""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + converter._convert_pdf_batch = MagicMock(side_effect=RuntimeError("Batch API error")) + converter._convert_with_glmocr = MagicMock(return_value="Page OCR result") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should have tried batch first + converter._convert_pdf_batch.assert_called_once() + # Should fall back to per-page OCR + assert converter._convert_with_glmocr.call_count == 2 + + def 
test_all_scanned_skips_per_page_analysis(self): + """全扫描模式跳过逐页分析""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + converter._convert_pdf_batch = MagicMock(return_value="Batch OCR result") + converter._analyze_page = MagicMock(return_value="plain_text") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should call batch mode, not _analyze_page + converter._convert_pdf_batch.assert_called_once() + converter._analyze_page.assert_not_called() + + def test_page_by_page_mode_analyzes_each_page(self): + """PAGE_BY_PAGE 模式分析每页""" + with patch("markitdown_glmocr._converter.glmocr"): + converter = GlmOcrConverter( + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + + # Mock _analyze_page to return different results + converter._analyze_page = MagicMock(side_effect=["plain_text", "complex"]) + converter._convert_with_glmocr = MagicMock(return_value="OCR result") + converter._extract_text_with_tables = MagicMock(return_value="Text result") + + # Mock PDF + page1 = MagicMock() + page1.close = MagicMock() + page2 = MagicMock() + page2.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [page1, page2] + + with patch("markitdown_glmocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should analyze each 
page + assert converter._analyze_page.call_count == 2 + # Should use different methods for different pages + converter._extract_text_with_tables.assert_called_once() + converter._convert_with_glmocr.assert_called_once() \ No newline at end of file diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py index 51fc00d60..ddd6ca794 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py @@ -2,6 +2,19 @@ import os from dataclasses import dataclass +from enum import Enum + + +class ScanDetectionMode(str, Enum): + """扫描检测模式。 + + - PAGE_BY_PAGE: 逐页分析,当前默认行为 + - FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR + - SAMPLING: 抽样前N页,多数是扫描件则全部OCR + """ + PAGE_BY_PAGE = "page_by_page" + FIRST_PAGE_HINT = "first_page_hint" + SAMPLING = "sampling" @dataclass @@ -35,6 +48,11 @@ class PaddleOcrConfig: # Processing strategy force_ai: bool = False + # Scan detection mode for optimization + scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING + scan_sample_pages: int = 3 # Number of pages to sample in SAMPLING mode + scan_text_threshold: int = 50 # Min text length to consider page as non-scanned + @classmethod def from_env(cls, **overrides) -> "PaddleOcrConfig": """Create config from environment variables with optional overrides.""" diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py index 627b2dfd5..48e5c2bd6 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py @@ -11,7 +11,7 @@ MissingDependencyException, ) -from ._config import PaddleOcrConfig +from ._config import PaddleOcrConfig, ScanDetectionMode from ._paddle_client import PaddleClient # Import PDF dependencies @@ -58,6 +58,9 @@ def __init__( 
use_doc_orientation_classify: bool = False, use_doc_unwarping: bool = False, use_chart_recognition: bool = False, + scan_detection_mode: Optional[ScanDetectionMode] = None, + scan_sample_pages: Optional[int] = None, + scan_text_threshold: Optional[int] = None, config: Optional[PaddleOcrConfig] = None, ): """Initialize converter. @@ -71,6 +74,9 @@ def __init__( use_doc_orientation_classify: Enable document orientation classification use_doc_unwarping: Enable document unwarping use_chart_recognition: Enable chart recognition + scan_detection_mode: 扫描检测模式,优化扫描PDF处理 + scan_sample_pages: SAMPLING模式下抽样页数 (default: 3) + scan_text_threshold: 判定为扫描件的最小文本长度阈值 (default: 50) config: Optional PaddleOcrConfig instance """ # Build config from explicit params or provided config @@ -91,6 +97,21 @@ def __init__( self.use_chart_recognition = ( use_chart_recognition or config.use_chart_recognition ) + self.scan_detection_mode = ( + scan_detection_mode + if scan_detection_mode is not None + else config.scan_detection_mode + ) + self.scan_sample_pages = ( + scan_sample_pages + if scan_sample_pages is not None + else config.scan_sample_pages + ) + self.scan_text_threshold = ( + scan_text_threshold + if scan_text_threshold is not None + else config.scan_text_threshold + ) else: self.token = token self.model = model @@ -100,6 +121,15 @@ def __init__( self.use_doc_orientation_classify = use_doc_orientation_classify self.use_doc_unwarping = use_doc_unwarping self.use_chart_recognition = use_chart_recognition + self.scan_detection_mode = ( + scan_detection_mode + if scan_detection_mode is not None + else ScanDetectionMode.SAMPLING + ) + self.scan_sample_pages = scan_sample_pages if scan_sample_pages is not None else 3 + self.scan_text_threshold = ( + scan_text_threshold if scan_text_threshold is not None else 50 + ) # Lazy init client self._client: Optional[PaddleClient] = None @@ -199,6 +229,7 @@ def _convert_image( def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: 
"""Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages).""" pdf_stream = io.BytesIO(file_stream.read()) + pdf_bytes = pdf_stream.getvalue() # Keep original bytes for batch OCR markdown_parts = [] ocr_failed = False @@ -207,18 +238,40 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: total_pages = len(pdf.pages) logger.info("PaddleOcrConverter: 开始处理 PDF, 总页数=%d", total_pages) - for page_num, page in enumerate(pdf.pages): - # Analyze page type - page_type = self._analyze_page(page) + # Optimization: detect if entire PDF is scanned + all_scanned = self._detect_all_scanned(pdf) + + if all_scanned and not self.force_ai: + # Batch mode: upload entire PDF to OCR API (single API call) + logger.info( + "PaddleOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d", + total_pages, + ) + try: + markdown = self._convert_pdf_batch(pdf_bytes) + if markdown.strip(): + logger.info( + "PaddleOcrConverter: 批量OCR完成, 输出长度=%d", + len(markdown), + ) + return DocumentConverterResult(markdown=markdown) + except Exception as e: + logger.warning( + "PaddleOcrConverter: 批量OCR失败, 降级为逐页处理, 错误=%s", + e, + ) + ocr_failed = True + # Fall through to per-page processing + # Per-page processing (PAGE_BY_PAGE mode or batch failed) + for page_num, page in enumerate(pdf.pages): # Choose processing method - if self.force_ai or page_type != "plain_text": - # Complex content: try PaddleOCR, fallback to pdfplumber on failure + if self.force_ai or all_scanned: + # All scanned (after batch failed) or force_ai logger.info( - "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 PaddleOCR", + "PaddleOcrConverter: 第 %d/%d 页, 使用 PaddleOCR", page_num + 1, total_pages, - page_type, ) try: markdown = self._convert_with_paddleocr(page, page_num) @@ -232,14 +285,35 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: ocr_failed = True markdown = self._extract_text_with_tables(page) else: - # Plain text: use pdfplumber - logger.info( - "PaddleOcrConverter: 第 %d/%d 
页, 类型=%s, 使用 pdfplumber", - page_num + 1, - total_pages, - page_type, - ) - markdown = self._extract_text_with_tables(page) + # Per-page analysis (PAGE_BY_PAGE mode or non-scanned doc) + page_type = self._analyze_page(page) + + if page_type != "plain_text": + logger.info( + "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 PaddleOCR", + page_num + 1, + total_pages, + page_type, + ) + try: + markdown = self._convert_with_paddleocr(page, page_num) + except Exception as e: + logger.warning( + "PaddleOcrConverter: 第 %d/%d 页 OCR 失败, 降级为 pdfplumber, 错误=%s", + page_num + 1, + total_pages, + e, + ) + ocr_failed = True + markdown = self._extract_text_with_tables(page) + else: + logger.info( + "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber", + page_num + 1, + total_pages, + page_type, + ) + markdown = self._extract_text_with_tables(page) if markdown.strip(): markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}") @@ -272,6 +346,24 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: logger.info("PaddleOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown)) return DocumentConverterResult(markdown=markdown) + def _convert_pdf_batch(self, pdf_bytes: bytes) -> str: + """Convert entire PDF in a single API call. + + More efficient for scanned PDFs: one API call instead of N calls for N pages. + + Args: + pdf_bytes: Raw PDF file content. + + Returns: + Markdown text from all pages. + """ + logger.info("PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes)) + markdown = self._get_client().ocr( + file_bytes=pdf_bytes, + filename="document.pdf", + ) + return markdown + def _analyze_page(self, page: Any) -> str: """Analyze page content type.""" # Check for images @@ -289,6 +381,93 @@ def _analyze_page(self, page: Any) -> str: return "plain_text" + def _is_scanned_page(self, page: Any) -> bool: + """Check if a page is likely a scanned image. + + A page is considered scanned if: + 1. It contains images, AND + 2. 
It has very little extractable text (below threshold) + + Args: + page: pdfplumber page object + + Returns: + True if the page appears to be a scanned image + """ + # Must have images to be a scan + has_images = hasattr(page, "images") and bool(page.images) + if not has_images: + return False + + # Check extractable text length + try: + text = page.extract_text() or "" + text_len = len(text.strip()) + # If there's substantial text, it might be a mixed page or + # a digital PDF with embedded images + if text_len >= self.scan_text_threshold: + return False + except Exception: + # If text extraction fails, assume it's a scan + return True + + return True + + def _detect_all_scanned(self, pdf: Any) -> bool: + """Detect if entire PDF is scanned based on scan_detection_mode. + + Optimization: When first few pages are scanned, we can assume + all pages are scanned and skip per-page analysis. + + Args: + pdf: pdfplumber PDF object + + Returns: + True if entire PDF should be treated as scanned + """ + if self.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE: + return False + + total_pages = len(pdf.pages) + if total_pages == 0: + return False + + if self.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT: + # Check only first page + first_page = pdf.pages[0] + is_scanned = self._is_scanned_page(first_page) + first_page.close() + if is_scanned: + logger.info( + "PaddleOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR" + ) + return is_scanned + + if self.scan_detection_mode == ScanDetectionMode.SAMPLING: + # Sample first N pages + sample_count = min(self.scan_sample_pages, total_pages) + scanned_count = 0 + + for i in range(sample_count): + page = pdf.pages[i] + if self._is_scanned_page(page): + scanned_count += 1 + + # If majority of sampled pages are scanned, treat all as scanned + majority_threshold = sample_count // 2 + 1 + all_scanned = scanned_count >= majority_threshold + + if all_scanned: + logger.info( + "PaddleOcrConverter: 抽样检测 %d/%d 页为扫描件, 
模式=SAMPLING, 全文档使用OCR", + scanned_count, + sample_count, + ) + + return all_scanned + + return False + def _convert_with_paddleocr(self, page: Any, page_num: int) -> str: """Convert page using PaddleOCR API.""" # Render page to image diff --git a/packages/markitdown-paddleocr/tests/test_scan_detection.py b/packages/markitdown-paddleocr/tests/test_scan_detection.py new file mode 100644 index 000000000..116197fe6 --- /dev/null +++ b/packages/markitdown-paddleocr/tests/test_scan_detection.py @@ -0,0 +1,430 @@ +"""Tests for scan detection optimization.""" + +import pytest +from unittest.mock import MagicMock, patch + +from markitdown_paddleocr._config import PaddleOcrConfig, ScanDetectionMode +from markitdown_paddleocr._converter import PaddleOcrConverter + + +class TestScanDetectionMode: + """扫描检测模式配置测试""" + + def test_default_mode_is_sampling(self): + """默认模式应为 SAMPLING""" + config = PaddleOcrConfig() + assert config.scan_detection_mode == ScanDetectionMode.SAMPLING + + def test_custom_mode_from_config(self): + """从配置对象读取自定义模式""" + config = PaddleOcrConfig(scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT) + converter = PaddleOcrConverter(config=config, token="test_token") + assert converter.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT + + def test_custom_mode_from_constructor(self): + """从构造函数传入自定义模式""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + assert converter.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE + + def test_constructor_overrides_config(self): + """构造函数参数优先于配置对象""" + config = PaddleOcrConfig(scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT) + converter = PaddleOcrConverter( + config=config, + token="test_token", + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + assert converter.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE + + +class TestIsScannedPage: + """扫描页面检测测试""" + + def test_page_without_images_not_scanned(self): + 
"""无图片的页面不是扫描件""" + converter = PaddleOcrConverter(token="test_token") + + page = MagicMock() + page.images = [] + page.extract_text.return_value = "Some text content here" + + assert converter._is_scanned_page(page) is False + + def test_page_with_images_and_text_not_scanned(self): + """有图片但有足够文本的页面不是扫描件""" + converter = PaddleOcrConverter(token="test_token", scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "This is more than 50 characters of text content that should be extracted" + + assert converter._is_scanned_page(page) is False + + def test_page_with_images_no_text_is_scanned(self): + """有图片但无文本的页面是扫描件""" + converter = PaddleOcrConverter(token="test_token", scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "" + + assert converter._is_scanned_page(page) is True + + def test_page_with_images_little_text_is_scanned(self): + """有图片但文本少于阈值的页面是扫描件""" + converter = PaddleOcrConverter(token="test_token", scan_text_threshold=50) + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.return_value = "Short text" # Only 10 chars + + assert converter._is_scanned_page(page) is True + + def test_text_extraction_error_assumes_scanned(self): + """文本提取失败时假定是扫描件""" + converter = PaddleOcrConverter(token="test_token") + + page = MagicMock() + page.images = [MagicMock()] + page.extract_text.side_effect = Exception("Extraction failed") + + assert converter._is_scanned_page(page) is True + + def test_custom_threshold(self): + """自定义阈值生效""" + converter = PaddleOcrConverter(token="test_token", scan_text_threshold=100) + + # Text below threshold + page1 = MagicMock() + page1.images = [MagicMock()] + page1.extract_text.return_value = "This is exactly 50 characters" # ~30 chars + + assert converter._is_scanned_page(page1) is True + + # Text above threshold + page2 = MagicMock() + page2.images = [MagicMock()] + page2.extract_text.return_value = 
"This is definitely more than 100 characters of text content here for testing and verification purposes" # 106 chars + + assert converter._is_scanned_page(page2) is False + + +class TestDetectAllScanned: + """全文档扫描检测测试""" + + def test_page_by_page_mode_returns_false(self): + """PAGE_BY_PAGE 模式永远返回 False""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + + # Even with all scanned pages + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + pdf.pages = [scanned_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def test_first_page_hint_first_page_scanned(self): + """FIRST_PAGE_HINT 模式,首页扫描则全文档扫描""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT, + ) + + # First page scanned + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, normal_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_first_page_hint_first_page_not_scanned(self): + """FIRST_PAGE_HINT 模式,首页非扫描则不判定全扫描""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.FIRST_PAGE_HINT, + ) + + # First page not scanned + pdf = MagicMock() + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + pdf.pages = [normal_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def 
test_sampling_mode_majority_scanned(self): + """SAMPLING 模式,多数页面扫描则全文档扫描""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # 3 pages, 2 scanned, 1 normal -> majority scanned + pdf = MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, scanned_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_sampling_mode_minority_scanned(self): + """SAMPLING 模式,少数页面扫描则不判定全扫描""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # 3 pages, 1 scanned, 2 normal -> minority scanned + pdf = MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [normal_page, normal_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is False + + def test_sampling_mode_all_scanned(self): + """SAMPLING 模式,所有抽样页扫描则全文档扫描""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + pdf.pages = [scanned_page, scanned_page, scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_sampling_mode_custom_sample_count(self): + """SAMPLING 模式,自定义抽样页数""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=5, + ) + + # 5 pages sampled, 3 scanned -> majority + pdf = 
MagicMock() + + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + normal_page = MagicMock() + normal_page.images = [] + normal_page.extract_text.return_value = "Normal text" + + pdf.pages = [scanned_page, scanned_page, scanned_page, normal_page, normal_page] + + assert converter._detect_all_scanned(pdf) is True + + def test_empty_pdf_returns_false(self): + """空 PDF 返回 False""" + converter = PaddleOcrConverter(token="test_token") + + pdf = MagicMock() + pdf.pages = [] + + assert converter._detect_all_scanned(pdf) is False + + def test_pdf_with_less_pages_than_sample_count(self): + """PDF 页数少于抽样数时使用实际页数""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=5, + ) + + # Only 2 pages, both scanned -> majority + pdf = MagicMock() + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + + pdf.pages = [scanned_page, scanned_page] + + assert converter._detect_all_scanned(pdf) is True + + +class TestConvertPdfWithScanDetection: + """PDF 转换中的扫描检测集成测试""" + + def test_all_scanned_uses_batch_mode(self): + """全扫描模式优先使用批量上传""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + converter._convert_pdf_batch = MagicMock(return_value="Batch OCR result") + converter._convert_with_paddleocr = MagicMock(return_value="Page OCR result") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io 
+ stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should call batch mode (1 API call) + converter._convert_pdf_batch.assert_called_once() + # Should NOT call per-page OCR + converter._convert_with_paddleocr.assert_not_called() + assert "Batch OCR result" in result.markdown + + def test_batch_failure_fallback_to_per_page(self): + """批量OCR失败后降级为逐页处理""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + converter._convert_pdf_batch = MagicMock(side_effect=RuntimeError("Batch API error")) + converter._convert_with_paddleocr = MagicMock(return_value="Page OCR result") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should have tried batch first + converter._convert_pdf_batch.assert_called_once() + # Should fall back to per-page OCR + assert converter._convert_with_paddleocr.call_count == 2 + + def test_all_scanned_skips_per_page_analysis(self): + """全扫描模式跳过逐页分析""" + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.SAMPLING, + scan_sample_pages=3, + ) + + # Mock _detect_all_scanned to return True + converter._detect_all_scanned = MagicMock(return_value=True) + converter._convert_pdf_batch = MagicMock(return_value="Batch OCR result") + converter._analyze_page = MagicMock(return_value="plain_text") + + # Mock PDF + scanned_page = MagicMock() + scanned_page.images = [MagicMock()] + 
scanned_page.extract_text.return_value = "" + scanned_page.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [scanned_page, scanned_page] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should call batch mode, not _analyze_page + converter._convert_pdf_batch.assert_called_once() + converter._analyze_page.assert_not_called() + + def test_page_by_page_mode_analyzes_each_page(self): + """PAGE_BY_PAGE 模式分析每页""" + + converter = PaddleOcrConverter( + token="test_token", + scan_detection_mode=ScanDetectionMode.PAGE_BY_PAGE, + ) + + # Mock _analyze_page to return different results + converter._analyze_page = MagicMock(side_effect=["plain_text", "complex"]) + converter._convert_with_paddleocr = MagicMock(return_value="OCR result") + converter._extract_text_with_tables = MagicMock(return_value="Text result") + + # Mock PDF + page1 = MagicMock() + page1.close = MagicMock() + page2 = MagicMock() + page2.close = MagicMock() + + pdf = MagicMock() + pdf.pages = [page1, page2] + + with patch("markitdown_paddleocr._converter.pdfplumber.open") as mock_open: + mock_open.return_value.__enter__.return_value = pdf + + import io + stream = io.BytesIO(b"%PDF-1.4") + result = converter._convert_pdf(stream) + + # Should analyze each page + assert converter._analyze_page.call_count == 2 + # Should use different methods for different pages + converter._extract_text_with_tables.assert_called_once() + converter._convert_with_paddleocr.assert_called_once() \ No newline at end of file From 67e871b40520fbbd50a0195211f6b474a1272660 Mon Sep 17 00:00:00 2001 From: hankl Date: Thu, 21 May 2026 16:22:20 +0800 Subject: [PATCH 11/15] =?UTF-8?q?=E4=B8=8A=E4=BC=A0pypi=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/markitdown-glmocr/README.md | 69 
++++++++++++---- .../src/markitdown_glmocr/__about__.py | 2 +- packages/markitdown-paddleocr/README.md | 69 ++++++++++++---- .../src/markitdown_paddleocr/__about__.py | 2 +- scripts/pypi-upload.ps1 | 76 ++++++++++++++++++ scripts/pypi-upload.sh | 79 +++++++++++++++++++ 6 files changed, 265 insertions(+), 32 deletions(-) create mode 100644 scripts/pypi-upload.ps1 create mode 100644 scripts/pypi-upload.sh diff --git a/packages/markitdown-glmocr/README.md b/packages/markitdown-glmocr/README.md index 15c2b819e..35c221524 100644 --- a/packages/markitdown-glmocr/README.md +++ b/packages/markitdown-glmocr/README.md @@ -195,39 +195,78 @@ glmocr SDK 返回的结构化数据支持以下标签: ### 前置条件 -- 确保已安装 `build` 和 `twine`: +1. 安装构建工具: ```bash -pip install build twine +pip install build twine hatch ``` -- 确保环境变量 `PyPI_API_Token` 已设置为你的 PyPI API Token: +2. 配置 PyPI API Token(Windows 用户环境变量): +```powershell +# PowerShell 设置用户环境变量 +[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User') +``` + +或在 Bash/Zsh 中: + +```bash +export PYPI_API_TOKEN="pypi-..." +``` + +### 快速发布(推荐) + +项目根目录提供了上传脚本,可一键发布两个插件: + +**Bash / Git Bash:** ```bash -export PyPI_API_Token="pypi-..." +# 构建两个插件 +cd packages/markitdown-glmocr && hatch build + +cd ../markitdown-paddleocr && hatch build + +# 上传(自动上传所有构建的版本) +cd ../.. +./scripts/pypi-upload.sh + +# 或指定版本号 +./scripts/pypi-upload.sh 0.2.0 +``` + +**PowerShell:** +```powershell +# 构建两个插件 +cd packages/markitdown-glmocr; hatch build +cd ../markitdown-paddleocr; hatch build + +# 上传 +cd ../.. +.\scripts\pypi-upload.ps1 + +# 或指定版本号 +.\scripts\pypi-upload.ps1 -Version "0.2.0" ``` -### 发布步骤 +### 手动发布 ```bash -# 1. 进入项目根目录(包含 pyproject.toml) +# 1. 进入项目目录 cd packages/markitdown-glmocr -# 2. 构建分发包(生成 dist/ 目录下的 .tar.gz 和 .whl 文件) -python -m build +# 2. 构建 +hatch build -# 3. 检查包的元数据和内容 +# 3. 检查 twine check dist/* -# 4. 上传到 PyPI(使用环境变量中的 Token 认证) -twine upload dist/* -u __token__ -p "$PyPI_API_Token" +# 4. 
上传 +twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/* ``` ### 发布到 TestPyPI(测试) ```bash -# 先上传到 TestPyPI 验证包是否正确 -twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token" +twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/* # 从 TestPyPI 安装验证 pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr @@ -235,9 +274,9 @@ pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr ### 注意事项 -- 发布前确保 `pyproject.toml` 中的版本号已更新 +- 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新 - 同一版本号不能重复上传,如需修正必须 bump 版本号 -- `PyPI_API_Token` 环境变量切勿硬编码到脚本或提交到代码仓库 +- `PYPI_API_TOKEN` 切勿提交到代码仓库 ## 许可证 diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py index 3dc1f76bc..d3ec452c3 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/packages/markitdown-paddleocr/README.md b/packages/markitdown-paddleocr/README.md index 2f4c49a13..ac7011d4d 100644 --- a/packages/markitdown-paddleocr/README.md +++ b/packages/markitdown-paddleocr/README.md @@ -156,39 +156,78 @@ PaddleOcrConverter.convert() ### 前置条件 -- 确保已安装 `build` 和 `twine`: +1. 安装构建工具: ```bash -pip install build twine +pip install build twine hatch ``` -- 确保环境变量 `PyPI_API_Token` 已设置为你的 PyPI API Token: +2. 配置 PyPI API Token(Windows 用户环境变量): +```powershell +# PowerShell 设置用户环境变量 +[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User') +``` + +或在 Bash/Zsh 中: + +```bash +export PYPI_API_TOKEN="pypi-..." +``` + +### 快速发布(推荐) + +项目根目录提供了上传脚本,可一键发布两个插件: + +**Bash / Git Bash:** ```bash -export PyPI_API_Token="pypi-..." 
+# 构建两个插件 +cd packages/markitdown-glmocr && hatch build + +cd ../markitdown-paddleocr && hatch build + +# 上传(自动上传所有构建的版本) +cd ../.. +./scripts/pypi-upload.sh + +# 或指定版本号 +./scripts/pypi-upload.sh 0.2.0 +``` + +**PowerShell:** +```powershell +# 构建两个插件 +cd packages/markitdown-glmocr; hatch build +cd ../markitdown-paddleocr; hatch build + +# 上传 +cd ../.. +.\scripts\pypi-upload.ps1 + +# 或指定版本号 +.\scripts\pypi-upload.ps1 -Version "0.2.0" ``` -### 发布步骤 +### 手动发布 ```bash -# 1. 进入项目根目录(包含 pyproject.toml) +# 1. 进入项目目录 cd packages/markitdown-paddleocr -# 2. 构建分发包(生成 dist/ 目录下的 .tar.gz 和 .whl 文件) -python -m build +# 2. 构建 +hatch build -# 3. 检查包的元数据和内容 +# 3. 检查 twine check dist/* -# 4. 上传到 PyPI(使用环境变量中的 Token 认证) -twine upload dist/* -u __token__ -p "$PyPI_API_Token" +# 4. 上传 +twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/* ``` ### 发布到 TestPyPI(测试) ```bash -# 先上传到 TestPyPI 验证包是否正确 -twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token" +twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/* # 从 TestPyPI 安装验证 pip install --index-url https://test.pypi.org/simple/ markitdown-paddleocr @@ -196,9 +235,9 @@ pip install --index-url https://test.pypi.org/simple/ markitdown-paddleocr ### 注意事项 -- 发布前确保 `pyproject.toml` 中的版本号已更新 +- 发布前确保 `src/markitdown_paddleocr/__about__.py` 中的版本号已更新 - 同一版本号不能重复上传,如需修正必须 bump 版本号 -- `PyPI_API_Token` 环境变量切勿硬编码到脚本或提交到代码仓库 +- `PYPI_API_TOKEN` 切勿提交到代码仓库 ## 许可证 diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py index 3dc1f76bc..d3ec452c3 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/scripts/pypi-upload.ps1 b/scripts/pypi-upload.ps1 new file mode 100644 index 
000000000..a1dbec0b6 --- /dev/null +++ b/scripts/pypi-upload.ps1 @@ -0,0 +1,76 @@ +# 上传 markitdown-glmocr 和 markitdown-paddleocr 到 PyPI +# 用法: .\scripts\pypi-upload.ps1 [-Version "0.2.0"] +# -Version: 可选,指定版本号,默认上传 dist 目录下所有文件 + +param( + [string]$Version = "" +) + +$ErrorActionPreference = "Stop" + +Write-Host "=== PyPI Upload Script ===" -ForegroundColor Green +Write-Host "" + +# 从用户环境变量读取 PYPI_API_TOKEN +$PypiToken = [System.Environment]::GetEnvironmentVariable('PYPI_API_TOKEN', 'User') + +if ([string]::IsNullOrEmpty($PypiToken)) { + Write-Host "错误: 未找到 PYPI_API_TOKEN 环境变量" -ForegroundColor Red + Write-Host "请在 Windows 用户环境变量中配置 PYPI_API_TOKEN" + exit 1 +} + +Write-Host "✓ PyPI API Token 已加载" -ForegroundColor Green +Write-Host "" + +# 设置 UTF-8 编码 +$env:PYTHONUTF8 = "1" + +$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$ProjectRoot = Split-Path -Parent $ScriptDir + +$Packages = @("markitdown-glmocr", "markitdown-paddleocr") + +foreach ($Pkg in $Packages) { + $PkgDir = Join-Path $ProjectRoot "packages\$Pkg" + $DistDir = Join-Path $PkgDir "dist" + + if (-not (Test-Path $DistDir)) { + Write-Host "跳过 $Pkg : dist 目录不存在" -ForegroundColor Yellow + continue + } + + Write-Host "--- 上传 $Pkg ---" -ForegroundColor Green + + # 获取包名格式 (markitdown-glmocr -> markitdown_glmocr) + $PkgName = $Pkg -replace '-', '_' + + # 确定要上传的文件 + if ($Version) { + $Pattern = "$PkgName-$Version*" + } else { + $Pattern = "$PkgName*" + } + + $UploadFiles = Get-ChildItem -Path $DistDir -Filter $Pattern -ErrorAction SilentlyContinue + + if ($UploadFiles) { + Write-Host "文件:" + $UploadFiles | ForEach-Object { Write-Host " $($_.Name)" } + Write-Host "" + + $FilesArg = $UploadFiles | ForEach-Object { $_.FullName } + & twine upload --username __token__ --password $PypiToken --disable-progress-bar @FilesArg + + # 提取版本号 + $LatestVersion = ($UploadFiles[0].Name | Select-String -Pattern '\d+\.\d+\.\d+').Matches.Value + Write-Host "✓ $Pkg 上传成功!" 
-ForegroundColor Green + Write-Host " https://pypi.org/project/$Pkg/$LatestVersion/" -ForegroundColor Cyan + Write-Host "" + } else { + Write-Host "跳过 $Pkg : 未找到版本 $Version 的构建文件" -ForegroundColor Yellow + Write-Host "" + } +} + +Write-Host "=== 上传完成 ===" -ForegroundColor Green diff --git a/scripts/pypi-upload.sh b/scripts/pypi-upload.sh new file mode 100644 index 000000000..dcd3ca6e6 --- /dev/null +++ b/scripts/pypi-upload.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# 上传 markitdown-glmocr 和 markitdown-paddleocr 到 PyPI +# 用法: ./scripts/pypi-upload.sh [version] +# version: 可选,指定版本号,默认上传 dist 目录下所有文件 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +# 颜色输出 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}=== PyPI Upload Script ===${NC}" +echo "" + +# 从 Windows 用户环境变量读取 PYPI_API_TOKEN +if [ -z "$PYPI_API_TOKEN" ]; then + PYPI_API_TOKEN=$(powershell -Command "[System.Environment]::GetEnvironmentVariable('PYPI_API_TOKEN', 'User')" 2>/dev/null) +fi + +if [ -z "$PYPI_API_TOKEN" ] || [ "$PYPI_API_TOKEN" = "(no output)" ]; then + echo -e "${RED}错误: 未找到 PYPI_API_TOKEN 环境变量${NC}" + echo "请设置 PYPI_API_TOKEN 环境变量或在 Windows 用户环境变量中配置" + exit 1 +fi + +echo -e "${GREEN}✓ PyPI API Token 已加载${NC}" +echo "" + +# 设置 UTF-8 编码避免 Windows GBK 问题 +export PYTHONUTF8=1 + +VERSION="${1:-}" +PACKAGES=("markitdown-glmocr" "markitdown-paddleocr") + +for PKG in "${PACKAGES[@]}"; do + PKG_DIR="$PROJECT_ROOT/packages/$PKG" + + if [ ! 
-d "$PKG_DIR/dist" ]; then + echo -e "${YELLOW}跳过 $PKG: dist 目录不存在${NC}" + continue + fi + + echo -e "${GREEN}--- 上传 $PKG ---${NC}" + + # 获取包名格式 (markitdown-glmocr -> markitdown_glmocr) + PKG_NAME=$(echo "$PKG" | tr '-' '_') + + # 确定要上传的文件 + if [ -n "$VERSION" ]; then + UPLOAD_FILES="$PKG_DIR/dist/${PKG_NAME}-${VERSION}*" + else + UPLOAD_FILES="$PKG_DIR/dist/${PKG_NAME}*" + fi + + # 检查文件是否存在 + if ls $UPLOAD_FILES 1> /dev/null 2>&1; then + echo "文件:" + ls $UPLOAD_FILES + echo "" + + twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar $UPLOAD_FILES + + # 从输出中提取版本号 + LATEST_VERSION=$(ls $UPLOAD_FILES | head -1 | grep -oP '\d+\.\d+\.\d+' | head -1) + echo -e "${GREEN}✓ $PKG 上传成功!${NC}" + echo " https://pypi.org/project/$PKG/${LATEST_VERSION:-latest}/" + echo "" + else + echo -e "${YELLOW}跳过 $PKG: 未找到版本 ${VERSION:-任何} 的构建文件${NC}" + echo "" + fi +done + +echo -e "${GREEN}=== 上传完成 ===${NC}" From 3d674e76f120171f0c9436adccc475f2dc023c1d Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 2 Jun 2026 11:18:31 +0800 Subject: [PATCH 12/15] =?UTF-8?q?markitdown-glmocr:=20=E6=89=B9=E9=87=8FOC?= =?UTF-8?q?R=E5=A4=B1=E8=B4=A5=E6=97=B6=E7=9B=B4=E6=8E=A5=E6=8A=9B?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=E8=AE=A9=E6=A1=86=E6=9E=B6fallback=EF=BC=8C?= =?UTF-8?q?=E4=B8=8D=E5=86=8D=E9=99=8D=E7=BA=A7=E4=B8=BA=E9=80=90=E9=A1=B5?= =?UTF-8?q?=E5=A4=84=E7=90=86=EF=BC=88=E9=81=BF=E5=85=8D429=E9=99=90?= =?UTF-8?q?=E6=B5=81=E6=97=B6N=E6=AC=A1=E6=97=A0=E6=95=88=E9=87=8D?= =?UTF-8?q?=E8=AF=95=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/markitdown_glmocr/_converter.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py index 19fa0b90f..35e00900c 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py +++ 
b/packages/markitdown-glmocr/src/markitdown_glmocr/_converter.py @@ -118,7 +118,9 @@ def __init__( if scan_detection_mode is not None else ScanDetectionMode.SAMPLING ) - self.scan_sample_pages = scan_sample_pages if scan_sample_pages is not None else 3 + self.scan_sample_pages = ( + scan_sample_pages if scan_sample_pages is not None else 3 + ) self.scan_text_threshold = ( scan_text_threshold if scan_text_threshold is not None else 50 ) @@ -237,11 +239,11 @@ def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult: ) return DocumentConverterResult(markdown=markdown) except Exception as e: - logger.warning( - "GlmOcrConverter: 批量OCR失败, 降级为逐页处理, 错误=%s", + logger.error( + "GlmOcrConverter: 批量OCR失败, 抛出异常让框架fallback到下一个converter, 错误=%s", e, ) - # Fall through to per-page processing + raise # Per-page processing (PAGE_BY_PAGE mode or batch failed) for page_num, page in enumerate(pdf.pages): @@ -311,7 +313,9 @@ def _convert_pdf_batch(self, pdf_bytes: bytes) -> str: Returns: Markdown text from all pages. 
""" - logger.info("GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes)) + logger.info( + "GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes) + ) result = self._get_glmocr().parse(pdf_bytes) # Check for errors From b537542233ae890ec932dd6ffb152212d6e27987 Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 2 Jun 2026 15:04:56 +0800 Subject: [PATCH 13/15] bump version: markitdown-glmocr 0.2.1, markitdown-paddleocr 0.2.1 --- packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py | 2 +- .../markitdown-paddleocr/src/markitdown_paddleocr/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py index d3ec452c3..3ced3581b 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py @@ -1 +1 @@ -__version__ = "0.2.0" +__version__ = "0.2.1" diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py index d3ec452c3..3ced3581b 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py @@ -1 +1 @@ -__version__ = "0.2.0" +__version__ = "0.2.1" From 4f94d9b10012ec1faf863f462c00822932e8d181 Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 2 Jun 2026 16:13:00 +0800 Subject: [PATCH 14/15] =?UTF-8?q?glmocr=E5=92=8Cpaddleocr=E4=BC=98?= =?UTF-8?q?=E5=85=88=E7=BA=A7=E5=B9=B3=E7=BA=A7(-1.0)=EF=BC=8C=E7=94=B1?= =?UTF-8?q?=E4=B8=8A=E5=B1=82agent=20skills=E6=8E=A7=E5=88=B6=E8=B0=83?= =?UTF-8?q?=E7=94=A8=E9=A1=BA=E5=BA=8F=EF=BC=9Bbump=20version=200.2.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../markitdown-glmocr/src/markitdown_glmocr/__about__.py | 2 +- 
packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py | 6 +++--- .../src/markitdown_paddleocr/__about__.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py index 3ced3581b..b5fdc7530 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/__about__.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.2.2" diff --git a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py index 5963dd43c..025a5ffd7 100644 --- a/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py +++ b/packages/markitdown-glmocr/src/markitdown_glmocr/_plugin.py @@ -25,9 +25,9 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: logger.info("markitdown-glmocr: 开始注册插件") # Register converter - # Priority -2.0: higher priority than PaddleOcrConverter (-1.0), - # so glmocr is tried first and paddleocr serves as fallback. - PRIORITY_GLMOCR = -2.0 + # Priority -1.0: same level as PaddleOcrConverter, + # the upper-level agent's skills control which plugin to call first. 
+ PRIORITY_GLMOCR = -1.0 try: converter = GlmOcrConverter( diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py index 3ced3581b..b5fdc7530 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.2.2" From 3f04dbcdfedb450a4a87cf258e6f5e8eb7a34e91 Mon Sep 17 00:00:00 2001 From: hankl Date: Tue, 2 Jun 2026 16:28:19 +0800 Subject: [PATCH 15/15] =?UTF-8?q?markitdown-paddleocr:=20PaddleOCR?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E4=BB=8EVL-1.5=E5=88=87=E6=8D=A2=E5=88=B0VL-?= =?UTF-8?q?1.6;=20bump=20version=200.2.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/markitdown-paddleocr/README.md | 6 +++--- .../src/markitdown_paddleocr/__about__.py | 2 +- .../src/markitdown_paddleocr/_config.py | 5 +++-- .../src/markitdown_paddleocr/_converter.py | 14 +++++++++----- .../src/markitdown_paddleocr/_dual_converter.py | 15 +++++++++++---- .../src/markitdown_paddleocr/_plugin.py | 2 +- 6 files changed, 28 insertions(+), 16 deletions(-) diff --git a/packages/markitdown-paddleocr/README.md b/packages/markitdown-paddleocr/README.md index ac7011d4d..e64f7120a 100644 --- a/packages/markitdown-paddleocr/README.md +++ b/packages/markitdown-paddleocr/README.md @@ -25,7 +25,7 @@ pip install markitdown-paddleocr export BAIDU_PADDLE_TOKEN="your-paddle-token" # 可选 -export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称 +export PADDLE_OCR_MODEL="PaddleOCR-VL-1.6" # 模型名称 ``` ### 配置优先级 @@ -103,7 +103,7 @@ print(markdown) | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| | `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token | -| `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 | +| `model` | str | `PaddleOCR-VL-1.6` | OCR 模型名称 | | `poll_interval` | float | 2.0 | 轮询间隔(秒) | | `poll_timeout` | float | 
300.0 | 轮询超时(秒) | | `force_ai` | bool | False | 强制所有页面使用 OCR | @@ -116,7 +116,7 @@ print(markdown) | 变量 | 说明 | 示例 | |------|------|------| | `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` | -| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` | +| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.6` | ## 工作原理 diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py index b5fdc7530..d31c31eae 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/__about__.py @@ -1 +1 @@ -__version__ = "0.2.2" +__version__ = "0.2.3" diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py index ddd6ca794..e66bb21e6 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_config.py @@ -12,6 +12,7 @@ class ScanDetectionMode(str, Enum): - FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR - SAMPLING: 抽样前N页,多数是扫描件则全部OCR """ + PAGE_BY_PAGE = "page_by_page" FIRST_PAGE_HINT = "first_page_hint" SAMPLING = "sampling" @@ -31,7 +32,7 @@ class PaddleOcrConfig: token: str = "" # Reads from BAIDU_PADDLE_TOKEN by default # OCR model - model: str = "PaddleOCR-VL-1.5" + model: str = "PaddleOCR-VL-1.6" # API endpoint job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs" @@ -58,7 +59,7 @@ def from_env(cls, **overrides) -> "PaddleOcrConfig": """Create config from environment variables with optional overrides.""" defaults = { "token": os.environ.get("BAIDU_PADDLE_TOKEN", ""), - "model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.5"), + "model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.6"), } defaults.update(overrides) return cls(**defaults) diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py 
b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py index 48e5c2bd6..6a11b8c85 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_converter.py @@ -51,7 +51,7 @@ class PaddleOcrConverter(DocumentConverter): def __init__( self, token: Optional[str] = None, - model: str = "PaddleOCR-VL-1.5", + model: str = "PaddleOCR-VL-1.6", poll_interval: float = 2.0, poll_timeout: float = 300.0, force_ai: bool = False, @@ -67,7 +67,7 @@ def __init__( Args: token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided) - model: OCR model name (default: PaddleOCR-VL-1.5) + model: OCR model name (default: PaddleOCR-VL-1.6) poll_interval: Seconds between status polls (default: 2.0) poll_timeout: Max seconds to wait for job completion (default: 300.0) force_ai: Force all pages to use OCR (default: False) @@ -82,7 +82,7 @@ def __init__( # Build config from explicit params or provided config if config: self.token = token or config.token - self.model = model if model != "PaddleOCR-VL-1.5" else config.model + self.model = model if model != "PaddleOCR-VL-1.6" else config.model self.poll_interval = ( poll_interval if poll_interval != 2.0 else config.poll_interval ) @@ -126,7 +126,9 @@ def __init__( if scan_detection_mode is not None else ScanDetectionMode.SAMPLING ) - self.scan_sample_pages = scan_sample_pages if scan_sample_pages is not None else 3 + self.scan_sample_pages = ( + scan_sample_pages if scan_sample_pages is not None else 3 + ) self.scan_text_threshold = ( scan_text_threshold if scan_text_threshold is not None else 50 ) @@ -357,7 +359,9 @@ def _convert_pdf_batch(self, pdf_bytes: bytes) -> str: Returns: Markdown text from all pages. 
""" - logger.info("PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes)) + logger.info( + "PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes) + ) markdown = self._get_client().ocr( file_bytes=pdf_bytes, filename="document.pdf", diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py index e27395c4d..0957b9b87 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_dual_converter.py @@ -1,10 +1,14 @@ """DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation.""" import logging -from typing import Optional +from typing import Any, BinaryIO, Optional -from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo -from typing import BinaryIO, Any +from markitdown import ( + DocumentConverter, + DocumentConverterResult, + MarkItDown, + StreamInfo, +) logger = logging.getLogger(__name__) @@ -28,7 +32,7 @@ def __init__( glmocr_force_ai: bool = False, # paddleocr kwargs paddleocr_token: Optional[str] = None, - paddleocr_model: str = "PaddleOCR-VL-1.5", + paddleocr_model: str = "PaddleOCR-VL-1.6", paddleocr_poll_interval: float = 2.0, paddleocr_poll_timeout: float = 300.0, paddleocr_force_ai: bool = False, @@ -61,6 +65,7 @@ def _init_converters(self): """Lazily init both converters.""" try: from markitdown_glmocr import GlmOcrConverter + # Filter out None values kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None} self._primary = GlmOcrConverter(**kwargs) @@ -71,6 +76,7 @@ def _init_converters(self): try: from markitdown_paddleocr import PaddleOcrConverter + kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None} self._fallback = PaddleOcrConverter(**kwargs) logger.info("paddleocr converter initialized (fallback)") @@ -155,6 +161,7 @@ def __exit__(self, 
exc_type, exc_val, exc_tb): def io_bytes(data: bytes): """Create a seekable BytesIO from bytes.""" import io + buf = io.BytesIO(data) buf.seek(0) return buf diff --git a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py index d24916ac5..e84e70bb8 100644 --- a/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py +++ b/packages/markitdown-paddleocr/src/markitdown_paddleocr/_plugin.py @@ -28,7 +28,7 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: try: converter = PaddleOcrConverter( token=kwargs.get("token"), - model=kwargs.get("model", "PaddleOCR-VL-1.5"), + model=kwargs.get("model", "PaddleOCR-VL-1.6"), poll_interval=kwargs.get("poll_interval", 2.0), poll_timeout=kwargs.get("poll_timeout", 300.0), force_ai=kwargs.get("force_ai", False),