refactor: 统一使用 OpenAI 兼容 API，支持自定义 base_url/key/model

- 移除 Gemini 和 Ollama 独立适配，统一使用 ChatOpenAI + base_url
- config.ini 简化为 BASE_URL / API_KEY / MODEL / TEMPERATURE / MAX_RETRIES
- 新增 config.example.ini 示例配置
- 移除 langchain-google-genai / langchain-ollama / pymupdf 依赖
- main.py 新增断点续跑：跳过已有 index.md / index_refined.md
- LLM 请求支持 max_retries 自动重试（默认 3 次）
- 优化 README
2026-04-18 18:42:42 +10:00
parent f1214be148
commit 1c1c68a214
7 changed files with 1399 additions and 2072 deletions
@@ -1,6 +1,7 @@
 import os
 from pdf_convertor import (
    convert_pdf_to_markdown,
+    load_md_file,
    save_md_images,
    refine_content,
 )
@@ -24,12 +25,29 @@ def main():

        current_output_dir.mkdir(parents=True, exist_ok=True)

+        index_md = current_output_dir / "index.md"
+        refined_md = current_output_dir / "index_refined.md"
+
+        # 整个跳过：已存在精炼结果，跳过该文件
+        if refined_md.exists():
+            print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
+            continue
+
        print(f"Processing {pdf_path} -> {current_output_dir}")

        with open(pdf_path, "rb") as pdf_file:
            pdf_content = pdf_file.read()
-            md, images = convert_pdf_to_markdown(pdf_content)
-            save_md_images(current_output_dir, md, images)
+
+            # 部分跳过：已存在转换结果，跳过 PDF 转 MD
+            if index_md.exists():
+                print(f"  Skipping PDF→MD conversion: index.md already exists")
+                md, images = load_md_file(index_md)
+            else:
+                md, images = convert_pdf_to_markdown(pdf_content)
+                save_md_images(current_output_dir, md, images)
+
+            # 部分跳过：已存在精炼结果，跳过 LLM 精炼
+            # (前面已检查整个跳过，这里不需要再检查)

            try:
                md = refine_content(md, images, pdf_content)