refactor: 统一使用 OpenAI 兼容 API,支持自定义 base_url/key/model

- 移除 Gemini 和 Ollama 独立适配,统一使用 ChatOpenAI + base_url
- config.ini 简化为 BASE_URL / API_KEY / MODEL / TEMPERATURE / MAX_RETRIES
- 新增 config.example.ini 示例配置
- 移除 langchain-google-genai / langchain-ollama / pymupdf 依赖
- main.py 新增断点续跑:已存在 index_refined.md 时跳过整个文件;仅存在 index.md 时跳过 PDF→MD 转换,只重新执行 LLM 精炼
- LLM 请求支持 max_retries 自动重试(默认 3 次)
- 优化 README
This commit is contained in:
2026-04-18 18:42:42 +10:00
parent f1214be148
commit 1c1c68a214
7 changed files with 1399 additions and 2072 deletions
+20 -2
View File
@@ -1,6 +1,7 @@
import os
from pdf_convertor import (
convert_pdf_to_markdown,
load_md_file,
save_md_images,
refine_content,
)
@@ -24,12 +25,29 @@ def main():
current_output_dir.mkdir(parents=True, exist_ok=True)
index_md = current_output_dir / "index.md"
refined_md = current_output_dir / "index_refined.md"
# 整个跳过:已存在精炼结果,跳过该文件
if refined_md.exists():
print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
continue
print(f"Processing {pdf_path} -> {current_output_dir}")
with open(pdf_path, "rb") as pdf_file:
pdf_content = pdf_file.read()
md, images = convert_pdf_to_markdown(pdf_content)
save_md_images(current_output_dir, md, images)
# 部分跳过:已存在转换结果,跳过 PDF 转 MD
if index_md.exists():
print(f" Skipping PDF→MD conversion: index.md already exists")
md, images = load_md_file(index_md)
else:
md, images = convert_pdf_to_markdown(pdf_content)
save_md_images(current_output_dir, md, images)
# 此处无需再检查精炼结果:index_refined.md 已存在的情况在前面已被整体跳过,
# 因此执行到这里时总是需要进行 LLM 精炼
try:
md = refine_content(md, images, pdf_content)