refactor: 统一使用 OpenAI 兼容 API,支持自定义 base_url/key/model
- 移除 Gemini 和 Ollama 独立适配,统一使用 ChatOpenAI + base_url
- config.ini 简化为 BASE_URL / API_KEY / MODEL / TEMPERATURE / MAX_RETRIES
- 新增 config.example.ini 示例配置
- 移除 langchain-google-genai / langchain-ollama / pymupdf 依赖
- main.py 新增断点续跑:跳过已有 index.md / index_refined.md
- LLM 请求支持 max_retries 自动重试(默认 3 次)
- 优化 README
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
from pdf_convertor import (
|
||||
convert_pdf_to_markdown,
|
||||
load_md_file,
|
||||
save_md_images,
|
||||
refine_content,
|
||||
)
|
||||
@@ -24,12 +25,29 @@ def main():
|
||||
|
||||
current_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
index_md = current_output_dir / "index.md"
|
||||
refined_md = current_output_dir / "index_refined.md"
|
||||
|
||||
# 整个跳过:已存在精炼结果,跳过该文件
|
||||
if refined_md.exists():
|
||||
print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
|
||||
continue
|
||||
|
||||
print(f"Processing {pdf_path} -> {current_output_dir}")
|
||||
|
||||
with open(pdf_path, "rb") as pdf_file:
|
||||
pdf_content = pdf_file.read()
|
||||
md, images = convert_pdf_to_markdown(pdf_content)
|
||||
save_md_images(current_output_dir, md, images)
|
||||
|
||||
# 部分跳过:已存在转换结果,跳过 PDF 转 MD
|
||||
if index_md.exists():
|
||||
print(f" Skipping PDF→MD conversion: index.md already exists")
|
||||
md, images = load_md_file(index_md)
|
||||
else:
|
||||
md, images = convert_pdf_to_markdown(pdf_content)
|
||||
save_md_images(current_output_dir, md, images)
|
||||
|
||||
# 部分跳过:已存在精炼结果,跳过 LLM 精炼
|
||||
# (前面已检查整个跳过,这里不需要再检查)
|
||||
|
||||
try:
|
||||
md = refine_content(md, images, pdf_content)
|
||||
|
||||
Reference in New Issue
Block a user