Files
slide-translate/main.py
T
nite 1c1c68a214 refactor: 统一使用 OpenAI 兼容 API,支持自定义 base_url/key/model
- 移除 Gemini 和 Ollama 独立适配,统一使用 ChatOpenAI + base_url
- config.ini 简化为 BASE_URL / API_KEY / MODEL / TEMPERATURE / MAX_RETRIES
- 新增 config.example.ini 示例配置
- 移除 langchain-google-genai / langchain-ollama / pymupdf 依赖
- main.py 新增断点续跑:跳过已有 index.md / index_refined.md
- LLM 请求支持 max_retries 自动重试(默认 3 次)
- 优化 README
2026-04-18 18:42:42 +10:00

62 lines
1.8 KiB
Python
Executable File

import os
from pdf_convertor import (
convert_pdf_to_markdown,
load_md_file,
save_md_images,
refine_content,
)
from pathlib import Path
def main():
    """Convert every PDF in ``input/`` to Markdown, then refine it.

    For each file in ``input/`` whose name ends with ``.pdf`` a directory
    ``output/<file name without extension>/`` is created and filled in two
    stages:

    1. PDF -> Markdown via ``convert_pdf_to_markdown``; the result is
       written with ``save_md_images`` (presumably as ``index.md`` -- the
       default file name lives in ``pdf_convertor``; confirm there).
    2. Refinement via ``refine_content``; the result is written with
       ``save_md_images`` as ``index_refined.md``.

    Existing files act as checkpoints so an interrupted run can be resumed:
    a PDF whose ``index_refined.md`` exists is skipped entirely, and a PDF
    whose ``index.md`` exists skips the conversion stage and reloads the
    Markdown with ``load_md_file``.

    A failure in the refinement stage is reported and the run moves on to
    the next PDF.  ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
    not caught, so the run can be aborted (e.g. with Ctrl-C).

    Raises:
        FileNotFoundError: if the ``input`` directory does not exist
            (raised by ``os.listdir``).
    """
    input_dir = Path("input")
    output_dir = Path("output")
    output_dir.mkdir(parents=True, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = input_dir / filename
        # One output directory per PDF, named after the file minus ".pdf".
        current_output_dir = output_dir / pdf_path.stem
        current_output_dir.mkdir(parents=True, exist_ok=True)

        index_md = current_output_dir / "index.md"
        refined_md = current_output_dir / "index_refined.md"

        # Full skip: the refined result already exists for this PDF.
        if refined_md.exists():
            print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
            continue

        print(f"Processing {pdf_path} -> {current_output_dir}")

        # The raw bytes are needed by the refinement stage even when the
        # conversion stage is skipped.
        pdf_content = pdf_path.read_bytes()

        # Partial skip: reuse an existing conversion instead of redoing it.
        if index_md.exists():
            print("  Skipping PDF→MD conversion: index.md already exists")
            md, images = load_md_file(index_md)
        else:
            md, images = convert_pdf_to_markdown(pdf_content)
            save_md_images(current_output_dir, md, images)

        # No second check for index_refined.md is needed here: the full-skip
        # test above already handled that case.
        try:
            md = refine_content(md, images, pdf_content)
        except Exception as exc:
            # Best effort: report the failure and keep going with the other
            # PDFs.  Catching Exception (not BaseException) lets Ctrl-C and
            # sys.exit() stop the whole run instead of being swallowed.
            print(f"  Refinement failed for {pdf_path.name}: {exc!r}; skipping")
            continue

        save_md_images(current_output_dir, md, images, md_name="index_refined.md")
# Script entry point: run the batch job only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()