# slide-translate/main.py

import os
from pdf_convertor import (
    convert_pdf_to_markdown,
    load_md_file,
    save_md_images,
    refine_content,
)
from pathlib import Path


def main() -> None:
    """Convert every PDF in ``input/`` to Markdown, then refine the result.

    For each ``input/<name>.pdf`` the results are placed in
    ``output/<name>/``:

    * the raw conversion, written by ``save_md_images`` with its default
      file name (assumed to be ``index.md``, which is what the resume check
      below looks for -- confirm against ``pdf_convertor``);
    * ``index_refined.md``, the output of ``refine_content``.

    The run is resumable: a PDF whose ``index_refined.md`` already exists is
    skipped entirely, and a PDF whose ``index.md`` already exists reuses it
    instead of being converted again.

    A failure while refining one PDF is reported and the remaining PDFs are
    still processed.

    Raises:
        FileNotFoundError: If the ``input`` directory does not exist.
    """
    input_dir = Path("input")
    output_dir = Path("output")

    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so that the processing order is the same on every run.
    for pdf_path in sorted(input_dir.iterdir()):
        # Only regular files named ``*.pdf``; a directory that happens to be
        # called ``something.pdf`` cannot be read as a PDF.
        if not pdf_path.name.endswith(".pdf") or not pdf_path.is_file():
            continue

        current_output_dir = output_dir / pdf_path.stem
        current_output_dir.mkdir(parents=True, exist_ok=True)

        index_md = current_output_dir / "index.md"
        refined_md = current_output_dir / "index_refined.md"

        # Full skip: the refined result already exists for this PDF.
        if refined_md.exists():
            print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
            continue

        print(f"Processing {pdf_path} -> {current_output_dir}")

        # Read the bytes up front so no file handle stays open during the
        # (potentially long) conversion and refinement steps.
        pdf_content = pdf_path.read_bytes()

        # Partial skip: reuse an existing conversion instead of redoing it.
        if index_md.exists():
            print("  Skipping PDF→MD conversion: index.md already exists")
            md, images = load_md_file(index_md)
        else:
            md, images = convert_pdf_to_markdown(pdf_content)
            save_md_images(current_output_dir, md, images)

        # No check for index_refined.md is needed here: the full-skip test
        # above already handled that case.
        try:
            md = refine_content(md, images, pdf_content)
        except Exception as exc:
            # Best effort per file: report and move on to the next PDF.
            # ``Exception`` (not ``BaseException``) so that Ctrl-C and
            # ``SystemExit`` still stop the whole run.
            print(f"  Failed to refine {pdf_path.name}: {exc!r}")
            continue

        save_md_images(current_output_dir, md, images, md_name="index_refined.md")


# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()