# slide-translate/refine.py

import argparse
from pdf_convertor import load_md_file, save_md_images, refine_content
from pathlib import Path

def _refine_file(md_path: Path) -> None:
    """Run the refinement pipeline on a single Markdown file.

    The companion PDF is looked up at ``input/<dir>.pdf`` (relative to the
    current working directory), where ``<dir>`` is the name of the directory
    that contains *md_path*. The refined Markdown and its images are passed
    to ``save_md_images`` with that same directory as the target and
    ``index_refined.md`` as the Markdown file name.

    Args:
        md_path: Path of the Markdown file to refine.

    Raises:
        OSError: If the companion PDF cannot be read (for example because
            it does not exist).
    """
    pdf_path = Path("input") / (md_path.parent.name + ".pdf")

    # Output goes next to the source Markdown; make sure the directory
    # exists before anything is written into it.
    output = md_path.parent
    output.mkdir(parents=True, exist_ok=True)

    md, images = load_md_file(md_path)
    pdf = pdf_path.read_bytes()
    md = refine_content(md, images, pdf)

    save_md_images(output, md, images, md_name="index_refined.md")


def main() -> None:
    """Parse command-line arguments and refine the selected Markdown files.

    Exactly one of the following options must be given:

    * ``--md-path PATH`` refines the single Markdown file at ``PATH``.
    * ``--all`` refines every ``output/*/index.md`` file.
    """
    parser = argparse.ArgumentParser(description="Refine Markdown content from PDF.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--md-path", type=str, help="Path to the input Markdown file.")
    group.add_argument(
        "--all",
        action="store_true",
        help="Process all markdown files in the output directory.",
    )
    args = parser.parse_args()

    if args.all:
        # Sorted so that batch runs process documents in a stable order.
        md_paths = sorted(Path("output").glob("*/index.md"))
    else:
        md_paths = [Path(args.md_path)]

    # Both modes share one code path so they cannot drift apart.
    for md_path in md_paths:
        _refine_file(md_path)


if __name__ == "__main__":
    main()