# Command-line script: refine Markdown that was previously produced from a PDF.
#
# Usage:
#   --md-path PATH   refine a single Markdown file
#   --all            refine every ``output/*/index.md``
#
# For each Markdown file the matching PDF is looked up as
# ``input/<name of the Markdown file's parent directory>.pdf`` and the result
# is written next to the Markdown file as ``index_refined.md``.
import argparse
from pathlib import Path

from pdf_convertor import load_md_file, save_md_images, refine_content

# Directory holding the source PDFs (relative to the current working directory).
INPUT_DIR = Path("input")
# Directory whose sub-directories each hold one converted ``index.md``.
OUTPUT_DIR = Path("output")
# File name passed to ``save_md_images`` for the refined Markdown.
REFINED_MD_NAME = "index_refined.md"


def _refine_markdown(md_path: Path) -> None:
    """Refine one Markdown file and save the result beside it.

    The PDF passed to ``refine_content`` is ``input/<parent dir name>.pdf``,
    where the parent directory is the one containing ``md_path``.

    Args:
        md_path: Path to the Markdown file to refine.

    Raises:
        OSError: If the matching PDF cannot be read (e.g. it does not exist).
            Errors raised by the ``pdf_convertor`` helpers propagate unchanged.
    """
    # NOTE(review): if md_path has no directory component (e.g. "index.md"),
    # parent.name is "" and this resolves to "input/.pdf" — confirm callers
    # always pass a path inside a per-document directory.
    pdf_path = INPUT_DIR.joinpath(md_path.parent.name + ".pdf")

    output = md_path.parent
    output.mkdir(parents=True, exist_ok=True)

    md, images = load_md_file(md_path)
    pdf = pdf_path.read_bytes()

    md = refine_content(md, images, pdf)
    save_md_images(output, md, images, md_name=REFINED_MD_NAME)


def main() -> None:
    """Parse the command line and refine the selected Markdown file(s)."""
    parser = argparse.ArgumentParser(description="Refine Markdown content from PDF.")
    # Exactly one of --md-path / --all must be supplied.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--md-path", type=str, help="Path to the input Markdown file.")
    group.add_argument(
        "--all",
        action="store_true",
        help="Process all markdown files in the output directory.",
    )
    args = parser.parse_args()

    if args.all:
        for md_file_path in OUTPUT_DIR.glob("*/index.md"):
            _refine_markdown(md_file_path)
    else:
        _refine_markdown(Path(args.md_path))


if __name__ == "__main__":
    main()