import os
from pathlib import Path

from pdf_convertor import (
    convert_pdf_to_markdown,
    load_md_file,
    refine_content,
    save_md_images,
)


def main():
    """Convert every PDF in ``input/`` to Markdown under ``output/<pdf stem>/``.

    For each ``*.pdf`` file the pipeline has two stages, each of which is
    skipped when its output already exists so that an interrupted run can be
    resumed:

    1. PDF -> Markdown conversion, written as ``index.md``.
    2. Refinement of that Markdown, written as ``index_refined.md``.

    A failure during refinement is reported and the file is skipped; the
    remaining PDFs are still processed. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not caught, so the run can be aborted.
    """
    input_dir = Path("input")
    output_dir = Path("output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order is deterministic across runs/platforms.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = input_dir.joinpath(filename)
        # A directory that happens to be named "*.pdf" cannot be read as a PDF.
        if not pdf_path.is_file():
            continue

        current_output_dir = output_dir.joinpath(pdf_path.stem)
        current_output_dir.mkdir(parents=True, exist_ok=True)

        index_md = current_output_dir / "index.md"
        refined_md = current_output_dir / "index_refined.md"

        # Full skip: the refined result already exists, so this file is done.
        if refined_md.exists():
            print(f"Skipping {pdf_path.name}: already processed (index_refined.md exists)")
            continue

        print(f"Processing {pdf_path} -> {current_output_dir}")
        # The raw bytes are needed by refinement even when conversion is skipped.
        pdf_content = pdf_path.read_bytes()

        # Partial skip: the conversion result already exists, so reuse it
        # instead of converting the PDF again.
        if index_md.exists():
            print("  Skipping PDF→MD conversion: index.md already exists")
            md, images = load_md_file(index_md)
        else:
            md, images = convert_pdf_to_markdown(pdf_content)
            save_md_images(current_output_dir, md, images)

        # No second existence check for the refined file is needed here: the
        # full-skip test above already handled that case.
        try:
            md = refine_content(md, images, pdf_content)
        except Exception as exc:
            # Best effort: report the failure and move on to the next PDF.
            # index.md is kept, so a later run retries only the refinement.
            print(f"  Refinement failed for {pdf_path.name}: {exc!r}; skipping")
            continue

        save_md_images(current_output_dir, md, images, md_name="index_refined.md")


if __name__ == "__main__":
    main()