Files
slide-translate/main.py

249 lines
9.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import base64
import os
import re
import configparser
import sys
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc.base import ImageRefMode
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI
def convert_pdf_to_markdown(input_doc_path, output_md_path):
    """Run docling on a PDF and write the result out as a Markdown file.

    The pipeline is configured for CUDA with 8 threads, with OCR, table
    structure recovery (including cell matching) and page/picture image
    generation switched on. Images are written as separate files that the
    Markdown references, under ``<markdown stem>/image``.

    Args:
        input_doc_path: Path of the PDF to convert.
        output_md_path: Path of the Markdown file to write.
    """
    pdf_options = PdfPipelineOptions(
        accelerator_options=AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        ),
        do_ocr=True,
        do_table_structure=True,
        generate_page_images=True,
        generate_picture_images=True,
    )
    pdf_options.table_structure_options.do_cell_matching = True

    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
    }
    converter = DocumentConverter(format_options=format_options)

    # Timings are only recorded when profiling is switched on before the run.
    settings.debug.profile_pipeline_timings = True

    print(f"Converting {input_doc_path} to Markdown...")
    result = converter.convert(input_doc_path)

    # ``times`` is the list of recorded totals for the whole pipeline.
    elapsed = result.timings["pipeline_total"].times

    # Artifacts directory is named after the Markdown file, minus extension.
    md_stem = os.path.splitext(os.path.basename(output_md_path))[0]
    image_dir = Path(os.path.join(md_stem, "image"))
    result.document.save_as_markdown(
        filename=Path(output_md_path),
        artifacts_dir=image_dir,
        image_mode=ImageRefMode.REFERENCED,
    )

    print(f"Conversion took: {elapsed} seconds")
    print(f"Markdown file saved to: {output_md_path}")
def simplify_image_references_in_markdown(markdown_path):
    """Shorten hashed image names in a Markdown file and rename the files.

    Every reference of the form ``(<prefix>image_NNNNNN_<hex>.png)`` whose
    file exists on disk is renamed to ``image_NNNNNN.png`` in the same
    directory, and the Markdown file is rewritten in place to use the new
    name. References whose file cannot be found are left untouched.

    Image paths are resolved relative to the directory that contains
    ``markdown_path``, so the function works for any output location rather
    than only a directory literally named ``output``.

    Args:
        markdown_path: Path of the Markdown file to rewrite in place.
    """
    print(f"Simplifying image references in {markdown_path}...")
    markdown_dir = os.path.dirname(markdown_path)
    # Matches the hashed tail of a basename and captures the part to keep.
    hashed_name = re.compile(r"(image_\d{6})_[a-f0-9]+\.png$")

    with open(markdown_path, "r+", encoding="utf-8") as f:
        content = f.read()
        # Find all unique image paths.
        image_refs = set(
            re.findall(r"\((\S*?image_\d{6}_[a-f0-9]+\.png)\)", content)
        )
        # Longest first: a reference that is a substring of another must not
        # be rewritten before the longer one has been handled.
        for old_ref in sorted(image_refs, key=len, reverse=True):
            old_file = os.path.join(markdown_dir, old_ref)
            if not os.path.exists(old_file):
                continue

            old_name = os.path.basename(old_ref)
            # e.g. image_000000_1a2b3c.png -> image_000000.png
            new_name = hashed_name.sub(r"\1.png", old_name)
            new_file = os.path.join(os.path.dirname(old_file), new_name)

            # os.replace overwrites a leftover file from an earlier run, so
            # the Markdown never ends up pointing at a stale image.
            os.replace(old_file, new_file)

            # Swap only the basename so the directory part of the reference
            # (including its separators) is preserved exactly as written.
            new_ref = old_ref[: len(old_ref) - len(old_name)] + new_name
            content = content.replace(old_ref, new_ref)

        # Go back to the beginning of the file and write the modified content.
        f.seek(0)
        f.write(content)
        f.truncate()
    print("Image references simplified.")
def _extract_response_text(content):
    """Flatten an LLM message ``content`` value into plain text.

    Message content may be a plain string or a list of parts (strings or
    dicts); only string parts and ``{"type": "text"}`` parts are kept.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and part.get("type") == "text":
                pieces.append(str(part.get("text", "")))
        return "".join(pieces)
    return str(content)


def refine_and_translate_content(markdown_path, pdf_path):
    """Refine and translate a converted Markdown file using Gemini.

    Reads the Google API key from ``config.ini`` (section ``api_keys``, option
    ``GOOGLE_API_KEY``), sends the Markdown text, every image it references
    that exists on disk, and the original PDF to the model, and writes the
    reply next to the Markdown file as ``<name>_refined_zh.md``.

    All failures (missing key, model initialisation error, unreadable input,
    failed request) are reported on stdout and end the function early; nothing
    is raised to the caller for those cases.

    Args:
        markdown_path: Path of the Markdown file produced by the conversion.
        pdf_path: Path of the original PDF, used by the model as ground truth.

    Returns:
        None.
    """
    print("Starting content refinement and translation...")
    config = configparser.ConfigParser()
    config.read("config.ini")
    google_api_key = config.get("api_keys", "GOOGLE_API_KEY", fallback=None)
    if not google_api_key:
        print("Error: GOOGLE_API_KEY not found in config.ini")
        return
    os.environ["GOOGLE_API_KEY"] = google_api_key

    try:
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
    except Exception as e:
        print(
            f"Error initializing LLM. Make sure your Google API key is set correctly. Error: {e}"
        )
        return

    try:
        with open(markdown_path, "r", encoding="utf-8") as f:
            markdown_text = f.read()
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
    except OSError as e:
        # OSError also covers permission problems and directories passed by
        # mistake, not just missing files.
        print(f"Error reading files: {e}")
        return
    markdown_content = markdown_text.encode("utf-8")

    # The task count and the inline-formula example are kept consistent with
    # the six numbered items below.
    prompt = """
您是一名专业的科技文档编辑和翻译。您的任务是润色一份从随附 PDF 文档自动转换而来的 Markdown 文本。请以原始 PDF 作为布局、图像和上下文的真实依据。
请根据提供的 Markdown 和 PDF 执行以下六项操作:
1. **清理多余字符**:查看 Markdown 文本,删除原始 PDF 中不存在的任何转换伪影或奇怪格式。
2. **解释图像内容**:参考 PDF 中的图表、示意图和图像,在图像引用后添加清晰的解释。
3. **更正列表格式**:转换可能使嵌套列表扁平化。分析 PDF 中的列表结构,并在 Markdown 中恢复正确的多级缩进。
4. **更正数学公式和符号**:将纯文字公式转换为正确的公式表达,例如 `Kmin` 应使用 `$K_{min}$`,`E = hc/λ` 应使用 `$E = \\frac{hc}{\\lambda}$`。
5. **调整标题**:将相同层级的同名标题按照小节内的不同内容重新命名,避免同层级同名标题出现,并且确保大纲的清晰性。
6. **翻译成中文**:将整个清理和更正后的文档翻译成简体中文。当您遇到专业或技术术语时,您必须在其译文旁边保留原始英文术语并用括号括起来。
只需要输出调整翻译后的 markdown 文本,不需要任何其他的文字内容。
"""

    human_message_parts = [
        {
            "type": "media",
            "mime_type": "text/markdown",
            "data": base64.b64encode(markdown_content).decode("utf-8"),
        },
    ]

    # Find all image references in the markdown content.
    image_paths = re.findall(r"!\[.*?\]\((.*?)\)", markdown_text)
    markdown_dir = os.path.dirname(markdown_path)
    if image_paths:
        print(f"Found {len(image_paths)} image references in the markdown file.")
        # An image referenced several times is attached only once; order of
        # first appearance is preserved.
        for image_path in dict.fromkeys(image_paths):
            # Image references are relative to the Markdown file's directory.
            full_image_path = os.path.join(markdown_dir, image_path)
            if not os.path.exists(full_image_path):
                print(f"Warning: Image file not found at {full_image_path}")
                continue
            with open(full_image_path, "rb") as f:
                image_data = f.read()
            human_message_parts.append(
                {
                    "type": "text",
                    "text": f"这是图片 '{os.path.basename(image_path)}':\n",
                }
            )
            human_message_parts.append(
                {
                    "type": "media",
                    "mime_type": "image/png",
                    "data": base64.b64encode(image_data).decode("utf-8"),
                }
            )

    human_message_parts.extend(
        [
            {
                "type": "text",
                "text": "这是原始的PDF文件:\n",
            },
            {
                "type": "media",
                "mime_type": "application/pdf",
                "data": base64.b64encode(pdf_bytes).decode("utf-8"),
            },
        ]
    )

    message_content = [
        SystemMessage(prompt),
        HumanMessage(human_message_parts),
    ]
    print(
        "Sending request to Gemini with the PDF, Markdown and referenced images... This may take a moment."
    )
    try:
        response = llm.invoke(message_content)
        refined_content = response.content
    except Exception as e:
        print(f"An error occurred while invoking the LLM: {e}")
        return

    refined_output_path = os.path.splitext(markdown_path)[0] + "_refined_zh.md"
    with open(refined_output_path, "w", encoding="utf-8") as f:
        # Content may arrive as a list of parts; write its text, not its repr.
        f.write(_extract_response_text(refined_content))
    print(f"Task complete! Refined and translated file saved to: {refined_output_path}")
def main():
    """Convert, tidy and translate every PDF found in the ``input`` directory.

    For each PDF, three steps run in order: conversion to Markdown in the
    ``output`` directory, simplification of the image references, and LLM
    refinement/translation. PDFs are processed in sorted name order and the
    extension is matched case-insensitively.

    Exits with status 1 when the ``input`` directory is missing or contains
    no PDF files.
    """
    input_dir = "input"
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    try:
        entries = os.listdir(input_dir)
    except FileNotFoundError:
        # A missing input directory is reported the same way as an empty one.
        entries = []

    pdf_files = sorted(
        name for name in entries if os.path.splitext(name)[1].lower() == ".pdf"
    )
    if not pdf_files:
        print(f"Error: No PDF files found in the '{input_dir}' directory.")
        sys.exit(1)

    for file_name in pdf_files:
        print(f"\nProcessing file: {file_name}")
        input_doc_path = os.path.join(input_dir, file_name)
        # Swap only the final extension; ".pdf" elsewhere in the name stays.
        md_name = os.path.splitext(file_name)[0] + ".md"
        output_md_path = os.path.join(output_dir, md_name)

        # Step 1: Convert PDF to Markdown
        convert_pdf_to_markdown(input_doc_path, output_md_path)
        # Step 2: Simplify image references
        simplify_image_references_in_markdown(output_md_path)
        # Step 3: Refine and translate the content
        refine_and_translate_content(output_md_path, input_doc_path)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()