feat(llm-integration): Enhance prompt clarity and unify PDF attachment

This commit improves the structure and clarity of the prompt sent to the LLM (Gemini/OpenAI) in the `refine_content` function.

Changes include:

* Adding explicit introductory text for the Markdown, individual images, and PDF sections to guide the LLM on the purpose of each input.
* Introducing clear "START OF IMAGE" and "END OF IMAGE" delimiters for each image to better define their boundaries.
* Unifying the PDF attachment mechanism for both Gemini and OpenAI providers, simplifying the code and ensuring consistent handling of PDF input.

These changes aim to improve the LLM's understanding of the provided content, leading to more accurate and relevant refinements.
2025-11-12 19:14:19 +11:00
parent 0e4a609c93
commit f1214be148
1 changed file with 32 additions and 20 deletions
--- a/pdf_convertor.py
+++ b/pdf_convertor.py
@@ -153,6 +153,14 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:

    # 添加 Markdown
    human_message_parts = []
+
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "I will provide you with: \n1. Original Markdown text (with image references)\n2. Individual images with their exact filenames\n3. Original PDF for reference\n---\nOriginal Markdown text:\n",
+        }
+    )
+
    human_message_parts.append(
        {
            "type": "text",
@@ -161,11 +169,17 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
    )

    # 添加图片
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "\n\n---\n\nIndividual images (use ONLY these to describe the corresponding Markdown image references):\n\n",
+        }
+    )
    for image_name in images.keys():
        human_message_parts.append(
            {
                "type": "text",
-                "text": f"This is image: '{image_name}':\n",
+                "text": f"=== START OF IMAGE: '{image_name}' ===\n",
            }
        )
        human_message_parts.append(
@@ -175,31 +189,29 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
                "mime_type": "image/png",
            }
        )
+        human_message_parts.append(
+            {
+                "type": "text",
+                "text": f"=== END OF IMAGE: '{image_name}' ===\n\n",
+            }
+        )

    # 添加 PDF
-    if provider == "gemini":
-        human_message_parts.extend(
-            [
-                {
-                    "type": "text",
-                    "text": "This is original PDF file:\n",
-                },
-                {
-                    "type": "media",
-                    "mime_type": "application/pdf",
-                    "data": base64.b64encode(pdf).decode("utf-8"),
-                },
-            ]
-        )
-    if provider == "openai":
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "\n---\n\nOriginal PDF (for overall layout and context reference only):\n\n",
+        }
+    )
+
+    if provider == "gemini" or provider == "openai":
        human_message_parts.extend(
            [
                {
                    "type": "file",
-                    "file": {
-                        "filename": "origin.pdf",
-                        "file_data": f"data:application/pdf;base64,{base64.b64encode(pdf).decode('utf-8')}",
-                    },
+                    "base64": base64.b64encode(pdf).decode("utf-8"),
+                    "mime_type": "application/pdf",
+                    "filename": "origin.pdf",
                },
            ]
        )