From f1214be148791ef00e90872fdd6f9a32510dcbe3 Mon Sep 17 00:00:00 2001
From: nite <admin@nite07.com>
Date: Wed, 12 Nov 2025 19:14:19 +1100
Subject: [PATCH] feat(llm-integration): Enhance prompt clarity and unify PDF
 attachment

This commit improves the structure and clarity of the prompt sent to the LLM (Gemini/OpenAI) in the `refine_content` function.

Changes include:
*   Adding explicit introductory text for the Markdown, individual images, and PDF sections to guide the LLM on the purpose of each input.
*   Introducing clear "START OF IMAGE" and "END OF IMAGE" delimiters for each image to better define their boundaries.
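*   Unifying the PDF attachment mechanism for both Gemini and OpenAI providers: both now receive the PDF as a single `file` content block (`base64`, `mime_type`, `filename`) instead of provider-specific payloads, simplifying the code and ensuring consistent handling of PDF input.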

These changes aim to improve the LLM's understanding of the provided content, leading to more accurate and relevant refinements.
---
 pdf_convertor.py | 52 +++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/pdf_convertor.py b/pdf_convertor.py
index a700d44..b0b77bf 100755
--- a/pdf_convertor.py
+++ b/pdf_convertor.py
@@ -153,6 +153,14 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
 
     # 添加 Markdown
     human_message_parts = []
+
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "I will provide you with: \n1. Original Markdown text (with image references)\n2. Individual images with their exact filenames\n3. Original PDF for reference\n---\nOriginal Markdown text:\n",
+        }
+    )
+
     human_message_parts.append(
         {
             "type": "text",
@@ -161,11 +169,17 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
     )
 
     # 添加图片
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "\n\n---\n\nIndividual images (use ONLY these to describe the corresponding Markdown image references):\n\n",
+        }
+    )
     for image_name in images.keys():
         human_message_parts.append(
             {
                 "type": "text",
-                "text": f"This is image: '{image_name}':\n",
+                "text": f"=== START OF IMAGE: '{image_name}' ===\n",
             }
         )
         human_message_parts.append(
@@ -175,31 +189,29 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
                 "mime_type": "image/png",
             }
         )
+        human_message_parts.append(
+            {
+                "type": "text",
+                "text": f"=== END OF IMAGE: '{image_name}' ===\n\n",
+            }
+        )
 
     # 添加 PDF
-    if provider == "gemini":
-        human_message_parts.extend(
-            [
-                {
-                    "type": "text",
-                    "text": "This is original PDF file:\n",
-                },
-                {
-                    "type": "media",
-                    "mime_type": "application/pdf",
-                    "data": base64.b64encode(pdf).decode("utf-8"),
-                },
-            ]
-        )
-    if provider == "openai":
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": "\n---\n\nOriginal PDF (for overall layout and context reference only):\n\n",
+        }
+    )
+
+    if provider == "gemini" or provider == "openai":
         human_message_parts.extend(
             [
                 {
                     "type": "file",
-                    "file": {
-                        "filename": "origin.pdf",
-                        "file_data": f"data:application/pdf;base64,{base64.b64encode(pdf).decode('utf-8')}",
-                    },
+                    "base64": base64.b64encode(pdf).decode("utf-8"),
+                    "mime_type": "application/pdf",
+                    "filename": "origin.pdf",
                 },
             ]
         )