feat: Introduce OpenAI LLM provider and update API key handling

This commit integrates OpenAI as a new Large Language Model (LLM) provider, expanding the available options for content refinement. Key changes include:
- Added `set_openai_api_key` to handle OpenAI API key retrieval from `config.ini` or environment variables.
- Modified `set_api_key` to dynamically read the LLM provider from `config.ini`, so callers no longer pass the provider as an argument.
2025-11-12 02:51:18 +11:00
parent ae7c579580
commit 1a867844ce
6 changed files with 446 additions and 77 deletions
--- a/pdf_convertor.py
+++ b/pdf_convertor.py
@@ -13,6 +13,7 @@ from docling_core.types.doc.base import ImageRefMode
 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_ollama import ChatOllama
+from langchain_openai import ChatOpenAI
 from llm import set_api_key, get_model_name, get_temperature
 from io import BytesIO
 from pathlib import Path
@@ -120,7 +121,7 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
    config.read("config.ini")
    provider = config.get("llm", "PROVIDER", fallback="gemini")

-    set_api_key(provider)
+    set_api_key()

    try:
        if provider == "gemini":
@@ -135,6 +136,11 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
                num_ctx=256000,
                num_predict=-1,
            )
+        elif provider == "openai":
+            llm = ChatOpenAI(
+                model=get_model_name(),
+                temperature=get_temperature(),
+            )
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")
    except Exception as e:
@@ -147,21 +153,12 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:

    # 添加 Markdown
    human_message_parts = []
-    if provider == "gemini":
-        human_message_parts.append(
-            {
-                "type": "media",
-                "mime_type": "text/markdown",
-                "data": base64.b64encode(md.encode("UTF-8")).decode("utf-8"),
-            }
-        )
-    elif provider == "ollama":
-        human_message_parts.append(
-            {
-                "type": "text",
-                "text": md,
-            }
-        )
+    human_message_parts.append(
+        {
+            "type": "text",
+            "text": md,
+        }
+    )

    # 添加图片
    for image_name in images.keys():
@@ -171,21 +168,14 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
                "text": f"This is image: '{image_name}':\n",
            }
        )
-        if provider == "gemini":
-            human_message_parts.append(
-                {
-                    "type": "media",
-                    "mime_type": "image/png",
-                    "data": base64.b64encode(images[image_name]).decode("utf-8"),
-                }
-            )
-        if provider == "ollama":
-            human_message_parts.append(
-                {
-                    "type": "image_url",
-                    "image_url": f"data:image/png;base64,{base64.b64encode(images[image_name]).decode('utf-8')}",
-                }
-            )
+        human_message_parts.append(
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/png;base64,{base64.b64encode(images[image_name]).decode('utf-8')}"
+                },
+            }
+        )

    # 添加 PDF
    if provider == "gemini":
@@ -202,6 +192,18 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
                },
            ]
        )
+    if provider == "openai":
+        human_message_parts.extend(
+            [
+                {
+                    "type": "file",
+                    "file": {
+                        "filename": "origin.pdf",
+                        "file_data": f"data:application/pdf;base64,{base64.b64encode(pdf).decode('utf-8')}",
+                    },
+                },
+            ]
+        )
    if provider == "ollama":
        doc = fitz.open(stream=pdf, filetype="pdf")
        for page_num in range(doc.page_count):