feat: Improve content refinement with SystemMessage and prompt updates
This commit refactors the content refinement process to use `SystemMessage` for the primary prompt, improving clarity and adherence to LLM best practices.

The `pdf_convertor.py` file was updated to:
- Import `SystemMessage` from `langchain_core.messages`.
- Modify the `refine_content` function to use `SystemMessage` for the main prompt, moving the prompt content out of `human_message_parts`.
- Adjust `human_message_parts` to contain only the Markdown and image data for the `HumanMessage`.

The `pdf_convertor_prompt.md` file was updated to:
- Reformat the prompt with clearer headings and instructions for each task.
- Improve the clarity and conciseness of the instructions for cleaning up characters, explaining image content, and correcting list formatting.

Additionally, `.gitignore` was updated to include `.vscode/` to prevent IDE-specific files from being committed.

These changes improve the structure of the LLM interaction and make the prompt more readable and maintainable.
This commit is contained in:
@@ -10,7 +10,7 @@ from docling_core.types.io import DocumentStream
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc.base import ImageRefMode
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from langchain_google_genai import ChatGoogleGenerativeAI
|
||||
from langchain_ollama import ChatOllama
|
||||
from llm import set_api_key, get_model_name, get_temperature
|
||||
@@ -146,7 +146,7 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
prompt = f.read()
|
||||
|
||||
# 添加 Markdown
|
||||
human_message_parts = [{"type": "text", "text": prompt}]
|
||||
human_message_parts = []
|
||||
if provider == "gemini":
|
||||
human_message_parts.append(
|
||||
{
|
||||
@@ -223,6 +223,7 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
doc.close()
|
||||
|
||||
message_content = [
|
||||
SystemMessage(content=prompt),
|
||||
HumanMessage(content=human_message_parts), # type: ignore
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user