u
This commit is contained in:
@@ -115,26 +115,14 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
set_gemini_api_key()
|
||||
|
||||
try:
|
||||
llm = ChatGoogleGenerativeAI(model=get_model_name(), temperature=0)
|
||||
llm = ChatGoogleGenerativeAI(model=get_model_name(), temperature=0.7)
|
||||
except Exception as e:
|
||||
raise BaseException(
|
||||
f"Error initializing LLM. Make sure your Google API key is set correctly. Error: {e}"
|
||||
)
|
||||
|
||||
prompt = """
|
||||
You are a professional technical document editor. Your task is to polish a Markdown text automatically converted from an accompanying PDF document. Please use the original PDF as the source of truth for layout, images, and context.
|
||||
|
||||
Please perform the following operations based on the provided Markdown and PDF:
|
||||
|
||||
1. **Clean up extraneous characters**: Review the Markdown text and remove any conversion artifacts or strange formatting that do not exist in the original PDF.
|
||||
2. **Explain image content**: Refer to charts, diagrams, and images in the PDF, and add descriptions after image citations so that complete information can be obtained through text descriptions even without the images.
|
||||
3. **Correct list formatting**: The conversion may have flattened nested lists. Analyze the list structure in the PDF and restore the correct multi-level indentation in the Markdown.
|
||||
4. **Correct mathematical formulas and symbols**: Convert plain text formulas into proper formula notation, for example, `Kmin` should be `$K_{min}`, and `E = hc/λ` should be `$E = \\frac{hc}{\\lambda}`.
|
||||
5. **Adjust headings**: Rename headings that have the same name according to the different content within the subsections to avoid duplicate same-name headings and ensure the outline is clear.
|
||||
6. **Translate**: Translate the content into Simplified Chinese. Proper nouns should retain their original expression during translation, for example, `Magnetic resonance imaging` should be translated as `磁共振成像(Magnetic resonance imaging, MRI)`. If the term appears multiple times, the original expression should be appended each time.
|
||||
|
||||
Only output the adjusted Markdown text, without any other text content.
|
||||
"""
|
||||
with open("pdf_convertor_prompt.md", "r") as f:
|
||||
prompt = f.read()
|
||||
|
||||
human_message_parts = [
|
||||
{
|
||||
@@ -187,4 +175,19 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
except Exception as e:
|
||||
raise BaseException(f"An error occurred while invoking the LLM: {e}")
|
||||
|
||||
return str(refined_content)
|
||||
if str(refined_content) == "":
|
||||
raise BaseException("Response of Gemini is empty")
|
||||
|
||||
return fix_output(str(refined_content))
|
||||
|
||||
|
||||
def fix_output(md: str) -> str:
    """Flatten text that looks like the ``str()`` of a list of strings.

    The caller passes ``str(refined_content)``; presumably the model response
    is sometimes a list of text parts rather than a single string (confirm
    against the LLM client), in which case the text arrives as
    ``"['part one', 'part two']"`` instead of plain Markdown.

    Args:
        md: The raw text to normalise.

    Returns:
        ``md`` unchanged when it is not wrapped like a list literal.
        Otherwise the list elements joined by a blank line, with escape
        sequences decoded.
    """
    # Local import: the file's import block is not visible from this block.
    import ast

    # repr() wraps an element in double quotes when it contains an
    # apostrophe, so both quote styles must be accepted at either end.
    if not (md.startswith(("['", '["')) and md.endswith(("']", '"]'))):
        return md

    # Preferred path: parse the literal properly. This decodes "\\n", "\\'"
    # and "\\\\" correctly (so LaTeX such as "\\nu" survives) and never
    # touches separator-like text inside an element.
    try:
        parts = ast.literal_eval(md)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parts = None
    if isinstance(parts, list) and all(isinstance(p, str) for p in parts):
        return "\n\n".join(parts)

    # Fallback for text that is list-shaped but not a valid literal (for
    # example it contains raw newlines): the legacy textual heuristic, which
    # only ever handled the single-quoted wrapping.
    if not md.startswith("['") or not md.endswith("']"):
        return md
    md = md.removeprefix("['").removesuffix("']")
    md = md.replace("\\n", "\n")
    for separator in ("\", '", "', \"", "', '"):
        md = md.replace(separator, "\n\n")
    return md
|
||||
|
||||
Reference in New Issue
Block a user