u
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -10,4 +10,5 @@ wheels/
|
||||
.venv
|
||||
input/
|
||||
output/
|
||||
config.ini
|
||||
config.ini
|
||||
test.py
|
||||
5
llm.py
5
llm.py
@@ -21,3 +21,8 @@ def get_model_name() -> str:
|
||||
config = configparser.ConfigParser()
|
||||
config.read("config.ini")
|
||||
return config.get("llm", "MODEL_NAME", fallback="gemini-2.5-flash")
|
||||
|
||||
def get_temperature() -> float:
    """Return the sampling temperature configured for the LLM.

    Reads ``config.ini`` relative to the current working directory and
    looks up the ``TEMPERATURE`` option in the ``[llm]`` section.

    Returns:
        The configured temperature, or ``0.7`` when the file, the section
        or the option is missing.

    Raises:
        ValueError: If the configured value cannot be parsed as a float.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files that do not exist, so a
    # missing config.ini simply falls through to the fallback below.
    config.read("config.ini")
    # getfloat() performs the lookup and the float conversion in one step.
    return config.getfloat("llm", "TEMPERATURE", fallback=0.7)
|
||||
|
||||
6
main.py
6
main.py
@@ -29,7 +29,11 @@ def main():
|
||||
with open(pdf_path, "rb") as pdf_file:
|
||||
pdf_content = pdf_file.read()
|
||||
md, images = convert_pdf_to_markdown(pdf_content)
|
||||
md = refine_content(md, images, pdf_content)
|
||||
try:
|
||||
md = refine_content(md, images, pdf_content)
|
||||
except BaseException:
|
||||
continue
|
||||
|
||||
save_md_images(current_output_dir, md, images)
|
||||
|
||||
|
||||
|
||||
@@ -115,26 +115,14 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
set_gemini_api_key()
|
||||
|
||||
try:
|
||||
llm = ChatGoogleGenerativeAI(model=get_model_name(), temperature=0)
|
||||
llm = ChatGoogleGenerativeAI(model=get_model_name(), temperature=0.7)
|
||||
except Exception as e:
|
||||
raise BaseException(
|
||||
f"Error initializing LLM. Make sure your Google API key is set correctly. Error: {e}"
|
||||
)
|
||||
|
||||
prompt = """
|
||||
You are a professional technical document editor. Your task is to polish a Markdown text automatically converted from an accompanying PDF document. Please use the original PDF as the source of truth for layout, images, and context.
|
||||
|
||||
Please perform the following operations based on the provided Markdown and PDF:
|
||||
|
||||
1. **Clean up extraneous characters**: Review the Markdown text and remove any conversion artifacts or strange formatting that do not exist in the original PDF.
|
||||
2. **Explain image content**: Refer to charts, diagrams, and images in the PDF, and add descriptions after image citations so that complete information can be obtained through text descriptions even without the images.
|
||||
3. **Correct list formatting**: The conversion may have flattened nested lists. Analyze the list structure in the PDF and restore the correct multi-level indentation in the Markdown.
|
||||
4. **Correct mathematical formulas and symbols**: Convert plain text formulas into proper formula notation, for example, `Kmin` should be `$K_{min}`, and `E = hc/λ` should be `$E = \\frac{hc}{\\lambda}`.
|
||||
5. **Adjust headings**: Rename headings that have the same name according to the different content within the subsections to avoid duplicate same-name headings and ensure the outline is clear.
|
||||
6. **Translate**: Translate the content into Simplified Chinese. Proper nouns should retain their original expression during translation, for example, `Magnetic resonance imaging` should be translated as `磁共振成像(Magnetic resonance imaging, MRI)`. If the term appears multiple times, the original expression should be appended each time.
|
||||
|
||||
Only output the adjusted Markdown text, without any other text content.
|
||||
"""
|
||||
with open("pdf_convertor_prompt.md", "r") as f:
|
||||
prompt = f.read()
|
||||
|
||||
human_message_parts = [
|
||||
{
|
||||
@@ -187,4 +175,19 @@ def refine_content(md: str, images: dict[str, bytes], pdf: bytes) -> str:
|
||||
except Exception as e:
|
||||
raise BaseException(f"An error occurred while invoking the LLM: {e}")
|
||||
|
||||
return str(refined_content)
|
||||
if str(refined_content) == "":
|
||||
raise BaseException("Response of Gemini is empty")
|
||||
|
||||
return fix_output(str(refined_content))
|
||||
|
||||
|
||||
def fix_output(md: str) -> str:
    """Undo an accidental ``str(list_of_strings)`` wrapping of the Markdown.

    When the text looks like the ``repr`` of a Python list of strings
    (``['part one', 'part two']``), the parts are recovered and joined
    with a blank line between them. Any other text is returned unchanged.

    Args:
        md: The text to clean up.

    Returns:
        The unwrapped Markdown, or ``md`` itself when it does not look
        like a stringified list.
    """
    import ast

    # repr() switches to double quotes for an element that contains an
    # apostrophe, so both quote styles must be accepted at either end.
    if not (md.startswith(("['", '["')) and md.endswith(("']", '"]'))):
        return md

    # Parsing the literal decodes every escape sequence correctly
    # (\\n, \\', \\\\ ...) instead of guessing with string replacement,
    # which mangles text such as ``\\nabla`` or ``\\frac``.
    try:
        parsed = ast.literal_eval(md)
    except (ValueError, SyntaxError, RecursionError, MemoryError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(part, str) for part in parsed):
        return "\n\n".join(parsed)

    # Fallback for text that only resembles a list repr and cannot be
    # parsed: keep the previous best-effort textual clean-up.
    if not md.startswith("['") or not md.endswith("']"):
        return md
    md = md.removeprefix("['")
    md = md.removesuffix("']")
    md = md.replace("\\n", "\n")
    md = md.replace("\", '", "\n\n")
    md = md.replace("', \"", "\n\n")
    md = md.replace("', '", "\n\n")
    return md
|
||||
|
||||
69
pdf_convertor_prompt.md
Normal file
69
pdf_convertor_prompt.md
Normal file
@@ -0,0 +1,69 @@
|
||||
You are a professional technical document editor. Your task is to polish a Markdown text that has been automatically converted from a corresponding PDF document. Please use the original PDF as the sole reference for layout, images, and context.
|
||||
|
||||
Please perform the following operations based on the provided Markdown and PDF:
|
||||
|
||||
1. **Clean up extraneous characters**: Check the Markdown text and delete any conversion artifacts or strange formatting that does not exist in the original PDF.
|
||||
2. **Explain image content**: Refer to the charts, diagrams, and images in the PDF, and add a description after the image reference so that the full information can be obtained from the text description even without the image.
|
||||
|
||||
- Add a blank line after the image reference to control line breaks.
|
||||
|
||||
For example
|
||||
|
||||
```
|
||||

|
||||
|
||||
A detailed explanation of the image, detailed enough to replace the image and help the reader understand the content.
|
||||
```
|
||||
|
||||
3. **Correct list formatting**: The conversion process may flatten nested lists. Please analyze the list structure in the PDF and restore the correct multi-level indentation in Markdown.
|
||||
4. **Correct mathematical formulas and symbols**: Convert plain text formulas into correct formula notation, for example, `Kmin` should be `$K_{min}$`, and `E = hc/λ` should be `$E = \frac{hc}{\lambda}$`.
|
||||
5. **Adjust headings**: Based on the different content within sub-chapters, rename headings with the same name to avoid duplicate headings and ensure a clear outline.
|
||||
6. **Clean up redundant headings**: If there is no content between adjacent headings of the same level, the headings should be adjusted to conform to standards.
|
||||
|
||||
- For example, the following is improper formatting, where there is no content between multiple headings of the same level
|
||||
|
||||
```
|
||||
## Convolutional Neural Networks: Weight Sharing with Multiple Filters
|
||||
|
||||
## Weight sharing
|
||||
|
||||
Multiple filters can be applied to detect the spatial distributions of multiple visual patterns.
|
||||
|
||||

|
||||
|
||||
This diagram consists of two parts. The left part illustrates how multiple filters (represented by connections of different colors) are applied to an input image, with each filter detecting a different pattern. The right part shows how a single filter (hidden unit / filter response) is convolved over the input to produce a feature map.
|
||||
|
||||
## Convolutional Neural Networks: Weight Sharing and Translation Invariance
|
||||
|
||||
## Weight sharing
|
||||
|
||||
## Translation invariance:
|
||||
|
||||
* Captures statistics in local patches, and they are independent of location.
|
||||
```
|
||||
|
||||
It can be changed to
|
||||
|
||||
```
|
||||
## Convolutional Neural Networks
|
||||
|
||||
### Weight Sharing with Multiple Filters
|
||||
|
||||
Multiple filters can be applied to detect the spatial distributions of multiple visual patterns.
|
||||
|
||||

|
||||
|
||||
This diagram consists of two parts. The left part illustrates how multiple filters (represented by connections of different colors) are applied to an input image, with each filter detecting a different pattern. The right part shows how a single filter (hidden unit / filter response) is convolved over the input to produce a feature map.
|
||||
|
||||
## Convolutional Neural Networks
|
||||
|
||||
### Weight Sharing and Translation Invariance
|
||||
|
||||
#### Translation invariance:
|
||||
|
||||
* Captures statistics in local patches, and they are independent of location.
|
||||
```
|
||||
|
||||
7. **Translate**: Translate the content into Simplified Chinese. When translating, proper nouns should retain their original expression, for example, `Magnetic resonance imaging` should be translated as `磁共振成像(Magnetic resonance imaging, MRI)`. If the term appears multiple times, the original expression should be included each time.
|
||||
|
||||
Only output the adjusted Markdown text, without any other text content. Do not output in JSON format, and do not add ` ``` ` or ` ```markdown ` at the beginning or end of the Markdown.
|
||||
Reference in New Issue
Block a user