更新pdf_processor,适用mineru

This commit is contained in:
oliviamn 2025-07-15 00:29:34 +08:00
parent d3e1927bc5
commit 88b790dd6b
4 changed files with 92 additions and 9 deletions

View File

@ -24,18 +24,36 @@ Add the following settings to your environment or `.env` file:
# Mineru API Configuration # Mineru API Configuration
MINERU_API_URL=http://mineru-api:8000 MINERU_API_URL=http://mineru-api:8000
MINERU_TIMEOUT=300 MINERU_TIMEOUT=300
MINERU_LANG_LIST=["ch"]
MINERU_BACKEND=pipeline
MINERU_PARSE_METHOD=auto
MINERU_FORMULA_ENABLE=true
MINERU_TABLE_ENABLE=true
``` ```
### 4. API Endpoint ### 4. API Endpoint
The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content. The processor expects Mineru to provide a REST API endpoint at `/file_parse` that accepts PDF files via multipart form data and returns JSON with markdown content.
#### Expected Request Format: #### Expected Request Format:
``` ```
POST /extract POST /file_parse
Content-Type: multipart/form-data Content-Type: multipart/form-data
file: [PDF file] files: [PDF file]
output_dir: ./output
lang_list: ["ch"]
backend: pipeline
parse_method: auto
formula_enable: true
table_enable: true
return_md: true
return_middle_json: false
return_model_output: false
return_content_list: false
return_images: false
start_page_id: 0
end_page_id: 99999
``` ```
#### Expected Response Format: #### Expected Response Format:
@ -49,6 +67,14 @@ The processor can handle multiple response formats:
OR OR
```json
{
"md": "# Document Title\n\nContent here..."
}
```
OR
```json ```json
{ {
"content": "# Document Title\n\nContent here..." "content": "# Document Title\n\nContent here..."

View File

@ -32,8 +32,15 @@ class Settings(BaseSettings):
OLLAMA_MODEL: str = "llama2" OLLAMA_MODEL: str = "llama2"
# Mineru API settings # Mineru API settings
MINERU_API_URL: str = "http://mineru-api:8000" # MINERU_API_URL: str = "http://mineru-api:8001"
MINERU_API_URL: str = "http://host.docker.internal:8001"
MINERU_TIMEOUT: int = 300 # 5-minute timeout MINERU_TIMEOUT: int = 300 # 5-minute timeout
MINERU_LANG_LIST: list = ["ch"] # Language list for parsing
MINERU_BACKEND: str = "pipeline" # Backend to use
MINERU_PARSE_METHOD: str = "auto" # Parse method
MINERU_FORMULA_ENABLE: bool = True # Enable formula parsing
MINERU_TABLE_ENABLE: bool = True # Enable table parsing
# Logging settings # Logging settings
LOG_LEVEL: str = "INFO" LOG_LEVEL: str = "INFO"

View File

@ -29,6 +29,11 @@ class PdfDocumentProcessor(DocumentProcessor):
# Mineru API configuration # Mineru API configuration
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000') self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]: def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
""" """
@ -41,15 +46,33 @@ class PdfDocumentProcessor(DocumentProcessor):
API response as dictionary or None if failed API response as dictionary or None if failed
""" """
try: try:
url = f"{self.mineru_base_url}/extract" url = f"{self.mineru_base_url}/file_parse"
with open(file_path, 'rb') as file: with open(file_path, 'rb') as file:
files = {'file': (os.path.basename(file_path), file, 'application/pdf')} files = {'files': (os.path.basename(file_path), file, 'application/pdf')}
# Prepare form data according to Mineru API specification
data = {
'output_dir': './output',
'lang_list': self.mineru_lang_list,
'backend': self.mineru_backend,
'parse_method': self.mineru_parse_method,
'formula_enable': self.mineru_formula_enable,
'table_enable': self.mineru_table_enable,
'return_md': True,
'return_middle_json': False,
'return_model_output': False,
'return_content_list': False,
'return_images': False,
'start_page_id': 0,
'end_page_id': 99999
}
logger.info(f"Calling Mineru API at {url}") logger.info(f"Calling Mineru API at {url}")
response = requests.post( response = requests.post(
url, url,
files=files, files=files,
data=data,
timeout=self.mineru_timeout timeout=self.mineru_timeout
) )
@ -82,9 +105,13 @@ class PdfDocumentProcessor(DocumentProcessor):
Extracted markdown content as string Extracted markdown content as string
""" """
try: try:
# Try different possible response formats logger.debug(f"Mineru API response structure: {response}")
# Try different possible response formats based on Mineru API
if 'markdown' in response: if 'markdown' in response:
return response['markdown'] return response['markdown']
elif 'md' in response:
return response['md']
elif 'content' in response: elif 'content' in response:
return response['content'] return response['content']
elif 'text' in response: elif 'text' in response:
@ -93,14 +120,32 @@ class PdfDocumentProcessor(DocumentProcessor):
result = response['result'] result = response['result']
if 'markdown' in result: if 'markdown' in result:
return result['markdown'] return result['markdown']
elif 'md' in result:
return result['md']
elif 'content' in result: elif 'content' in result:
return result['content'] return result['content']
elif 'text' in result: elif 'text' in result:
return result['text'] return result['text']
elif 'data' in response and isinstance(response['data'], dict):
data = response['data']
if 'markdown' in data:
return data['markdown']
elif 'md' in data:
return data['md']
elif 'content' in data:
return data['content']
elif 'text' in data:
return data['text']
elif isinstance(response, list) and len(response) > 0:
# If the response is a list, try to extract from its first item
first_item = response[0]
if isinstance(first_item, dict):
return self._extract_markdown_from_response(first_item)
elif isinstance(first_item, str):
return first_item
else: else:
# If no standard format is found, try to extract from the response structure # If no standard format is found, try to extract from the response structure
logger.warning("Could not find standard markdown field in Mineru response") logger.warning("Could not find standard markdown field in Mineru response")
logger.debug(f"Mineru response structure: {response}")
# Return the response as string if it's simple, or empty string # Return the response as string if it's simple, or empty string
if isinstance(response, str): if isinstance(response, str):
@ -110,6 +155,11 @@ class PdfDocumentProcessor(DocumentProcessor):
for key, value in response.items(): for key, value in response.items():
if isinstance(value, str) and len(value) > 100: # Likely content if isinstance(value, str) and len(value) > 100: # Likely content
return value return value
elif isinstance(value, dict):
# Recursively search in nested dictionaries
nested_content = self._extract_markdown_from_response(value)
if nested_content:
return nested_content
return "" return ""

View File

@ -7,7 +7,7 @@ services:
dockerfile: Dockerfile dockerfile: Dockerfile
platform: linux/arm64 platform: linux/arm64
ports: ports:
- "8000:8000" - "8001:8000"
volumes: volumes:
- ./storage/uploads:/app/storage/uploads - ./storage/uploads:/app/storage/uploads
- ./storage/processed:/app/storage/processed - ./storage/processed:/app/storage/processed