更新pdf_processor,适用mineru

This commit is contained in:
oliviamn 2025-07-15 00:29:34 +08:00
parent d3e1927bc5
commit 88b790dd6b
4 changed files with 92 additions and 9 deletions

View File

@ -24,18 +24,36 @@ Add the following settings to your environment or `.env` file:
# Mineru API Configuration
MINERU_API_URL=http://mineru-api:8000
MINERU_TIMEOUT=300
MINERU_LANG_LIST=["ch"]
MINERU_BACKEND=pipeline
MINERU_PARSE_METHOD=auto
MINERU_FORMULA_ENABLE=true
MINERU_TABLE_ENABLE=true
```
### 4. API Endpoint
The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content.
The processor expects Mineru to expose a REST API endpoint at `/file_parse` that accepts one or more PDF files as multipart form data (form field `files`) together with the parsing options listed below, and returns JSON containing the extracted markdown content.
#### Expected Request Format:
```
POST /extract
POST /file_parse
Content-Type: multipart/form-data
file: [PDF file]
files: [PDF file]
output_dir: ./output
lang_list: ["ch"]
backend: pipeline
parse_method: auto
formula_enable: true
table_enable: true
return_md: true
return_middle_json: false
return_model_output: false
return_content_list: false
return_images: false
start_page_id: 0
end_page_id: 99999
```
#### Expected Response Format:
@ -49,6 +67,14 @@ The processor can handle multiple response formats:
OR
```json
{
"md": "# Document Title\n\nContent here..."
}
```
OR
```json
{
"content": "# Document Title\n\nContent here..."

View File

@ -32,8 +32,15 @@ class Settings(BaseSettings):
OLLAMA_MODEL: str = "llama2"
# Mineru API settings
MINERU_API_URL: str = "http://mineru-api:8000"
# MINERU_API_URL: str = "http://mineru-api:8001"
MINERU_API_URL: str = "http://host.docker.internal:8001"
MINERU_TIMEOUT: int = 300 # 5 minutes timeout
MINERU_LANG_LIST: list = ["ch"] # Language list for parsing
MINERU_BACKEND: str = "pipeline" # Backend to use
MINERU_PARSE_METHOD: str = "auto" # Parse method
MINERU_FORMULA_ENABLE: bool = True # Enable formula parsing
MINERU_TABLE_ENABLE: bool = True # Enable table parsing
# Logging settings
LOG_LEVEL: str = "INFO"

View File

@ -29,6 +29,11 @@ class PdfDocumentProcessor(DocumentProcessor):
# Mineru API configuration
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
@ -41,15 +46,33 @@ class PdfDocumentProcessor(DocumentProcessor):
API response as dictionary or None if failed
"""
try:
url = f"{self.mineru_base_url}/extract"
url = f"{self.mineru_base_url}/file_parse"
with open(file_path, 'rb') as file:
files = {'file': (os.path.basename(file_path), file, 'application/pdf')}
files = {'files': (os.path.basename(file_path), file, 'application/pdf')}
# Prepare form data according to Mineru API specification
data = {
'output_dir': './output',
'lang_list': self.mineru_lang_list,
'backend': self.mineru_backend,
'parse_method': self.mineru_parse_method,
'formula_enable': self.mineru_formula_enable,
'table_enable': self.mineru_table_enable,
'return_md': True,
'return_middle_json': False,
'return_model_output': False,
'return_content_list': False,
'return_images': False,
'start_page_id': 0,
'end_page_id': 99999
}
logger.info(f"Calling Mineru API at {url}")
response = requests.post(
url,
files=files,
data=data,
timeout=self.mineru_timeout
)
@ -82,9 +105,13 @@ class PdfDocumentProcessor(DocumentProcessor):
Extracted markdown content as string
"""
try:
# Try different possible response formats
logger.debug(f"Mineru API response structure: {response}")
# Try different possible response formats based on Mineru API
if 'markdown' in response:
return response['markdown']
elif 'md' in response:
return response['md']
elif 'content' in response:
return response['content']
elif 'text' in response:
@ -93,14 +120,32 @@ class PdfDocumentProcessor(DocumentProcessor):
result = response['result']
if 'markdown' in result:
return result['markdown']
elif 'md' in result:
return result['md']
elif 'content' in result:
return result['content']
elif 'text' in result:
return result['text']
elif 'data' in response and isinstance(response['data'], dict):
data = response['data']
if 'markdown' in data:
return data['markdown']
elif 'md' in data:
return data['md']
elif 'content' in data:
return data['content']
elif 'text' in data:
return data['text']
elif isinstance(response, list) and len(response) > 0:
# If response is a list, try to extract from first item
first_item = response[0]
if isinstance(first_item, dict):
return self._extract_markdown_from_response(first_item)
elif isinstance(first_item, str):
return first_item
else:
# If no standard format found, try to extract from the response structure
logger.warning("Could not find standard markdown field in Mineru response")
logger.debug(f"Mineru response structure: {response}")
# Return the response as string if it's simple, or empty string
if isinstance(response, str):
@ -110,6 +155,11 @@ class PdfDocumentProcessor(DocumentProcessor):
for key, value in response.items():
if isinstance(value, str) and len(value) > 100: # Likely content
return value
elif isinstance(value, dict):
# Recursively search in nested dictionaries
nested_content = self._extract_markdown_from_response(value)
if nested_content:
return nested_content
return ""

View File

@ -7,7 +7,7 @@ services:
dockerfile: Dockerfile
platform: linux/arm64
ports:
- "8000:8000"
- "8001:8000"
volumes:
- ./storage/uploads:/app/storage/uploads
- ./storage/processed:/app/storage/processed