更新pdf_processor,适用mineru
This commit is contained in:
parent
d3e1927bc5
commit
88b790dd6b
|
|
@ -24,18 +24,36 @@ Add the following settings to your environment or `.env` file:
|
||||||
# Mineru API Configuration
|
# Mineru API Configuration
|
||||||
MINERU_API_URL=http://mineru-api:8000
|
MINERU_API_URL=http://mineru-api:8000
|
||||||
MINERU_TIMEOUT=300
|
MINERU_TIMEOUT=300
|
||||||
|
MINERU_LANG_LIST=["ch"]
|
||||||
|
MINERU_BACKEND=pipeline
|
||||||
|
MINERU_PARSE_METHOD=auto
|
||||||
|
MINERU_FORMULA_ENABLE=true
|
||||||
|
MINERU_TABLE_ENABLE=true
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. API Endpoint
|
### 4. API Endpoint
|
||||||
|
|
||||||
The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content.
|
The processor expects Mineru to provide a REST API endpoint at `/file_parse` that accepts PDF files via multipart form data (form field `files`), together with the parsing options listed below, and returns JSON containing the extracted Markdown content.
|
||||||
|
|
||||||
#### Expected Request Format:
|
#### Expected Request Format:
|
||||||
```
|
```
|
||||||
POST /extract
|
POST /file_parse
|
||||||
Content-Type: multipart/form-data
|
Content-Type: multipart/form-data
|
||||||
|
|
||||||
file: [PDF file]
|
files: [PDF file]
|
||||||
|
output_dir: ./output
|
||||||
|
lang_list: ["ch"]
|
||||||
|
backend: pipeline
|
||||||
|
parse_method: auto
|
||||||
|
formula_enable: true
|
||||||
|
table_enable: true
|
||||||
|
return_md: true
|
||||||
|
return_middle_json: false
|
||||||
|
return_model_output: false
|
||||||
|
return_content_list: false
|
||||||
|
return_images: false
|
||||||
|
start_page_id: 0
|
||||||
|
end_page_id: 99999
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Expected Response Format:
|
#### Expected Response Format:
|
||||||
|
|
@ -49,6 +67,14 @@ The processor can handle multiple response formats:
|
||||||
|
|
||||||
OR
|
OR
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"md": "# Document Title\n\nContent here..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
OR
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"content": "# Document Title\n\nContent here..."
|
"content": "# Document Title\n\nContent here..."
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,15 @@ class Settings(BaseSettings):
|
||||||
OLLAMA_MODEL: str = "llama2"
|
OLLAMA_MODEL: str = "llama2"
|
||||||
|
|
||||||
# Mineru API settings
|
# Mineru API settings
|
||||||
MINERU_API_URL: str = "http://mineru-api:8000"
|
# MINERU_API_URL: str = "http://mineru-api:8001"
|
||||||
|
MINERU_API_URL: str = "http://host.docker.internal:8001"
|
||||||
|
|
||||||
MINERU_TIMEOUT: int = 300 # 5 minutes timeout
|
MINERU_TIMEOUT: int = 300 # 5 minutes timeout
|
||||||
|
MINERU_LANG_LIST: list = ["ch"] # Language list for parsing
|
||||||
|
MINERU_BACKEND: str = "pipeline" # Backend to use
|
||||||
|
MINERU_PARSE_METHOD: str = "auto" # Parse method
|
||||||
|
MINERU_FORMULA_ENABLE: bool = True # Enable formula parsing
|
||||||
|
MINERU_TABLE_ENABLE: bool = True # Enable table parsing
|
||||||
|
|
||||||
# Logging settings
|
# Logging settings
|
||||||
LOG_LEVEL: str = "INFO"
|
LOG_LEVEL: str = "INFO"
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,11 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
# Mineru API configuration
|
# Mineru API configuration
|
||||||
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
|
self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
|
||||||
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
|
self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout
|
||||||
|
self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
|
||||||
|
self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
|
||||||
|
self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
|
||||||
|
self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
|
||||||
|
self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
|
||||||
|
|
||||||
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -41,15 +46,33 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
API response as dictionary or None if failed
|
API response as dictionary or None if failed
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
url = f"{self.mineru_base_url}/extract"
|
url = f"{self.mineru_base_url}/file_parse"
|
||||||
|
|
||||||
with open(file_path, 'rb') as file:
|
with open(file_path, 'rb') as file:
|
||||||
files = {'file': (os.path.basename(file_path), file, 'application/pdf')}
|
files = {'files': (os.path.basename(file_path), file, 'application/pdf')}
|
||||||
|
|
||||||
|
# Prepare form data according to Mineru API specification
|
||||||
|
data = {
|
||||||
|
'output_dir': './output',
|
||||||
|
'lang_list': self.mineru_lang_list,
|
||||||
|
'backend': self.mineru_backend,
|
||||||
|
'parse_method': self.mineru_parse_method,
|
||||||
|
'formula_enable': self.mineru_formula_enable,
|
||||||
|
'table_enable': self.mineru_table_enable,
|
||||||
|
'return_md': True,
|
||||||
|
'return_middle_json': False,
|
||||||
|
'return_model_output': False,
|
||||||
|
'return_content_list': False,
|
||||||
|
'return_images': False,
|
||||||
|
'start_page_id': 0,
|
||||||
|
'end_page_id': 99999
|
||||||
|
}
|
||||||
|
|
||||||
logger.info(f"Calling Mineru API at {url}")
|
logger.info(f"Calling Mineru API at {url}")
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url,
|
url,
|
||||||
files=files,
|
files=files,
|
||||||
|
data=data,
|
||||||
timeout=self.mineru_timeout
|
timeout=self.mineru_timeout
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -82,9 +105,13 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
Extracted markdown content as string
|
Extracted markdown content as string
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Try different possible response formats
|
logger.debug(f"Mineru API response structure: {response}")
|
||||||
|
|
||||||
|
# Try different possible response formats based on Mineru API
|
||||||
if 'markdown' in response:
|
if 'markdown' in response:
|
||||||
return response['markdown']
|
return response['markdown']
|
||||||
|
elif 'md' in response:
|
||||||
|
return response['md']
|
||||||
elif 'content' in response:
|
elif 'content' in response:
|
||||||
return response['content']
|
return response['content']
|
||||||
elif 'text' in response:
|
elif 'text' in response:
|
||||||
|
|
@ -93,14 +120,32 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
result = response['result']
|
result = response['result']
|
||||||
if 'markdown' in result:
|
if 'markdown' in result:
|
||||||
return result['markdown']
|
return result['markdown']
|
||||||
|
elif 'md' in result:
|
||||||
|
return result['md']
|
||||||
elif 'content' in result:
|
elif 'content' in result:
|
||||||
return result['content']
|
return result['content']
|
||||||
elif 'text' in result:
|
elif 'text' in result:
|
||||||
return result['text']
|
return result['text']
|
||||||
|
elif 'data' in response and isinstance(response['data'], dict):
|
||||||
|
data = response['data']
|
||||||
|
if 'markdown' in data:
|
||||||
|
return data['markdown']
|
||||||
|
elif 'md' in data:
|
||||||
|
return data['md']
|
||||||
|
elif 'content' in data:
|
||||||
|
return data['content']
|
||||||
|
elif 'text' in data:
|
||||||
|
return data['text']
|
||||||
|
elif isinstance(response, list) and len(response) > 0:
|
||||||
|
# If response is a list, try to extract from first item
|
||||||
|
first_item = response[0]
|
||||||
|
if isinstance(first_item, dict):
|
||||||
|
return self._extract_markdown_from_response(first_item)
|
||||||
|
elif isinstance(first_item, str):
|
||||||
|
return first_item
|
||||||
else:
|
else:
|
||||||
# If no standard format found, try to extract from the response structure
|
# If no standard format found, try to extract from the response structure
|
||||||
logger.warning("Could not find standard markdown field in Mineru response")
|
logger.warning("Could not find standard markdown field in Mineru response")
|
||||||
logger.debug(f"Mineru response structure: {response}")
|
|
||||||
|
|
||||||
# Return the response as string if it's simple, or empty string
|
# Return the response as string if it's simple, or empty string
|
||||||
if isinstance(response, str):
|
if isinstance(response, str):
|
||||||
|
|
@ -110,6 +155,11 @@ class PdfDocumentProcessor(DocumentProcessor):
|
||||||
for key, value in response.items():
|
for key, value in response.items():
|
||||||
if isinstance(value, str) and len(value) > 100: # Likely content
|
if isinstance(value, str) and len(value) > 100: # Likely content
|
||||||
return value
|
return value
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
# Recursively search in nested dictionaries
|
||||||
|
nested_content = self._extract_markdown_from_response(value)
|
||||||
|
if nested_content:
|
||||||
|
return nested_content
|
||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ services:
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
platform: linux/arm64
|
platform: linux/arm64
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8001:8000"
|
||||||
volumes:
|
volumes:
|
||||||
- ./storage/uploads:/app/storage/uploads
|
- ./storage/uploads:/app/storage/uploads
|
||||||
- ./storage/processed:/app/storage/processed
|
- ./storage/processed:/app/storage/processed
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue