更新pdf_processor,适用mineru

2025-07-15 00:29:34 +08:00 · 2025-07-15 00:29:34 +08:00 · 88b790dd6b
parent d3e1927bc5
commit 88b790dd6b
4 changed files with 92 additions and 9 deletions
--- a/backend/PDF_PROCESSOR_README.md
+++ b/backend/PDF_PROCESSOR_README.md
@@ -24,18 +24,36 @@ Add the following settings to your environment or `.env` file:
 # Mineru API Configuration
 MINERU_API_URL=http://mineru-api:8000
 MINERU_TIMEOUT=300
+MINERU_LANG_LIST=["ch"]
+MINERU_BACKEND=pipeline
+MINERU_PARSE_METHOD=auto
+MINERU_FORMULA_ENABLE=true
+MINERU_TABLE_ENABLE=true
 ```

 ### 4. API Endpoint

-The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content.
+The processor expects Mineru to provide a REST API endpoint at `/file_parse` that accepts PDF files via multipart form data and returns JSON with markdown content.

 #### Expected Request Format:
 ```
-POST /extract
+POST /file_parse
 Content-Type: multipart/form-data

-file: [PDF file]
+files: [PDF file]
+output_dir: ./output
+lang_list: ["ch"]
+backend: pipeline
+parse_method: auto
+formula_enable: true
+table_enable: true
+return_md: true
+return_middle_json: false
+return_model_output: false
+return_content_list: false
+return_images: false
+start_page_id: 0
+end_page_id: 99999
 ```

 #### Expected Response Format:
@@ -49,6 +67,14 @@ The processor can handle multiple response formats:

 OR

+```json
+{
+  "md": "# Document Title\n\nContent here..."
+}
+```
+
+OR
+
 ```json
 {
  "content": "# Document Title\n\nContent here..."
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -32,8 +32,15 @@ class Settings(BaseSettings):
    OLLAMA_MODEL: str = "llama2"

    # Mineru API settings
-    MINERU_API_URL: str = "http://mineru-api:8000"
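+    # Service-name alternative (disabled): MINERU_API_URL: str = "http://mineru-api:8001"  # NOTE(review): mineru/docker-compose.yml maps host port 8001 -> container port 8000; confirm the port before re-enabling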
+    MINERU_API_URL: str = "http://host.docker.internal:8001"
+
    MINERU_TIMEOUT: int = 300  # 5 minutes timeout
+    MINERU_LANG_LIST: list = ["ch"]  # Language list for parsing
+    MINERU_BACKEND: str = "pipeline"  # Backend to use
+    MINERU_PARSE_METHOD: str = "auto"  # Parse method
+    MINERU_FORMULA_ENABLE: bool = True  # Enable formula parsing
+    MINERU_TABLE_ENABLE: bool = True  # Enable table parsing

    # Logging settings
    LOG_LEVEL: str = "INFO"
--- a/backend/app/core/document_handlers/processors/pdf_processor.py
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py
@@ -29,6 +29,11 @@ class PdfDocumentProcessor(DocumentProcessor):
        # Mineru API configuration
        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # 5 minutes timeout
+        self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
+        self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
+        self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
+        self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
+        self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)

    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
@@ -41,15 +46,33 @@ class PdfDocumentProcessor(DocumentProcessor):
            API response as dictionary or None if failed
        """
        try:
-            url = f"{self.mineru_base_url}/extract"
+            url = f"{self.mineru_base_url}/file_parse"
            
            with open(file_path, 'rb') as file:
-                files = {'file': (os.path.basename(file_path), file, 'application/pdf')}
+                files = {'files': (os.path.basename(file_path), file, 'application/pdf')}
+                
+                # Prepare form data according to Mineru API specification
+                data = {
+                    'output_dir': './output',
+                    'lang_list': self.mineru_lang_list,
+                    'backend': self.mineru_backend,
+                    'parse_method': self.mineru_parse_method,
+                    'formula_enable': self.mineru_formula_enable,
+                    'table_enable': self.mineru_table_enable,
+                    'return_md': True,
+                    'return_middle_json': False,
+                    'return_model_output': False,
+                    'return_content_list': False,
+                    'return_images': False,
+                    'start_page_id': 0,
+                    'end_page_id': 99999
+                }
                
                logger.info(f"Calling Mineru API at {url}")
                response = requests.post(
                    url, 
                    files=files,
+                    data=data,
                    timeout=self.mineru_timeout
                )
                
@@ -82,9 +105,13 @@ class PdfDocumentProcessor(DocumentProcessor):
            Extracted markdown content as string
        """
        try:
-            # Try different possible response formats
+            logger.debug(f"Mineru API response structure: {response}")
+            
+            # Try different possible response formats based on Mineru API
            if 'markdown' in response:
                return response['markdown']
+            elif 'md' in response:
+                return response['md']
            elif 'content' in response:
                return response['content']
            elif 'text' in response:
@@ -93,14 +120,32 @@ class PdfDocumentProcessor(DocumentProcessor):
                result = response['result']
                if 'markdown' in result:
                    return result['markdown']
+                elif 'md' in result:
+                    return result['md']
                elif 'content' in result:
                    return result['content']
                elif 'text' in result:
                    return result['text']
+            elif 'data' in response and isinstance(response['data'], dict):
+                data = response['data']
+                if 'markdown' in data:
+                    return data['markdown']
+                elif 'md' in data:
+                    return data['md']
+                elif 'content' in data:
+                    return data['content']
+                elif 'text' in data:
+                    return data['text']
+            elif isinstance(response, list) and len(response) > 0:
+                # If response is a list, try to extract from first item
+                first_item = response[0]
+                if isinstance(first_item, dict):
+                    return self._extract_markdown_from_response(first_item)
+                elif isinstance(first_item, str):
+                    return first_item
            else:
                # If no standard format found, try to extract from the response structure
                logger.warning("Could not find standard markdown field in Mineru response")
-                logger.debug(f"Mineru response structure: {response}")
                
                # Return the response as string if it's simple, or empty string
                if isinstance(response, str):
@@ -110,6 +155,11 @@ class PdfDocumentProcessor(DocumentProcessor):
                    for key, value in response.items():
                        if isinstance(value, str) and len(value) > 100:  # Likely content
                            return value
+                        elif isinstance(value, dict):
+                            # Recursively search in nested dictionaries
+                            nested_content = self._extract_markdown_from_response(value)
+                            if nested_content:
+                                return nested_content
                
                return ""
                
--- a/mineru/docker-compose.yml
+++ b/mineru/docker-compose.yml
@@ -7,7 +7,7 @@ services:
      dockerfile: Dockerfile
    platform: linux/arm64
    ports:
-      - "8000:8000"
+      - "8001:8000"
    volumes:
      - ./storage/uploads:/app/storage/uploads
      - ./storage/processed:/app/storage/processed