更新pdf_processor,适用mineru

2025-07-15 00:29:34 +08:00 · 2025-07-15 00:29:34 +08:00 · 88b790dd6b
parent d3e1927bc5
commit 88b790dd6b
4 changed files with 92 additions and 9 deletions
--- a/backend/PDF_PROCESSOR_README.md
+++ b/backend/PDF_PROCESSOR_README.md
@@ -24,18 +24,36 @@ Add the following settings to your environment or `.env` file:
 # Mineru API Configuration
 MINERU_API_URL=http://mineru-api:8000
 MINERU_TIMEOUT=300
 MINERU_LANG_LIST=["ch"]
 MINERU_BACKEND=pipeline
 MINERU_PARSE_METHOD=auto
 MINERU_FORMULA_ENABLE=true
 MINERU_TABLE_ENABLE=true
 ```
 ### 4. API Endpoint
-The processor expects Mineru to provide a REST API endpoint at `/extract` that accepts PDF files via multipart form data and returns JSON with markdown content.
+The processor expects Mineru to expose a REST API endpoint at `/file_parse` that accepts PDF files as multipart form data (in the `files` field) and returns JSON containing the extracted Markdown content.
 #### Expected Request Format:
 ```
-POST /extract
+POST /file_parse
 Content-Type: multipart/form-data
-file: [PDF file]
+files: [PDF file]
 output_dir: ./output
 lang_list: ["ch"]
 backend: pipeline
 parse_method: auto
 formula_enable: true
 table_enable: true
 return_md: true
 return_middle_json: false
 return_model_output: false
 return_content_list: false
 return_images: false
 start_page_id: 0
 end_page_id: 99999
 ```
 #### Expected Response Format:
@@ -49,6 +67,14 @@ The processor can handle multiple response formats:
 OR
 ```json
 {
  "md": "# Document Title\n\nContent here..."
 }
 ```
 OR
 ```json
 {
  "content": "# Document Title\n\nContent here..."
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -32,8 +32,15 @@ class Settings(BaseSettings):
    OLLAMA_MODEL: str = "llama2"
    # Mineru API settings
-    MINERU_API_URL: str = "http://mineru-api:8000"
+    # MINERU_API_URL: str = "http://mineru-api:8001"
    MINERU_API_URL: str = "http://host.docker.internal:8001"
    MINERU_TIMEOUT: int = 300  # 5 minutes timeout
    MINERU_LANG_LIST: list = ["ch"]  # Language list for parsing
    MINERU_BACKEND: str = "pipeline"  # Backend to use
    MINERU_PARSE_METHOD: str = "auto"  # Parse method
    MINERU_FORMULA_ENABLE: bool = True  # Enable formula parsing
    MINERU_TABLE_ENABLE: bool = True  # Enable table parsing
    # Logging settings
    LOG_LEVEL: str = "INFO"
--- a/backend/app/core/document_handlers/processors/pdf_processor.py
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py
@@ -29,6 +29,11 @@ class PdfDocumentProcessor(DocumentProcessor):
        # Mineru API configuration
        self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000')
        self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300)  # 5 minutes timeout
        self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch'])
        self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline')
        self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto')
        self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True)
        self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True)
    def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
@@ -41,15 +46,33 @@ class PdfDocumentProcessor(DocumentProcessor):
            API response as dictionary or None if failed
        """
        try:
-            url = f"{self.mineru_base_url}/extract"
+            url = f"{self.mineru_base_url}/file_parse"
            with open(file_path, 'rb') as file:
-                files = {'file': (os.path.basename(file_path), file, 'application/pdf')}
+                files = {'files': (os.path.basename(file_path), file, 'application/pdf')}
                # Prepare form data according to Mineru API specification
                data = {
                    'output_dir': './output',
                    'lang_list': self.mineru_lang_list,
                    'backend': self.mineru_backend,
                    'parse_method': self.mineru_parse_method,
                    'formula_enable': self.mineru_formula_enable,
                    'table_enable': self.mineru_table_enable,
                    'return_md': True,
                    'return_middle_json': False,
                    'return_model_output': False,
                    'return_content_list': False,
                    'return_images': False,
                    'start_page_id': 0,
                    'end_page_id': 99999
                }
                logger.info(f"Calling Mineru API at {url}")
                response = requests.post(
                    url, 
                    files=files,
                    data=data,
                    timeout=self.mineru_timeout
                )
@@ -82,9 +105,13 @@ class PdfDocumentProcessor(DocumentProcessor):
            Extracted markdown content as string
        """
        try:
-            # Try different possible response formats
+            logger.debug(f"Mineru API response structure: {response}")
            # Try different possible response formats based on Mineru API
            if 'markdown' in response:
                return response['markdown']
            elif 'md' in response:
                return response['md']
            elif 'content' in response:
                return response['content']
            elif 'text' in response:
@@ -93,14 +120,32 @@ class PdfDocumentProcessor(DocumentProcessor):
                result = response['result']
                if 'markdown' in result:
                    return result['markdown']
                elif 'md' in result:
                    return result['md']
                elif 'content' in result:
                    return result['content']
                elif 'text' in result:
                    return result['text']
            elif 'data' in response and isinstance(response['data'], dict):
                data = response['data']
                if 'markdown' in data:
                    return data['markdown']
                elif 'md' in data:
                    return data['md']
                elif 'content' in data:
                    return data['content']
                elif 'text' in data:
                    return data['text']
            elif isinstance(response, list) and len(response) > 0:
                # If response is a list, try to extract from first item
                first_item = response[0]
                if isinstance(first_item, dict):
                    return self._extract_markdown_from_response(first_item)
                elif isinstance(first_item, str):
                    return first_item
            else:
                # If no standard format found, try to extract from the response structure
                logger.warning("Could not find standard markdown field in Mineru response")
                logger.debug(f"Mineru response structure: {response}")
                # Return the response as string if it's simple, or empty string
                if isinstance(response, str):
@@ -110,6 +155,11 @@ class PdfDocumentProcessor(DocumentProcessor):
                    for key, value in response.items():
                        if isinstance(value, str) and len(value) > 100:  # Likely content
                            return value
                        elif isinstance(value, dict):
                            # Recursively search in nested dictionaries
                            nested_content = self._extract_markdown_from_response(value)
                            if nested_content:
                                return nested_content
                return ""
--- a/mineru/docker-compose.yml
+++ b/mineru/docker-compose.yml
@@ -7,7 +7,7 @@ services:
      dockerfile: Dockerfile
    platform: linux/arm64
    ports:
-      - "8000:8000"
+      - "8001:8000"
    volumes:
      - ./storage/uploads:/app/storage/uploads
      - ./storage/processed:/app/storage/processed