From 207521895541ff8152bc35dd4bd15e795229d5da Mon Sep 17 00:00:00 2001 From: tigermren Date: Mon, 18 Aug 2025 01:15:40 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=AD=A3=E5=BC=8Ffully=E6=94=AF?= =?UTF-8?q?=E6=8C=81docx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../processors/docx_processor.py | 73 ++++++++++--------- docker-compose.yml | 26 +++++++ 2 files changed, 63 insertions(+), 36 deletions(-) diff --git a/backend/app/core/document_handlers/processors/docx_processor.py b/backend/app/core/document_handlers/processors/docx_processor.py index c451834..09563ea 100644 --- a/backend/app/core/document_handlers/processors/docx_processor.py +++ b/backend/app/core/document_handlers/processors/docx_processor.py @@ -26,18 +26,19 @@ class DocxDocumentProcessor(DocumentProcessor): self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) - # Mineru API configuration - self.mineru_base_url = getattr(settings, 'MINERU_API_URL', 'http://mineru-api:8000') - self.mineru_timeout = getattr(settings, 'MINERU_TIMEOUT', 300) # 5 minutes timeout - self.mineru_lang_list = getattr(settings, 'MINERU_LANG_LIST', ['ch']) - self.mineru_backend = getattr(settings, 'MINERU_BACKEND', 'pipeline') - self.mineru_parse_method = getattr(settings, 'MINERU_PARSE_METHOD', 'auto') - self.mineru_formula_enable = getattr(settings, 'MINERU_FORMULA_ENABLE', True) - self.mineru_table_enable = getattr(settings, 'MINERU_TABLE_ENABLE', True) + # MagicDoc API configuration (replacing Mineru) + self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000') + self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300) # 5 minutes timeout + # MagicDoc uses simpler parameters, but we keep compatibility with existing interface + self.magicdoc_lang_list = getattr(settings, 'MAGICDOC_LANG_LIST', 'ch') + self.magicdoc_backend = getattr(settings, 'MAGICDOC_BACKEND', 'pipeline') + self.magicdoc_parse_method = getattr(settings, 'MAGICDOC_PARSE_METHOD', 'auto') + self.magicdoc_formula_enable = getattr(settings, 'MAGICDOC_FORMULA_ENABLE', True) + self.magicdoc_table_enable = getattr(settings, 'MAGICDOC_TABLE_ENABLE', True) - def _call_mineru_api(self, file_path: str) -> Optional[Dict[str, Any]]: + def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]: """ - Call Mineru API to convert DOCX to markdown + Call MagicDoc API to convert DOCX to markdown Args: file_path: Path to the DOCX file @@ -46,19 +47,19 @@ class DocxDocumentProcessor(DocumentProcessor): API response as dictionary or None if failed """ try: - url = f"{self.mineru_base_url}/file_parse" + url = f"{self.magicdoc_base_url}/file_parse" with open(file_path, 'rb') as file: files = {'files': (os.path.basename(file_path), file, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')} - # Prepare form data according to Mineru API specification + # Prepare form data according to MagicDoc API specification (compatible with Mineru) data = { 'output_dir': './output', - 'lang_list': self.mineru_lang_list, - 'backend': self.mineru_backend, - 'parse_method': self.mineru_parse_method, - 'formula_enable': self.mineru_formula_enable, - 'table_enable': self.mineru_table_enable, + 'lang_list': self.magicdoc_lang_list, + 'backend': self.magicdoc_backend, + 'parse_method': self.magicdoc_parse_method, + 'formula_enable': self.magicdoc_formula_enable, + 'table_enable': self.magicdoc_table_enable, 'return_md': True, 'return_middle_json': False, 'return_model_output': False, @@ -68,58 +69,58 @@ class DocxDocumentProcessor(DocumentProcessor): 'end_page_id': 99999 } - logger.info(f"Calling Mineru API for DOCX processing at {url}") + logger.info(f"Calling MagicDoc API for DOCX processing at {url}") response = requests.post( url, files=files, data=data, - timeout=self.mineru_timeout + timeout=self.magicdoc_timeout ) if response.status_code == 200: result = response.json() - logger.info("Successfully received response from Mineru API for DOCX") + logger.info("Successfully received response from MagicDoc API for DOCX") return result else: - error_msg = f"Mineru API returned status code {response.status_code}: {response.text}" + error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}" logger.error(error_msg) # For 400 errors, include more specific information if response.status_code == 400: try: error_data = response.json() if 'error' in error_data: - error_msg = f"Mineru API error: {error_data['error']}" + error_msg = f"MagicDoc API error: {error_data['error']}" except: pass raise Exception(error_msg) except requests.exceptions.Timeout: - error_msg = f"Mineru API request timed out after {self.mineru_timeout} seconds" + error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds" logger.error(error_msg) raise Exception(error_msg) except requests.exceptions.RequestException as e: - error_msg = f"Error calling Mineru API for DOCX: {str(e)}" + error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}" logger.error(error_msg) raise Exception(error_msg) except Exception as e: - error_msg = f"Unexpected error calling Mineru API for DOCX: {str(e)}" + error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}" logger.error(error_msg) raise Exception(error_msg) def _extract_markdown_from_response(self, response: Dict[str, Any]) -> str: """ - Extract markdown content from Mineru API response + Extract markdown content from MagicDoc API response Args: - response: Mineru API response dictionary + response: MagicDoc API response dictionary Returns: Extracted markdown content as string """ try: - logger.debug(f"Mineru API response structure for DOCX: {response}") + logger.debug(f"MagicDoc API response structure for DOCX: {response}") - # Try different possible response formats based on Mineru API + # Try different possible response formats based on MagicDoc API if 'markdown' in response: return response['markdown'] elif 'md' in response: @@ -157,7 +158,7 @@ class DocxDocumentProcessor(DocumentProcessor): return first_item else: # If no standard format found, try to extract from the response structure - logger.warning("Could not find standard markdown field in Mineru response for DOCX") + logger.warning("Could not find standard markdown field in MagicDoc response for DOCX") # Return the response as string if it's simple, or empty string if isinstance(response, str): @@ -176,21 +177,21 @@ class DocxDocumentProcessor(DocumentProcessor): return "" except Exception as e: - logger.error(f"Error extracting markdown from Mineru response for DOCX: {str(e)}") + logger.error(f"Error extracting markdown from MagicDoc response for DOCX: {str(e)}") return "" def read_content(self) -> str: - logger.info("Starting DOCX content processing with Mineru API") + logger.info("Starting DOCX content processing with MagicDoc API") - # Call Mineru API to convert DOCX to markdown + # Call MagicDoc API to convert DOCX to markdown # This will raise an exception if the API call fails - mineru_response = self._call_mineru_api(self.input_path) + magicdoc_response = self._call_magicdoc_api(self.input_path) # Extract markdown content from the response - markdown_content = self._extract_markdown_from_response(mineru_response) + markdown_content = self._extract_markdown_from_response(magicdoc_response) if not markdown_content: - raise Exception("No markdown content found in Mineru API response for DOCX") + raise Exception("No markdown content found in MagicDoc API response for DOCX") logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX") diff --git a/docker-compose.yml b/docker-compose.yml index b450a60..0f119d2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -25,6 +25,29 @@ services: networks: - app-network + # MagicDoc API Service + magicdoc-api: + build: + context: ./magicdoc + dockerfile: Dockerfile + platform: linux/amd64 + ports: + - "8002:8000" + volumes: + - ./magicdoc/storage/uploads:/app/storage/uploads + - ./magicdoc/storage/processed:/app/storage/processed + environment: + - PYTHONUNBUFFERED=1 + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + networks: + - app-network + # Backend API Service backend-api: build: @@ -40,9 +63,11 @@ services: - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/0 - MINERU_API_URL=http://mineru-api:8000 + - MAGICDOC_API_URL=http://magicdoc-api:8000 depends_on: - redis - mineru-api + - magicdoc-api networks: - app-network @@ -60,6 +85,7 @@ services: - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/0 - MINERU_API_URL=http://mineru-api:8000 + - MAGICDOC_API_URL=http://magicdoc-api:8000 depends_on: - redis - backend-api