"""
Example of how to integrate MagicDoc API with existing document processors
"""

# Example modification for docx_processor.py
# Replace the Mineru API configuration with MagicDoc API configuration
#
# NOTE: this snippet defines no imports of its own. It uses os, requests,
# Optional/Dict/Any, logger, settings, DocumentProcessor and OllamaClient,
# which are assumed to be imported already by the docx_processor.py it is
# pasted into -- verify against that file.


class DocxDocumentProcessor(DocumentProcessor):
    """Document processor that converts a DOCX file to markdown via the MagicDoc API."""

    def __init__(self, input_path: str, output_path: str):
        """
        Store the input/output locations and prepare the work directory.

        Args:
            input_path: Path to the DOCX file to process
            output_path: Path of the processed output file; its directory also
                hosts the ".work" directory used for temporary files
        """
        super().__init__()
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup work directory for temporary files:
        # <output dir>/.work/<input file name without extension>
        self.work_dir = os.path.join(
            self.output_dir,
            ".work",
            self.name_without_suff
        )
        os.makedirs(self.work_dir, exist_ok=True)

        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)

        # MagicDoc API configuration (instead of Mineru)
        # Both values fall back to defaults when the setting is absent.
        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout

    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Call MagicDoc API to convert DOCX to markdown

        Args:
            file_path: Path to the DOCX file

        Returns:
            API response (decoded JSON body) as dictionary. Every failure path
            raises, so None is never actually returned; the Optional annotation
            is kept only so the signature stays unchanged.

        Raises:
            Exception: If the request times out, the request fails, the API
                answers with a non-200 status code, or any other error occurs
                while reading the file or decoding the response. The original
                error, when there is one, is chained as the cause.
        """
        url = f"{self.magicdoc_base_url}/file_parse"

        try:
            # The file must stay open until the upload has completed.
            with open(file_path, 'rb') as file:
                files = {
                    'files': (
                        os.path.basename(file_path),
                        file,
                        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                    )
                }

                # Prepare form data - simplified compared to Mineru
                data = {
                    'output_dir': './output',
                    'lang_list': 'ch',
                    'backend': 'pipeline',
                    'parse_method': 'auto',
                    'formula_enable': True,
                    'table_enable': True,
                    'return_md': True,
                    'return_middle_json': False,
                    'return_model_output': False,
                    'return_content_list': False,
                    'return_images': False,
                    'start_page_id': 0,
                    'end_page_id': 99999
                }

                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")

                response = requests.post(
                    url,
                    files=files,
                    data=data,
                    timeout=self.magicdoc_timeout
                )

            if response.status_code == 200:
                result = response.json()
                logger.info("Successfully received response from MagicDoc API for DOCX")
                return result

            # Only build the message here. Raising inside this try block would
            # be caught by the generic handler below, which would log it a
            # second time and relabel it as an "Unexpected error".
            http_error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"

        except requests.exceptions.Timeout as e:
            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except requests.exceptions.RequestException as e:
            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except Exception as e:
            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e

        # Reached only for a non-200 response: every other path above has
        # already returned or raised.
        logger.error(http_error_msg)
        raise Exception(http_error_msg)

    def read_content(self) -> str:
        """
        Convert the input DOCX to markdown through the MagicDoc API.

        The raw markdown is also written to "<name>.md" inside the work
        directory for reference.

        Returns:
            The markdown content extracted from the API response

        Raises:
            Exception: If the API call fails or the response contains no
                markdown content
        """
        logger.info("Starting DOCX content processing with MagicDoc API")

        # Call MagicDoc API to convert DOCX to markdown
        magicdoc_response = self._call_magicdoc_api(self.input_path)

        # Extract markdown content from the response
        # NOTE(review): _extract_markdown_from_response is not defined in this
        # snippet -- presumably provided by the base class or the target file; confirm.
        markdown_content = self._extract_markdown_from_response(magicdoc_response)

        if not markdown_content:
            raise Exception("No markdown content found in MagicDoc API response for DOCX")

        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

        # Save the raw markdown content to work directory for reference
        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
        with open(md_output_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")

        return markdown_content


# Configuration changes needed in settings.py:
"""
# Add these settings to your configuration
MAGICDOC_API_URL = "http://magicdoc-api:8000"  # or http://localhost:8002 for local development
MAGICDOC_TIMEOUT = 300  # 5 minutes timeout
"""

# Docker Compose integration:
"""
# Add to your main docker-compose.yml
services:
  magicdoc-api:
    build:
      context: ./magicdoc
      dockerfile: Dockerfile
    ports:
      - "8002:8000"
    volumes:
      - ./magicdoc/storage:/app/storage
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
"""