145 lines
5.5 KiB
Python
145 lines
5.5 KiB
Python
"""
Example of how to integrate the MagicDoc API with existing document processors.
"""
|
|
|
|
# Example modification for docx_processor.py:
# replace the Mineru API configuration with the MagicDoc API configuration.
|
|
|
|
class DocxDocumentProcessor(DocumentProcessor):
    """Document processor that converts a DOCX file to markdown via the MagicDoc API.

    The processor uploads the input file to the MagicDoc ``/file_parse``
    endpoint, extracts the markdown from the response and stores a copy of it
    in a ``.work/<input stem>`` directory next to the output file.
    """

    def __init__(self, input_path: str, output_path: str):
        """
        Set up paths, the work directory and the API clients.

        Args:
            input_path: Path to the DOCX file to process.
            output_path: Path of the output file; its directory also hosts
                the ``.work`` directory used for intermediate files.
        """
        super().__init__()
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup work directory for temporary files:
        # <output dir>/.work/<input file name without extension>
        self.work_dir = os.path.join(self.output_dir, ".work", self.name_without_suff)
        os.makedirs(self.work_dir, exist_ok=True)

        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)

        # MagicDoc API configuration (instead of Mineru); both settings are
        # optional and fall back to the defaults below.
        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout

    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Call MagicDoc API to convert DOCX to markdown

        Args:
            file_path: Path to the DOCX file

        Returns:
            The decoded JSON body of a successful (HTTP 200) response.
            Every failure path raises, so ``None`` is not returned by this
            implementation; the ``Optional`` annotation is kept for
            compatibility with existing callers.

        Raises:
            Exception: If the file cannot be read, the request times out or
                fails, the API answers with a non-200 status code, or the
                response body is not valid JSON. The underlying error, when
                there is one, is attached as ``__cause__``.
        """
        # Tolerate a configured base URL that ends with a slash.
        url = f"{self.magicdoc_base_url.rstrip('/')}/file_parse"

        # Prepare form data - simplified compared to Mineru
        data = {
            'output_dir': './output',
            'lang_list': 'ch',
            'backend': 'pipeline',
            'parse_method': 'auto',
            'formula_enable': True,
            'table_enable': True,
            'return_md': True,
            'return_middle_json': False,
            'return_model_output': False,
            'return_content_list': False,
            'return_images': False,
            'start_page_id': 0,
            'end_page_id': 99999,
        }

        # Only the file access and the HTTP round trip live inside the try
        # block, so the status-code error raised further down is not caught
        # and re-wrapped as an "unexpected" error.
        try:
            # The request is sent while the file is still open because
            # requests reads the handle during the upload.
            with open(file_path, 'rb') as file:
                files = {
                    'files': (
                        os.path.basename(file_path),
                        file,
                        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                    )
                }

                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
                response = requests.post(
                    url,
                    files=files,
                    data=data,
                    timeout=self.magicdoc_timeout,
                )
        except requests.exceptions.Timeout as e:
            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except requests.exceptions.RequestException as e:
            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except Exception as e:
            # E.g. the input file is missing or unreadable.
            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e

        if response.status_code != 200:
            error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
            logger.error(error_msg)
            raise Exception(error_msg)

        try:
            result = response.json()
        except ValueError as e:
            # JSON decoding errors are ValueError subclasses.
            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e

        logger.info("Successfully received response from MagicDoc API for DOCX")
        return result

    def read_content(self) -> str:
        """
        Convert the input DOCX to markdown and return the markdown text.

        A copy of the markdown is written to
        ``<work_dir>/<input stem>.md`` for reference.

        Returns:
            The markdown content extracted from the MagicDoc API response.

        Raises:
            Exception: If the API call fails or the response yields no
                markdown content.
        """
        logger.info("Starting DOCX content processing with MagicDoc API")

        # Call MagicDoc API to convert DOCX to markdown
        magicdoc_response = self._call_magicdoc_api(self.input_path)

        # Extract markdown content from the response
        # NOTE(review): _extract_markdown_from_response is not defined in this
        # example; presumably it is provided by the real processor or its base
        # class - confirm before using this snippet.
        markdown_content = self._extract_markdown_from_response(magicdoc_response)

        if not markdown_content:
            raise Exception("No markdown content found in MagicDoc API response for DOCX")

        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

        # Save the raw markdown content to work directory for reference
        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
        with open(md_output_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")

        return markdown_content
|
|
|
|
# Configuration changes needed in settings.py:
"""
# Add these settings to your configuration
MAGICDOC_API_URL = "http://magicdoc-api:8000"  # or http://localhost:8002 for local development
MAGICDOC_TIMEOUT = 300  # 5 minutes timeout
"""
|
|
|
|
# Docker Compose integration:
"""
# Add to your main docker-compose.yml
services:
  magicdoc-api:
    build:
      context: ./magicdoc
      dockerfile: Dockerfile
    ports:
      - "8002:8000"
    volumes:
      - ./magicdoc/storage:/app/storage
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
"""
|