legal-doc-masker/magicdoc/integration_example.py

145 lines
5.5 KiB
Python

"""
Example of how to integrate MagicDoc API with existing document processors
"""
# Example modification for docx_processor.py
# Replace the Mineru API configuration with MagicDoc API configuration
class DocxDocumentProcessor(DocumentProcessor):
    """Process DOCX documents by converting them to markdown via the MagicDoc API.

    The processor uploads the input file to the MagicDoc ``/file_parse``
    endpoint, extracts the markdown from the response, and keeps a copy of
    the raw markdown in a per-document work directory.

    NOTE(review): ``_extract_markdown_from_response`` is called by
    ``read_content`` but is not defined in this class; presumably it is
    provided by ``DocumentProcessor`` or must be added when integrating.
    """

    def __init__(self, input_path: str, output_path: str):
        """Store paths, create the work directory, and read MagicDoc settings.

        Args:
            input_path: Path to the DOCX file to process.
            output_path: Path where the processed output will be written.
        """
        super().__init__()
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Setup work directory for temporary files:
        # <output dir>/.work/<input file name without extension>
        self.work_dir = os.path.join(self.output_dir, ".work", self.name_without_suff)
        os.makedirs(self.work_dir, exist_ok=True)

        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)

        # MagicDoc API configuration (instead of Mineru); fall back to
        # defaults when the settings module does not define them.
        self.magicdoc_base_url = getattr(settings, 'MAGICDOC_API_URL', 'http://magicdoc-api:8000')
        self.magicdoc_timeout = getattr(settings, 'MAGICDOC_TIMEOUT', 300)  # 5 minutes timeout

    def _call_magicdoc_api(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Call MagicDoc API to convert DOCX to markdown.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            The decoded JSON body of a successful (HTTP 200) response.

        Raises:
            Exception: If the request times out, the request fails, the API
                answers with a non-200 status code, or any other error occurs
                while preparing or sending the request. The original error,
                when there is one, is attached as ``__cause__``.
        """
        url = f"{self.magicdoc_base_url}/file_parse"

        # Prepare form data - simplified compared to Mineru
        data = {
            'output_dir': './output',
            'lang_list': 'ch',
            'backend': 'pipeline',
            'parse_method': 'auto',
            'formula_enable': True,
            'table_enable': True,
            'return_md': True,
            'return_middle_json': False,
            'return_model_output': False,
            'return_content_list': False,
            'return_images': False,
            'start_page_id': 0,
            'end_page_id': 99999,
        }

        try:
            with open(file_path, 'rb') as file:
                files = {
                    'files': (
                        os.path.basename(file_path),
                        file,
                        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                    )
                }
                logger.info(f"Calling MagicDoc API for DOCX processing at {url}")
                response = requests.post(
                    url,
                    files=files,
                    data=data,
                    timeout=self.magicdoc_timeout,
                )
            # Decode the body only on success; an error body is reported
            # verbatim by the status check below.
            result = response.json() if response.status_code == 200 else None
        except requests.exceptions.Timeout as e:
            error_msg = f"MagicDoc API request timed out after {self.magicdoc_timeout} seconds"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except requests.exceptions.RequestException as e:
            error_msg = f"Error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e
        except Exception as e:
            error_msg = f"Unexpected error calling MagicDoc API for DOCX: {str(e)}"
            logger.error(error_msg)
            raise Exception(error_msg) from e

        # The status check lives outside the try block on purpose: raising
        # inside it would be caught by the generic handler above, logged a
        # second time, and mislabelled as an "Unexpected error".
        if response.status_code != 200:
            error_msg = f"MagicDoc API returned status code {response.status_code}: {response.text}"
            logger.error(error_msg)
            raise Exception(error_msg)

        logger.info("Successfully received response from MagicDoc API for DOCX")
        return result

    def read_content(self) -> str:
        """Convert the input DOCX to markdown and return the markdown text.

        The raw markdown is also written to
        ``<work_dir>/<input name without extension>.md`` for reference.

        Returns:
            The markdown content extracted from the MagicDoc API response.

        Raises:
            Exception: If the API call fails or the response yields no
                markdown content.
        """
        logger.info("Starting DOCX content processing with MagicDoc API")

        # Call MagicDoc API to convert DOCX to markdown
        magicdoc_response = self._call_magicdoc_api(self.input_path)

        # Extract markdown content from the response
        markdown_content = self._extract_markdown_from_response(magicdoc_response)
        if not markdown_content:
            raise Exception("No markdown content found in MagicDoc API response for DOCX")
        logger.info(f"Successfully extracted {len(markdown_content)} characters of markdown content from DOCX")

        # Save the raw markdown content to work directory for reference
        md_output_path = os.path.join(self.work_dir, f"{self.name_without_suff}.md")
        with open(md_output_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
        logger.info(f"Saved raw markdown content from DOCX to {md_output_path}")

        return markdown_content
# Configuration changes needed in settings.py:
"""
# Add these settings to your configuration
MAGICDOC_API_URL = "http://magicdoc-api:8000" # or http://localhost:8002 for local development
MAGICDOC_TIMEOUT = 300 # 5 minutes timeout
"""
# Docker Compose integration:
"""
# Add to your main docker-compose.yml
services:
  magicdoc-api:
    build:
      context: ./magicdoc
      dockerfile: Dockerfile
    ports:
      - "8002:8000"
    volumes:
      - ./magicdoc/storage:/app/storage
    environment:
      - PYTHONUNBUFFERED=1
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
"""