WIP: add mineru section
parent 12c1b5f75e
commit fcf88e36d6
@@ -0,0 +1,34 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libreoffice \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .

# Download the MinerU model weights at build time
RUN pip install huggingface_hub
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
RUN python download_models_hf.py

RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -U magic-pdf[full]

# Copy the rest of the application
COPY . .

# Create storage directories
RUN mkdir -p storage/uploads storage/processed

# Expose the port the app runs on
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
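To try this image on its own (outside of docker-compose), a typical workflow is `docker build -t legal-doc-masker-api .` followed by `docker run -p 8000:8000 legal-doc-masker-api`; the image name here is only illustrative.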
@@ -0,0 +1,201 @@
# Mineru API Documentation

This document describes the FastAPI interface for the Mineru document parsing service.

## Overview

The Mineru API provides endpoints for parsing documents (PDFs, images) using advanced OCR and layout analysis. It supports both pipeline and VLM backends for different use cases.

## Base URL

```
http://localhost:8000/api/v1/mineru
```

## Endpoints

### 1. Health Check

**GET** `/health`

Check if the Mineru service is running.

**Response:**
```json
{
  "status": "healthy",
  "service": "mineru"
}
```

### 2. Parse Document

**POST** `/parse`

Parse a document using Mineru's advanced parsing capabilities.

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `file` | File | Required | The document file to parse (PDF, PNG, JPEG, JPG) |
| `lang` | string | "ch" | Language option ('ch', 'en', 'korean', 'japan', etc.) |
| `backend` | string | "pipeline" | Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client') |
| `method` | string | "auto" | Method for parsing ('auto', 'txt', 'ocr') |
| `server_url` | string | null | Server URL for vlm-sglang-client backend |
| `start_page_id` | int | 0 | Start page ID for parsing |
| `end_page_id` | int | null | End page ID for parsing |
| `formula_enable` | boolean | true | Enable formula parsing |
| `table_enable` | boolean | true | Enable table parsing |
| `draw_layout_bbox` | boolean | true | Whether to draw layout bounding boxes |
| `draw_span_bbox` | boolean | true | Whether to draw span bounding boxes |
| `dump_md` | boolean | true | Whether to dump markdown files |
| `dump_middle_json` | boolean | true | Whether to dump middle JSON files |
| `dump_model_output` | boolean | true | Whether to dump model output files |
| `dump_orig_pdf` | boolean | true | Whether to dump original PDF files |
| `dump_content_list` | boolean | true | Whether to dump content list files |
| `make_md_mode` | string | "MM_MD" | The mode for making markdown content |
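All parameters other than `file` are plain query parameters in the current implementation, so pass them in the URL query string (or via `params=` with `requests`) rather than as form fields.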
**Response:**
```json
{
  "status": "success",
  "file_name": "document_name",
  "outputs": {
    "markdown": "/path/to/document_name.md",
    "middle_json": "/path/to/document_name_middle.json",
    "model_output": "/path/to/document_name_model.json",
    "content_list": "/path/to/document_name_content_list.json",
    "original_pdf": "/path/to/document_name_origin.pdf",
    "layout_pdf": "/path/to/document_name_layout.pdf",
    "span_pdf": "/path/to/document_name_span.pdf"
  },
  "output_directory": "/path/to/output/directory"
}
```

### 3. Download Processed File

**GET** `/download/{file_path}`

Download a processed file from the Mineru output directory.

**Parameters:**
- `file_path`: Path to the file relative to the mineru output directory

**Response:** File download

## Usage Examples

### Python Example

```python
import requests

# Parse a document
with open('document.pdf', 'rb') as f:
    files = {'file': ('document.pdf', f, 'application/pdf')}
    params = {
        'lang': 'ch',
        'backend': 'pipeline',
        'method': 'auto',
        'formula_enable': True,
        'table_enable': True
    }

    response = requests.post(
        'http://localhost:8000/api/v1/mineru/parse',
        files=files,
        params=params
    )

if response.status_code == 200:
    result = response.json()
    print(f"Parsed successfully: {result['file_name']}")

    # Download the markdown file
    md_path = result['outputs']['markdown']
    download_response = requests.get(
        f'http://localhost:8000/api/v1/mineru/download/{md_path}'
    )

    with open('output.md', 'wb') as f:
        f.write(download_response.content)
```

### cURL Example

```bash
# Parse a document (options go in the query string; only the file is form data)
curl -X POST "http://localhost:8000/api/v1/mineru/parse?lang=ch&backend=pipeline&method=auto" \
  -F "file=@document.pdf"

# Download a processed file
curl -X GET "http://localhost:8000/api/v1/mineru/download/path/to/file.md" \
  -o downloaded_file.md
```

## Backend Options

### Pipeline Backend
- **Use case**: General purpose, more robust
- **Advantages**: Better for complex layouts, supports multiple languages
- **Command**: `backend=pipeline`

### VLM Backends
- **vlm-transformers**: General purpose VLM
- **vlm-sglang-engine**: Faster engine-based approach
- **vlm-sglang-client**: Fastest client-based approach (requires `server_url`)
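A minimal sketch of a `vlm-sglang-client` request (the sglang server URL below is illustrative; point `server_url` at wherever your sglang server is running):

```python
import requests

# Parse with the sglang client backend; server_url must point at a running sglang server
with open('document.pdf', 'rb') as f:
    response = requests.post(
        'http://localhost:8000/api/v1/mineru/parse',
        files={'file': ('document.pdf', f, 'application/pdf')},
        params={
            'backend': 'vlm-sglang-client',
            'server_url': 'http://localhost:30000'
        }
    )
print(response.json())
```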
## Language Support

Supported languages for the pipeline backend:
- `ch`: Chinese (Simplified)
- `en`: English
- `korean`: Korean
- `japan`: Japanese
- `chinese_cht`: Chinese (Traditional)
- `ta`: Tamil
- `te`: Telugu
- `ka`: Kannada

## Output Files

The API generates various output files depending on the parameters:

1. **Markdown** (`.md`): Structured text content
2. **Middle JSON** (`.json`): Intermediate parsing results
3. **Model Output** (`.json` or `.txt`): Raw model predictions
4. **Content List** (`.json`): Structured content list
5. **Original PDF**: Copy of the input file
6. **Layout PDF**: PDF with layout bounding boxes
7. **Span PDF**: PDF with span bounding boxes

## Error Handling

The API returns appropriate HTTP status codes:

- `200`: Success
- `400`: Bad request (invalid parameters, unsupported file type)
- `404`: File not found
- `500`: Internal server error

Error responses include a detail message explaining the issue.
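For example, uploading an unsupported file type yields a 400 response with FastAPI's standard error body, along the lines of `{"detail": "File type not allowed. Allowed types: ..."}`.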
## Testing

Use the provided test script to verify the API:

```bash
python test_mineru_api.py
```

## Notes

- The API creates unique output directories for each request to avoid conflicts
- Temporary files are automatically cleaned up after processing
- File downloads are restricted to the processed folder for security
- Large files may take time to process depending on the backend and document complexity
@@ -0,0 +1,103 @@
# Legal Document Masker API

This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.

## Prerequisites

- Python 3.8+
- Redis (for Celery)

## File Storage

Files are stored in the following structure:
```
backend/
├── storage/
│   ├── uploads/      # Original uploaded files
│   └── processed/    # Masked/processed files
```

## Setup

### Option 1: Local Development

1. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Set up environment variables:
Create a `.env` file in the backend directory with the following variables:
```env
SECRET_KEY=your-secret-key-here
```
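Other settings defined in `app/core/config.py` (for example `OLLAMA_API_URL`, `OLLAMA_MODEL`, `CELERY_BROKER_URL`) can be overridden from the same `.env` file if needed.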
The database (SQLite) will be automatically created when you first run the application.

4. Start Redis (required for Celery):
```bash
redis-server
```

5. Start Celery worker:
```bash
celery -A app.services.file_service worker --loglevel=info
```

6. Start the FastAPI server:
```bash
uvicorn app.main:app --reload
```

### Option 2: Docker Deployment

1. Build and start the services:
```bash
docker-compose up --build
```

This will start:
- FastAPI server on port 8000
- Celery worker for background processing
- Redis for task queue

## API Documentation

Once the server is running, you can access:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`

## API Endpoints

- `POST /api/v1/files/upload` - Upload a new file
- `GET /api/v1/files` - List all files
- `GET /api/v1/files/{file_id}` - Get file details
- `GET /api/v1/files/{file_id}/download` - Download processed file
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
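For reference, a minimal client sketch against these endpoints (illustrative only: it assumes the files router is enabled in `app/main.py` and that the upload response includes the file `id`):

```python
import asyncio
import json

import requests
import websockets  # client library assumed for the WS status endpoint

BASE = "http://localhost:8000/api/v1/files"

async def main():
    # Upload a document for masking
    with open("contract.docx", "rb") as f:
        resp = requests.post(f"{BASE}/upload", files={"file": ("contract.docx", f)})
    file_id = resp.json()["id"]  # field name assumed from the File model

    # Follow processing status over the WebSocket endpoint
    async with websockets.connect(
        f"ws://localhost:8000/api/v1/files/ws/status/{file_id}"
    ) as ws:
        async for message in ws:
            status = json.loads(message)
            print(status)
            if status.get("status") in ("success", "failed"):
                break

asyncio.run(main())
```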
## Development

### Running Tests
```bash
pytest
```

### Code Style
The project uses Black for code formatting:
```bash
black .
```

### Docker Commands

- Start services: `docker-compose up`
- Start in background: `docker-compose up -d`
- Stop services: `docker-compose down`
- View logs: `docker-compose logs -f`
- Rebuild: `docker-compose up --build`
@ -0,0 +1,329 @@
|
|||
from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
|
||||
from fastapi.responses import FileResponse
|
||||
from typing import List, Optional
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import json
|
||||
from pathlib import Path
|
||||
import uuid
|
||||
from loguru import logger
|
||||
|
||||
from ...core.config import settings
|
||||
|
||||
# Import mineru functions
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
from mineru.utils.enum_class import MakeMode
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
|
||||
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
|
||||
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
class MineruParseRequest:
|
||||
def __init__(
|
||||
self,
|
||||
lang: str = "ch",
|
||||
backend: str = "pipeline",
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
start_page_id: int = 0,
|
||||
end_page_id: Optional[int] = None,
|
||||
formula_enable: bool = True,
|
||||
table_enable: bool = True,
|
||||
draw_layout_bbox: bool = True,
|
||||
draw_span_bbox: bool = True,
|
||||
dump_md: bool = True,
|
||||
dump_middle_json: bool = True,
|
||||
dump_model_output: bool = True,
|
||||
dump_orig_pdf: bool = True,
|
||||
dump_content_list: bool = True,
|
||||
make_md_mode: str = "MM_MD"
|
||||
):
|
||||
self.lang = lang
|
||||
self.backend = backend
|
||||
self.method = method
|
||||
self.server_url = server_url
|
||||
self.start_page_id = start_page_id
|
||||
self.end_page_id = end_page_id
|
||||
self.formula_enable = formula_enable
|
||||
self.table_enable = table_enable
|
||||
self.draw_layout_bbox = draw_layout_bbox
|
||||
self.draw_span_bbox = draw_span_bbox
|
||||
self.dump_md = dump_md
|
||||
self.dump_middle_json = dump_middle_json
|
||||
self.dump_model_output = dump_model_output
|
||||
self.dump_orig_pdf = dump_orig_pdf
|
||||
self.dump_content_list = dump_content_list
|
||||
self.make_md_mode = MakeMode.MM_MD if make_md_mode == "MM_MD" else MakeMode.CONTENT_LIST
|
||||
|
||||
async def process_mineru_document(
|
||||
file: UploadFile,
|
||||
request: MineruParseRequest,
|
||||
output_dir: Path
|
||||
) -> dict:
|
||||
"""Process a single document using Mineru"""
|
||||
try:
|
||||
# Read file content
|
||||
content = await file.read()
|
||||
|
||||
# Create temporary file
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix) as temp_file:
|
||||
temp_file.write(content)
|
||||
temp_file_path = Path(temp_file.name)
|
||||
|
||||
try:
|
||||
# Prepare environment
|
||||
file_name = Path(file.filename).stem
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method)
# prepare_env may return plain strings; normalize to Path so the `/` joins below work
local_image_dir, local_md_dir = Path(local_image_dir), Path(local_md_dir)
image_writer, md_writer = FileBasedDataWriter(str(local_image_dir)), FileBasedDataWriter(str(local_md_dir))
|
||||
|
||||
# Convert PDF bytes if needed
|
||||
if request.backend == "pipeline":
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
|
||||
content, request.start_page_id, request.end_page_id
|
||||
)
|
||||
|
||||
# Analyze document
|
||||
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
|
||||
[new_pdf_bytes], [request.lang],
|
||||
parse_method=request.method,
|
||||
formula_enable=request.formula_enable,
|
||||
table_enable=request.table_enable
|
||||
)
|
||||
|
||||
# Process results
|
||||
model_list = infer_results[0]
|
||||
images_list = all_image_lists[0]
|
||||
pdf_doc = all_pdf_docs[0]
|
||||
_lang = lang_list[0]
|
||||
_ocr_enable = ocr_enabled_list[0]
|
||||
|
||||
middle_json = pipeline_result_to_middle_json(
|
||||
model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, request.formula_enable
|
||||
)
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# Generate outputs
|
||||
outputs = {}
|
||||
|
||||
if request.draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
|
||||
outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")
|
||||
|
||||
if request.draw_span_bbox:
|
||||
draw_span_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_span.pdf")
|
||||
outputs["span_pdf"] = str(local_md_dir / f"{file_name}_span.pdf")
|
||||
|
||||
if request.dump_orig_pdf:
|
||||
md_writer.write(f"{file_name}_origin.pdf", new_pdf_bytes)
|
||||
outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")
|
||||
|
||||
if request.dump_md:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
md_content_str = pipeline_union_make(pdf_info, request.make_md_mode, image_dir)
|
||||
md_writer.write_string(f"{file_name}.md", md_content_str)
|
||||
outputs["markdown"] = str(local_md_dir / f"{file_name}.md")
|
||||
|
||||
if request.dump_content_list:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")
|
||||
|
||||
if request.dump_middle_json:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_middle.json",
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")
|
||||
|
||||
if request.dump_model_output:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_model.json",
|
||||
json.dumps(model_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["model_output"] = str(local_md_dir / f"{file_name}_model.json")
|
||||
|
||||
else:
# VLM backend: reject anything that is not one of the vlm-* options,
# otherwise `backend` would be undefined below
if not request.backend.startswith("vlm-"):
raise HTTPException(status_code=400, detail=f"Unsupported backend: {request.backend}")
backend = request.backend[4:]

middle_json, infer_result = vlm_doc_analyze(
content, image_writer=image_writer,
backend=backend, server_url=request.server_url
)
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# Generate outputs for VLM
|
||||
outputs = {}
|
||||
|
||||
if request.draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, content, local_md_dir, f"{file_name}_layout.pdf")
|
||||
outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")
|
||||
|
||||
if request.dump_orig_pdf:
|
||||
md_writer.write(f"{file_name}_origin.pdf", content)
|
||||
outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")
|
||||
|
||||
if request.dump_md:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
md_content_str = vlm_union_make(pdf_info, request.make_md_mode, image_dir)
|
||||
md_writer.write_string(f"{file_name}.md", md_content_str)
|
||||
outputs["markdown"] = str(local_md_dir / f"{file_name}.md")
|
||||
|
||||
if request.dump_content_list:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")
|
||||
|
||||
if request.dump_middle_json:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_middle.json",
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")
|
||||
|
||||
if request.dump_model_output:
|
||||
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
|
||||
md_writer.write_string(f"{file_name}_model_output.txt", model_output)
|
||||
outputs["model_output"] = str(local_md_dir / f"{file_name}_model_output.txt")
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"file_name": file_name,
|
||||
"outputs": outputs,
|
||||
"output_directory": str(local_md_dir)
|
||||
}
|
||||
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
if temp_file_path.exists():
|
||||
temp_file_path.unlink()
|
||||
|
||||
except HTTPException:
# Re-raise explicit HTTP errors (e.g. 400 for an unsupported backend) unchanged
raise
except Exception as e:
logger.exception(f"Error processing document: {e}")
raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
||||
|
||||
@router.post("/parse")
|
||||
async def parse_document(
|
||||
file: UploadFile = File(...),
|
||||
lang: str = "ch",
|
||||
backend: str = "pipeline",
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
start_page_id: int = 0,
|
||||
end_page_id: Optional[int] = None,
|
||||
formula_enable: bool = True,
|
||||
table_enable: bool = True,
|
||||
draw_layout_bbox: bool = True,
|
||||
draw_span_bbox: bool = True,
|
||||
dump_md: bool = True,
|
||||
dump_middle_json: bool = True,
|
||||
dump_model_output: bool = True,
|
||||
dump_orig_pdf: bool = True,
|
||||
dump_content_list: bool = True,
|
||||
make_md_mode: str = "MM_MD"
|
||||
):
|
||||
"""
|
||||
Parse a document using Mineru API
|
||||
|
||||
Parameters:
|
||||
- file: The document file to parse (PDF, image, etc.)
|
||||
- lang: Language option (default: 'ch')
|
||||
- backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client')
|
||||
- method: Method for parsing ('auto', 'txt', 'ocr')
|
||||
- server_url: Server URL for vlm-sglang-client backend
|
||||
- start_page_id: Start page ID for parsing
|
||||
- end_page_id: End page ID for parsing
|
||||
- formula_enable: Enable formula parsing
|
||||
- table_enable: Enable table parsing
|
||||
- draw_layout_bbox: Whether to draw layout bounding boxes
|
||||
- draw_span_bbox: Whether to draw span bounding boxes
|
||||
- dump_md: Whether to dump markdown files
|
||||
- dump_middle_json: Whether to dump middle JSON files
|
||||
- dump_model_output: Whether to dump model output files
|
||||
- dump_orig_pdf: Whether to dump original PDF files
|
||||
- dump_content_list: Whether to dump content list files
|
||||
- make_md_mode: The mode for making markdown content
|
||||
"""
|
||||
|
||||
# Validate file type
|
||||
allowed_extensions = {".pdf", ".png", ".jpeg", ".jpg"}
|
||||
file_extension = Path(file.filename).suffix.lower()
|
||||
if file_extension not in allowed_extensions:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"File type not allowed. Allowed types: {', '.join(allowed_extensions)}"
|
||||
)
|
||||
|
||||
# Create request object
|
||||
request = MineruParseRequest(
|
||||
lang=lang,
|
||||
backend=backend,
|
||||
method=method,
|
||||
server_url=server_url,
|
||||
start_page_id=start_page_id,
|
||||
end_page_id=end_page_id,
|
||||
formula_enable=formula_enable,
|
||||
table_enable=table_enable,
|
||||
draw_layout_bbox=draw_layout_bbox,
|
||||
draw_span_bbox=draw_span_bbox,
|
||||
dump_md=dump_md,
|
||||
dump_middle_json=dump_middle_json,
|
||||
dump_model_output=dump_model_output,
|
||||
dump_orig_pdf=dump_orig_pdf,
|
||||
dump_content_list=dump_content_list,
|
||||
make_md_mode=make_md_mode
|
||||
)
|
||||
|
||||
# Create output directory
|
||||
output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Process document
|
||||
result = await process_mineru_document(file, request, output_dir)
|
||||
|
||||
return result
|
||||
|
||||
@router.get("/download/{file_path:path}")
|
||||
async def download_processed_file(file_path: str):
|
||||
"""Download a processed file from the mineru output directory"""
|
||||
try:
|
||||
# Construct and resolve the full path so ".." segments cannot escape the output directory
full_path = (settings.PROCESSED_FOLDER / "mineru" / file_path).resolve()

# Security check: ensure the resolved path is within the processed folder
if not str(full_path).startswith(str(settings.PROCESSED_FOLDER.resolve())):
raise HTTPException(status_code=400, detail="Invalid file path")
|
||||
|
||||
if not full_path.exists():
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
return FileResponse(
|
||||
path=str(full_path),
|
||||
filename=full_path.name,
|
||||
media_type="application/octet-stream"
|
||||
)
|
||||
|
||||
except HTTPException:
# Preserve the 400/404 responses raised above instead of converting them to 500
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}")
|
||||
|
||||
@router.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint for mineru service"""
|
||||
return {"status": "healthy", "service": "mineru"}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
from pydantic_settings import BaseSettings
|
||||
from typing import Optional
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# API Settings
|
||||
API_V1_STR: str = "/api/v1"
|
||||
PROJECT_NAME: str = "Legal Document Masker API"
|
||||
|
||||
# Security
|
||||
SECRET_KEY: str = "your-secret-key-here" # Change in production
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8 # 8 days
|
||||
|
||||
# Database
|
||||
BASE_DIR: Path = Path(__file__).parent.parent.parent
|
||||
DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"
|
||||
|
||||
# File Storage
|
||||
UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
|
||||
PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
|
||||
MAX_FILE_SIZE: int = 50 * 1024 * 1024 # 50MB
|
||||
ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}
|
||||
|
||||
# Celery
|
||||
CELERY_BROKER_URL: str = "redis://redis:6379/0"
|
||||
CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
|
||||
|
||||
# Ollama API settings
|
||||
OLLAMA_API_URL: str = "https://api.ollama.com"
|
||||
OLLAMA_API_KEY: str = ""
|
||||
OLLAMA_MODEL: str = "llama2"
|
||||
|
||||
# Logging settings
|
||||
LOG_LEVEL: str = "INFO"
|
||||
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
|
||||
LOG_FILE: str = "app.log"
|
||||
|
||||
class Config:
|
||||
case_sensitive = True
|
||||
env_file = ".env"
|
||||
env_file_encoding = "utf-8"
|
||||
extra = "allow"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
# Create storage directories if they don't exist
|
||||
self.UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
self.PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
# Create storage directory for database
|
||||
(self.BASE_DIR / "storage").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
settings = Settings()
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
import logging.config
|
||||
# from config.settings import settings
|
||||
from .settings import settings
|
||||
|
||||
LOGGING_CONFIG = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
"formatters": {
|
||||
"standard": {
|
||||
"format": settings.LOG_FORMAT,
|
||||
"datefmt": settings.LOG_DATE_FORMAT
|
||||
},
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
"class": "logging.StreamHandler",
|
||||
"formatter": "standard",
|
||||
"level": settings.LOG_LEVEL,
|
||||
"stream": "ext://sys.stdout"
|
||||
},
|
||||
"file": {
|
||||
"class": "logging.FileHandler",
|
||||
"formatter": "standard",
|
||||
"level": settings.LOG_LEVEL,
|
||||
"filename": settings.LOG_FILE,
|
||||
"mode": "a",
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"": { # root logger
|
||||
"handlers": ["console", "file"],
|
||||
"level": settings.LOG_LEVEL,
|
||||
"propagate": True
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def setup_logging():
|
||||
"""Initialize logging configuration"""
|
||||
logging.config.dictConfig(LOGGING_CONFIG)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from .config import settings
|
||||
|
||||
# Create SQLite engine with check_same_thread=False for FastAPI
|
||||
engine = create_engine(
|
||||
settings.DATABASE_URL,
|
||||
connect_args={"check_same_thread": False}
|
||||
)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
# Dependency
|
||||
def get_db():
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
class Document:
|
||||
def __init__(self, file_path):
|
||||
self.file_path = file_path
|
||||
self.content = ""
|
||||
|
||||
def load(self):
|
||||
with open(self.file_path, 'r') as file:
|
||||
self.content = file.read()
|
||||
|
||||
def save(self, target_path):
|
||||
with open(target_path, 'w') as file:
|
||||
file.write(self.content)
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
import os
|
||||
from typing import Optional
|
||||
from .document_processor import DocumentProcessor
|
||||
from .processors import (
|
||||
TxtDocumentProcessor,
|
||||
DocxDocumentProcessor,
|
||||
PdfDocumentProcessor,
|
||||
MarkdownDocumentProcessor
|
||||
)
|
||||
|
||||
class DocumentProcessorFactory:
|
||||
@staticmethod
|
||||
def create_processor(input_path: str, output_path: str) -> Optional[DocumentProcessor]:
|
||||
file_extension = os.path.splitext(input_path)[1].lower()
|
||||
|
||||
processors = {
|
||||
'.txt': TxtDocumentProcessor,
|
||||
'.docx': DocxDocumentProcessor,
|
||||
'.doc': DocxDocumentProcessor,
|
||||
'.pdf': PdfDocumentProcessor,
|
||||
'.md': MarkdownDocumentProcessor,
|
||||
'.markdown': MarkdownDocumentProcessor
|
||||
}
|
||||
|
||||
processor_class = processors.get(file_extension)
|
||||
if processor_class:
|
||||
return processor_class(input_path, output_path)
|
||||
return None
|
||||
|
|
@ -0,0 +1,192 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
||||
import logging
|
||||
import json
|
||||
from ..services.ollama_client import OllamaClient
|
||||
from ...core.config import settings
|
||||
from ..utils.json_extractor import LLMJsonExtractor
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor(ABC):
|
||||
def __init__(self):
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
self.max_chunk_size = 1000 # Maximum number of characters per chunk
|
||||
self.max_retries = 3 # Maximum number of retries for mapping generation
|
||||
|
||||
@abstractmethod
|
||||
def read_content(self) -> str:
|
||||
"""Read document content"""
|
||||
pass
|
||||
|
||||
def _split_into_chunks(self, sentences: list[str]) -> list[str]:
|
||||
"""Split sentences into chunks that don't exceed max_chunk_size"""
|
||||
chunks = []
|
||||
current_chunk = ""
|
||||
|
||||
for sentence in sentences:
|
||||
if not sentence.strip():
|
||||
continue
|
||||
|
||||
# If adding this sentence would exceed the limit, save current chunk and start new one
|
||||
if len(current_chunk) + len(sentence) > self.max_chunk_size and current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
current_chunk = sentence
|
||||
else:
|
||||
if current_chunk:
|
||||
current_chunk += "。" + sentence
|
||||
else:
|
||||
current_chunk = sentence
|
||||
|
||||
# Add the last chunk if it's not empty
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
|
||||
return chunks
|
||||
|
||||
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Validate that the mapping follows the required format:
|
||||
{
|
||||
"原文1": "脱敏后1",
|
||||
"原文2": "脱敏后2",
|
||||
...
|
||||
}
|
||||
"""
|
||||
if not isinstance(mapping, dict):
|
||||
logger.warning("Mapping is not a dictionary")
|
||||
return False
|
||||
|
||||
# Check if any key or value is not a string
|
||||
for key, value in mapping.items():
|
||||
if not isinstance(key, str) or not isinstance(value, str):
|
||||
logger.warning(f"Invalid mapping format - key or value is not a string: {key}: {value}")
|
||||
return False
|
||||
|
||||
# Check if the mapping has any nested structures
|
||||
if any(isinstance(v, (dict, list)) for v in mapping.values()):
|
||||
logger.warning("Invalid mapping format - contains nested structures")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _build_mapping(self, chunk: str) -> Dict[str, str]:
|
||||
"""Build mapping for a single chunk of text with retry logic"""
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
formatted_prompt = get_masking_mapping_prompt(chunk)
|
||||
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
||||
response = self.ollama_client.generate(formatted_prompt)
|
||||
logger.info(f"Raw response from LLM: {response}")
|
||||
|
||||
# Parse the JSON response into a dictionary
|
||||
mapping = LLMJsonExtractor.parse_raw_json_str(response)
|
||||
logger.info(f"Parsed mapping: {mapping}")
|
||||
|
||||
if mapping and self._validate_mapping_format(mapping):
|
||||
return mapping
|
||||
else:
|
||||
logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating mapping on attempt {attempt + 1}: {e}")
|
||||
if attempt < self.max_retries - 1:
|
||||
logger.info("Retrying...")
|
||||
else:
|
||||
logger.error("Max retries reached, returning empty mapping")
|
||||
return {}
|
||||
|
||||
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
|
||||
"""Apply the mapping to replace sensitive information"""
|
||||
masked_text = text
|
||||
for original, masked in mapping.items():
|
||||
# Ensure masked value is a string
|
||||
if isinstance(masked, dict):
|
||||
# If it's a dict, use the first value or a default
|
||||
masked = next(iter(masked.values()), "某")
|
||||
elif not isinstance(masked, str):
|
||||
# If it's not a string, convert to string or use default
|
||||
masked = str(masked) if masked is not None else "某"
|
||||
masked_text = masked_text.replace(original, masked)
|
||||
return masked_text
|
||||
|
||||
def _get_next_suffix(self, value: str) -> str:
|
||||
"""Get the next available suffix for a value that already has a suffix"""
|
||||
# Define the sequence of suffixes
|
||||
suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
|
||||
|
||||
# Check if the value already has a suffix
|
||||
for suffix in suffixes:
|
||||
if value.endswith(suffix):
|
||||
# Find the next suffix in the sequence
|
||||
current_index = suffixes.index(suffix)
|
||||
if current_index + 1 < len(suffixes):
|
||||
return value[:-1] + suffixes[current_index + 1]
|
||||
else:
|
||||
# If we've used all suffixes, start over with the first one
|
||||
return value[:-1] + suffixes[0]
|
||||
|
||||
# If no suffix found, return the value with the first suffix
|
||||
return value + '甲'
|
||||
|
||||
def _merge_mappings(self, existing: Dict[str, str], new: Dict[str, str]) -> Dict[str, str]:
|
||||
"""
|
||||
Merge two mappings following the rules:
|
||||
1. If key exists in existing, keep existing value
|
||||
2. If value exists in existing:
|
||||
- If value ends with a suffix (甲乙丙丁...), add next suffix
|
||||
- If no suffix, add '甲'
|
||||
"""
|
||||
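# Illustrative example: if existing = {"原告张三": "张某"} and new = {"被告李四": "张某"},
# the value "张某" is already used, so the merged mapping becomes
# {"原告张三": "张某", "被告李四": "张某甲"} (the next 甲乙丙… suffix is appended).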
result = existing.copy()
|
||||
|
||||
# Get all existing values
|
||||
existing_values = set(result.values())
|
||||
|
||||
for key, value in new.items():
|
||||
if key in result:
|
||||
# Rule 1: Keep existing value if key exists
|
||||
continue
|
||||
|
||||
if value in existing_values:
|
||||
# Rule 2: Handle duplicate values
|
||||
new_value = self._get_next_suffix(value)
|
||||
result[key] = new_value
|
||||
existing_values.add(new_value)
|
||||
else:
|
||||
# No conflict, add as is
|
||||
result[key] = value
|
||||
existing_values.add(value)
|
||||
|
||||
return result
|
||||
|
||||
def process_content(self, content: str) -> str:
|
||||
"""Process document content by masking sensitive information"""
|
||||
# Split content into sentences
|
||||
sentences = content.split("。")
|
||||
|
||||
# Split sentences into manageable chunks
|
||||
chunks = self._split_into_chunks(sentences)
|
||||
logger.info(f"Split content into {len(chunks)} chunks")
|
||||
|
||||
# Build mapping for each chunk
|
||||
combined_mapping = {}
|
||||
for i, chunk in enumerate(chunks):
|
||||
logger.info(f"Processing chunk {i+1}/{len(chunks)}")
|
||||
chunk_mapping = self._build_mapping(chunk)
|
||||
if chunk_mapping: # Only update if we got a valid mapping
|
||||
combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
||||
else:
|
||||
logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
||||
|
||||
# Apply the combined mapping to the entire content
|
||||
masked_content = self._apply_mapping(content, combined_mapping)
|
||||
logger.info("Successfully masked content")
|
||||
|
||||
return masked_content
|
||||
|
||||
@abstractmethod
|
||||
def save_content(self, content: str) -> None:
|
||||
"""Save processed content"""
|
||||
pass
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
from .txt_processor import TxtDocumentProcessor
|
||||
from .docx_processor import DocxDocumentProcessor
|
||||
from .pdf_processor import PdfDocumentProcessor
|
||||
from .md_processor import MarkdownDocumentProcessor
|
||||
|
||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
import os
|
||||
import docx
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.data.read_api import read_local_office
|
||||
import logging
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
from ...prompts.masking_prompts import get_masking_mapping_prompt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocxDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.output_dir = os.path.dirname(output_path)
|
||||
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||
|
||||
# Setup output directories
|
||||
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||
self.image_dir = os.path.basename(self.local_image_dir)
|
||||
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
try:
|
||||
# Initialize writers
|
||||
image_writer = FileBasedDataWriter(self.local_image_dir)
|
||||
md_writer = FileBasedDataWriter(self.output_dir)
|
||||
|
||||
# Create Dataset Instance and process
|
||||
ds = read_local_office(self.input_path)[0]
|
||||
pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)
|
||||
|
||||
# Generate markdown
|
||||
md_content = pipe_result.get_markdown(self.image_dir)
|
||||
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
|
||||
|
||||
return md_content
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting DOCX to MD: {e}")
|
||||
raise
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
# logger.info("Processing DOCX content")
|
||||
|
||||
# # Split content into sentences and apply masking
|
||||
# sentences = content.split("。")
|
||||
# final_md = ""
|
||||
# for sentence in sentences:
|
||||
# if sentence.strip(): # Only process non-empty sentences
|
||||
# formatted_prompt = get_masking_mapping_prompt(sentence)
|
||||
# logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.info(f"Response generated: {response}")
|
||||
# final_md += response + "。"
|
||||
|
||||
# return final_md
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
# Ensure output path has .md extension
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
base_name = os.path.splitext(os.path.basename(self.output_path))[0]
|
||||
md_output_path = os.path.join(output_dir, f"{base_name}.md")
|
||||
|
||||
logger.info(f"Saving masked content to: {md_output_path}")
|
||||
try:
|
||||
with open(md_output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
logger.info(f"Successfully saved content to {md_output_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving content: {e}")
|
||||
raise
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
import os
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MarkdownDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
"""Read markdown content from file"""
|
||||
try:
|
||||
with open(self.input_path, 'r', encoding='utf-8') as file:
|
||||
content = file.read()
|
||||
logger.info(f"Successfully read markdown content from {self.input_path}")
|
||||
return content
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading markdown file {self.input_path}: {e}")
|
||||
raise
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
"""Save processed markdown content"""
|
||||
try:
|
||||
# Ensure output directory exists
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
logger.info(f"Successfully saved masked content to {self.output_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving content to {self.output_path}: {e}")
|
||||
raise
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import os
|
||||
import PyPDF2
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||
from magic_pdf.data.dataset import PymuDocDataset
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||
from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
||||
import logging
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.output_dir = os.path.dirname(output_path)
|
||||
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||
|
||||
# Setup output directories
|
||||
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||
self.image_dir = os.path.basename(self.local_image_dir)
|
||||
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||
|
||||
# Setup work directory under output directory
|
||||
self.work_dir = os.path.join(
|
||||
os.path.dirname(output_path),
|
||||
".work",
|
||||
os.path.splitext(os.path.basename(input_path))[0]
|
||||
)
|
||||
os.makedirs(self.work_dir, exist_ok=True)
|
||||
|
||||
self.work_local_image_dir = os.path.join(self.work_dir, "images")
|
||||
self.work_image_dir = os.path.basename(self.work_local_image_dir)
|
||||
os.makedirs(self.work_local_image_dir, exist_ok=True)
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
logger.info("Starting PDF content processing")
|
||||
|
||||
# Read the PDF file
|
||||
with open(self.input_path, 'rb') as file:
|
||||
content = file.read()
|
||||
|
||||
# Initialize writers
|
||||
image_writer = FileBasedDataWriter(self.work_local_image_dir)
|
||||
md_writer = FileBasedDataWriter(self.work_dir)
|
||||
|
||||
# Create Dataset Instance
|
||||
ds = PymuDocDataset(content)
|
||||
|
||||
logger.info("Classifying PDF type: %s", ds.classify())
|
||||
# Process based on PDF type
|
||||
if ds.classify() == SupportedPdfParseMethod.OCR:
|
||||
infer_result = ds.apply(doc_analyze, ocr=True)
|
||||
pipe_result = infer_result.pipe_ocr_mode(image_writer)
|
||||
else:
|
||||
infer_result = ds.apply(doc_analyze, ocr=False)
|
||||
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
||||
|
||||
logger.info("Generating all outputs")
|
||||
# Generate all outputs
|
||||
infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf"))
|
||||
model_inference_result = infer_result.get_infer_res()
|
||||
|
||||
pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf"))
|
||||
pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf"))
|
||||
|
||||
md_content = pipe_result.get_markdown(self.work_image_dir)
|
||||
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir)
|
||||
|
||||
content_list = pipe_result.get_content_list(self.work_image_dir)
|
||||
pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir)
|
||||
|
||||
middle_json = pipe_result.get_middle_json()
|
||||
pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
|
||||
|
||||
return md_content
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
# logger.info("Starting content masking process")
|
||||
# sentences = content.split("。")
|
||||
# final_md = ""
|
||||
# for sentence in sentences:
|
||||
# if not sentence.strip(): # Skip empty sentences
|
||||
# continue
|
||||
# formatted_prompt = get_masking_mapping_prompt(sentence)
|
||||
# logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.info(f"Response generated: {response}")
|
||||
# final_md += response + "。"
|
||||
# return final_md
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
# Ensure output path has .md extension
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
base_name = os.path.splitext(os.path.basename(self.output_path))[0]
|
||||
md_output_path = os.path.join(output_dir, f"{base_name}.md")
|
||||
|
||||
logger.info(f"Saving masked content to: {md_output_path}")
|
||||
with open(md_output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from ...prompts.masking_prompts import get_masking_prompt
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
class TxtDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__()
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
with open(self.input_path, 'r', encoding='utf-8') as file:
|
||||
return file.read()
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
|
||||
# formatted_prompt = get_masking_prompt(content)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.debug(f"Processed content: {response}")
|
||||
# return response
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
import logging
|
||||
from ..document_handlers.document_factory import DocumentProcessorFactory
|
||||
from ..services.ollama_client import OllamaClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentService:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def process_document(self, input_path: str, output_path: str) -> bool:
|
||||
try:
|
||||
processor = DocumentProcessorFactory.create_processor(input_path, output_path)
|
||||
if not processor:
|
||||
logger.error(f"Unsupported file format: {input_path}")
|
||||
return False
|
||||
|
||||
# Read content
|
||||
content = processor.read_content()
|
||||
|
||||
# Process with Ollama
|
||||
masked_content = processor.process_content(content)
|
||||
|
||||
# Save processed content
|
||||
processor.save_content(masked_content)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing document {input_path}: {str(e)}")
|
||||
return False
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
def read_file(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
return file.read()
|
||||
|
||||
def write_file(file_path, content):
|
||||
with open(file_path, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
def file_exists(file_path):
|
||||
import os
|
||||
return os.path.isfile(file_path)
|
||||
|
||||
def delete_file(file_path):
|
||||
import os
|
||||
if file_exists(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
def list_files_in_directory(directory_path):
|
||||
import os
|
||||
return [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from .core.config import settings
|
||||
from .api.endpoints import mineru
|
||||
from .core.database import engine, Base
|
||||
|
||||
# Create database tables
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
||||
app = FastAPI(
|
||||
title=settings.PROJECT_NAME,
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json"
|
||||
)
|
||||
|
||||
# Set up CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # In production, replace with specific origins
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Include routers
|
||||
# app.include_router(
|
||||
# files.router,
|
||||
# prefix=f"{settings.API_V1_STR}/files",
|
||||
# tags=["files"]
|
||||
# )
|
||||
|
||||
app.include_router(
|
||||
mineru.router,
|
||||
prefix=f"{settings.API_V1_STR}/mineru",
|
||||
tags=["mineru"]
|
||||
)
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {"message": "Welcome to Legal Document Masker API"}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
from sqlalchemy import Column, String, DateTime, Text
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
from ..core.database import Base
|
||||
|
||||
class FileStatus(str):
|
||||
NOT_STARTED = "not_started"
|
||||
PROCESSING = "processing"
|
||||
SUCCESS = "success"
|
||||
FAILED = "failed"
|
||||
|
||||
class File(Base):
|
||||
__tablename__ = "files"
|
||||
|
||||
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
|
||||
filename = Column(String(255), nullable=False)
|
||||
original_path = Column(String(255), nullable=False)
|
||||
processed_path = Column(String(255))
|
||||
status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
|
||||
error_message = Column(Text)
|
||||
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
class FileBase(BaseModel):
|
||||
filename: str
|
||||
status: str
|
||||
error_message: Optional[str] = None
|
||||
|
||||
class FileResponse(FileBase):
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
class FileList(BaseModel):
|
||||
files: list[FileResponse]
|
||||
total: int
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
version: '3.8'
|
||||
|
||||
services:
|
||||
api:
|
||||
build: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
celery_worker:
|
||||
build: .
|
||||
command: celery -A app.services.file_service worker --loglevel=info
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
- api
|
||||
|
||||
redis:
|
||||
image: redis:alpine
|
||||
ports:
|
||||
- "6379:6379"
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"name": "mineru",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {}
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
# FastAPI and server
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
python-multipart>=0.0.6
|
||||
websockets>=12.0
|
||||
|
||||
# Database
|
||||
sqlalchemy>=2.0.0
|
||||
alembic>=1.12.0
|
||||
|
||||
# Background tasks
|
||||
celery>=5.3.0
|
||||
redis>=5.0.0
|
||||
|
||||
# Security
|
||||
python-jose[cryptography]>=3.3.0
|
||||
passlib[bcrypt]>=1.7.4
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
httpx>=0.25.0
|
||||
|
||||
|
||||
watchdog==2.1.6
|
||||
requests==2.28.1
|
||||
mineru==2.0.6
|
||||
numpy==1.24.3
|
||||
scikit-learn==1.3.0
|
||||
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for Mineru API endpoints
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# API base URL
|
||||
BASE_URL = "http://localhost:8000/api/v1/mineru"
|
||||
|
||||
def test_health_check():
|
||||
"""Test the health check endpoint"""
|
||||
print("Testing health check...")
|
||||
response = requests.get(f"{BASE_URL}/health")
|
||||
print(f"Status: {response.status_code}")
|
||||
print(f"Response: {response.json()}")
|
||||
print()
|
||||
|
||||
def test_parse_document(file_path: str):
|
||||
"""Test document parsing endpoint"""
|
||||
print(f"Testing document parsing with file: {file_path}")
|
||||
|
||||
# Check if file exists
|
||||
if not Path(file_path).exists():
|
||||
print(f"Error: File {file_path} not found")
|
||||
return
|
||||
|
||||
# Prepare the file upload
|
||||
with open(file_path, 'rb') as f:
|
||||
files = {'file': (Path(file_path).name, f, 'application/pdf')}
|
||||
|
||||
# Prepare parameters
|
||||
params = {
|
||||
'lang': 'ch',
|
||||
'backend': 'pipeline',
|
||||
'method': 'auto',
|
||||
'formula_enable': True,
|
||||
'table_enable': True,
|
||||
'draw_layout_bbox': True,
|
||||
'draw_span_bbox': True,
|
||||
'dump_md': True,
|
||||
'dump_middle_json': True,
|
||||
'dump_model_output': True,
|
||||
'dump_orig_pdf': True,
|
||||
'dump_content_list': True,
|
||||
'make_md_mode': 'MM_MD'
|
||||
}
|
||||
|
||||
# Make the request
|
||||
response = requests.post(f"{BASE_URL}/parse", files=files, params=params)
|
||||
|
||||
print(f"Status: {response.status_code}")
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
print("Parse successful!")
|
||||
print(f"File name: {result['file_name']}")
|
||||
print(f"Output directory: {result['output_directory']}")
|
||||
print("Generated outputs:")
|
||||
for output_type, output_path in result['outputs'].items():
|
||||
print(f" - {output_type}: {output_path}")
|
||||
else:
|
||||
print(f"Error: {response.text}")
|
||||
print()
|
||||
|
||||
def test_download_file(file_path: str):
|
||||
"""Test file download endpoint"""
|
||||
print(f"Testing file download: {file_path}")
|
||||
|
||||
response = requests.get(f"{BASE_URL}/download/{file_path}")
|
||||
print(f"Status: {response.status_code}")
|
||||
|
||||
if response.status_code == 200:
|
||||
# Save the downloaded file
|
||||
output_filename = f"downloaded_{Path(file_path).name}"
|
||||
with open(output_filename, 'wb') as f:
|
||||
f.write(response.content)
|
||||
print(f"File downloaded successfully as: {output_filename}")
|
||||
else:
|
||||
print(f"Error: {response.text}")
|
||||
print()
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Mineru API Test Script")
|
||||
print("=" * 50)
|
||||
|
||||
# Test health check
|
||||
test_health_check()
|
||||
|
||||
# Test document parsing (you'll need to provide a PDF file)
|
||||
# Uncomment and modify the path below to test with your own file
|
||||
# test_parse_document("path/to/your/document.pdf")
|
||||
|
||||
# Example of how to test file download (after parsing)
|
||||
# test_download_file("some_uuid/document_name.md")
|
||||
|
||||
print("Test completed!")
|
||||
print("\nTo test document parsing:")
|
||||
print("1. Uncomment the test_parse_document line above")
|
||||
print("2. Provide a valid PDF file path")
|
||||
print("3. Run the script again")
|
||||
print("\nTo test file download:")
|
||||
print("1. First run a parse operation to get file paths")
|
||||
print("2. Use the output paths from the parse result")
|
||||
print("3. Uncomment and modify the test_download_file line")
|
||||