diff --git a/mineru/Dockerfile b/mineru/Dockerfile
new file mode 100644
index 0000000..c59ce26
--- /dev/null
+++ b/mineru/Dockerfile
@@ -0,0 +1,34 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install system dependencies (LibreOffice is used for .doc/.docx conversion)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libreoffice \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first to leverage the Docker layer cache
+COPY requirements.txt .
+
+# Pre-download the MinerU models so they are baked into the image
+# rather than fetched on the first request
+RUN pip install huggingface_hub
+RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
+RUN python download_models_hf.py
+
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install -U "magic-pdf[full]"
+
+# Copy the rest of the application
+COPY . .
+
+# Create storage directories
+RUN mkdir -p storage/uploads storage/processed
+
+# Expose the port the app runs on
+EXPOSE 8000
+
+# Command to run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
diff --git a/mineru/MINERU_API_README.md b/mineru/MINERU_API_README.md
new file mode 100644
index 0000000..91a3247
--- /dev/null
+++ b/mineru/MINERU_API_README.md
@@ -0,0 +1,201 @@
+# Mineru API Documentation
+
+This document describes the FastAPI interface for the Mineru document parsing service.
+
+## Overview
+
+The Mineru API provides endpoints for parsing documents (PDFs, images) using OCR and layout analysis. It supports both pipeline and VLM backends for different use cases.
+
+## Base URL
+
+```
+http://localhost:8000/api/v1/mineru
+```
+
+## Endpoints
+
+### 1. Health Check
+
+**GET** `/health`
+
+Check whether the Mineru service is running.
+
+**Response:**
+```json
+{
+  "status": "healthy",
+  "service": "mineru"
+}
+```
+
+### 2. Parse Document
+
+**POST** `/parse`
+
+Parse a document using Mineru's parsing capabilities.
+
+**Parameters:**
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `file` | File | Required | The document file to parse (PDF, PNG, JPEG, JPG) |
+| `lang` | string | "ch" | Language option ('ch', 'en', 'korean', 'japan', etc.) |
+| `backend` | string | "pipeline" | Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client') |
+| `method` | string | "auto" | Method for parsing ('auto', 'txt', 'ocr') |
+| `server_url` | string | null | Server URL (required when `backend` is 'vlm-sglang-client') |
+| `start_page_id` | int | 0 | Zero-based index of the first page to parse |
+| `end_page_id` | int | null | Zero-based index of the last page to parse (null parses to the end) |
+| `formula_enable` | boolean | true | Enable formula parsing |
+| `table_enable` | boolean | true | Enable table parsing |
+| `draw_layout_bbox` | boolean | true | Whether to draw layout bounding boxes |
+| `draw_span_bbox` | boolean | true | Whether to draw span bounding boxes |
+| `dump_md` | boolean | true | Whether to dump markdown files |
+| `dump_middle_json` | boolean | true | Whether to dump middle JSON files |
+| `dump_model_output` | boolean | true | Whether to dump model output files |
+| `dump_orig_pdf` | boolean | true | Whether to dump original PDF files |
+| `dump_content_list` | boolean | true | Whether to dump content list files |
+| `make_md_mode` | string | "MM_MD" | Markdown generation mode ('MM_MD' for multimodal markdown; any other value falls back to content-list mode) |
+
+**Response:**
+```json
+{
+  "status": "success",
+  "file_name": "document_name",
+  "outputs": {
+    "markdown": "/path/to/document_name.md",
+    "middle_json": "/path/to/document_name_middle.json",
+    "model_output": "/path/to/document_name_model.json",
+    "content_list": "/path/to/document_name_content_list.json",
+    "original_pdf": "/path/to/document_name_origin.pdf",
+    "layout_pdf": "/path/to/document_name_layout.pdf",
+    "span_pdf": "/path/to/document_name_span.pdf"
+  },
+  "output_directory": "/path/to/output/directory"
+}
+```
+
+### 3. Download Processed File
+
+**GET** `/download/{file_path}`
+
+Download a processed file from the Mineru output directory.
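+
+For example, if a parse run reports `"markdown": "/app/storage/processed/mineru/<uuid>/document/auto/document.md"`, the download request is `GET /download/<uuid>/document/auto/document.md`: only the portion below the `mineru/` output directory is passed (the `<uuid>` segment here is illustrative).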
+
+**Parameters:**
+- `file_path`: Path to the file, relative to the mineru output directory
+
+**Response:** File download
+
+## Usage Examples
+
+### Python Example
+
+```python
+import requests
+
+# Parse a document
+with open('document.pdf', 'rb') as f:
+    files = {'file': ('document.pdf', f, 'application/pdf')}
+    params = {
+        'lang': 'ch',
+        'backend': 'pipeline',
+        'method': 'auto',
+        'formula_enable': True,
+        'table_enable': True
+    }
+
+    response = requests.post(
+        'http://localhost:8000/api/v1/mineru/parse',
+        files=files,
+        params=params
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        print(f"Parsed successfully: {result['file_name']}")
+
+        # The parse response contains absolute server paths, while the download
+        # endpoint expects a path relative to the mineru output directory, so
+        # strip everything up to and including "mineru/"
+        md_path = result['outputs']['markdown']
+        rel_path = md_path.split('/mineru/', 1)[-1]
+        download_response = requests.get(
+            f'http://localhost:8000/api/v1/mineru/download/{rel_path}'
+        )
+
+        with open('output.md', 'wb') as out:
+            out.write(download_response.content)
+```
+
+### cURL Example
+
+```bash
+# Parse a document (the parse options are query parameters, not form fields)
+curl -X POST "http://localhost:8000/api/v1/mineru/parse?lang=ch&backend=pipeline&method=auto" \
+  -F "file=@document.pdf"
+
+# Download a processed file
+curl -X GET "http://localhost:8000/api/v1/mineru/download/path/to/file.md" \
+  -o downloaded_file.md
+```
+
+## Backend Options
+
+### Pipeline Backend
+- **Use case**: General purpose, more robust
+- **Advantages**: Better for complex layouts, supports multiple languages
+- **Usage**: `backend=pipeline`
+
+### VLM Backends
+- **vlm-transformers**: General-purpose VLM backend
+- **vlm-sglang-engine**: Faster, engine-based approach
+- **vlm-sglang-client**: Fastest, client-based approach (requires `server_url`)
+
+## Language Support
+
+Supported languages for the pipeline backend include:
+- `ch`: Chinese (Simplified)
+- `en`: English
+- `korean`: Korean
+- `japan`: Japanese
+- `chinese_cht`: Chinese (Traditional)
+- `ta`: Tamil
+- `te`: Telugu
+- `ka`: Kannada
+
+## Output Files
+
+The API generates various output files depending on the parameters:
+
+1. **Markdown** (`.md`): Structured text content
+2. **Middle JSON** (`.json`): Intermediate parsing results
+3. **Model Output** (`.json` or `.txt`): Raw model predictions
+4. **Content List** (`.json`): Structured content list
+5. **Original PDF**: Copy of the input file
+6. **Layout PDF**: PDF with layout bounding boxes
+7. **Span PDF**: PDF with span bounding boxes
+
+## Error Handling
+
+The API returns appropriate HTTP status codes:
+
+- `200`: Success
+- `400`: Bad request (invalid parameters, unsupported file type)
+- `404`: File not found
+- `500`: Internal server error
+
+Error responses include a detail message explaining the issue.
+
+## Testing
+
+Use the provided test script to verify the API:
+
+```bash
+python test_mineru_api.py
+```
+
+## Notes
+
+- The API creates a unique output directory for each request to avoid conflicts
+- Temporary files are automatically cleaned up after processing
+- File downloads are restricted to the processed folder for security
+- Large files may take time to process, depending on the backend and document complexity
\ No newline at end of file
diff --git a/mineru/README.md b/mineru/README.md
new file mode 100644
index 0000000..36cae2a
--- /dev/null
+++ b/mineru/README.md
@@ -0,0 +1,103 @@
+# Legal Document Masker API
+
+This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
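+
+For a quick smoke test once the server is up, you can hit the health route wired up in `app/api/endpoints/mineru.py` (default port 8000 assumed):
+
+```bash
+curl http://localhost:8000/api/v1/mineru/health
+```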
+
+## Prerequisites
+
+- Python 3.8+
+- Redis (for Celery)
+
+## File Storage
+
+Files are stored in the following structure:
+```
+mineru/
+├── storage/
+│   ├── uploads/    # Original uploaded files
+│   └── processed/  # Masked/processed files
+```
+
+## Setup
+
+### Option 1: Local Development
+
+1. Create a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Set up environment variables:
+Create a `.env` file in the `mineru/` directory with the following variables:
+```env
+SECRET_KEY=your-secret-key-here
+```
+
+The SQLite database will be created automatically the first time you run the application.
+
+4. Start Redis (required for Celery):
+```bash
+redis-server
+```
+
+5. Start the Celery worker:
+```bash
+celery -A app.services.file_service worker --loglevel=info
+```
+
+6. Start the FastAPI server:
+```bash
+uvicorn app.main:app --reload
+```
+
+### Option 2: Docker Deployment
+
+1. Build and start the services:
+```bash
+docker-compose up --build
+```
+
+This will start:
+- FastAPI server on port 8000
+- Celery worker for background processing
+- Redis for the task queue
+
+## API Documentation
+
+Once the server is running, you can access:
+- Swagger UI: `http://localhost:8000/docs`
+- ReDoc: `http://localhost:8000/redoc`
+
+## API Endpoints
+
+- `POST /api/v1/files/upload` - Upload a new file
+- `GET /api/v1/files` - List all files
+- `GET /api/v1/files/{file_id}` - Get file details
+- `GET /api/v1/files/{file_id}/download` - Download processed file
+- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
+
+## Development
+
+### Running Tests
+```bash
+pytest
+```
+
+### Code Style
+The project uses Black for code formatting:
+```bash
+black .
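+# or just check formatting without rewriting files
+black --check .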
+``` + +### Docker Commands + +- Start services: `docker-compose up` +- Start in background: `docker-compose up -d` +- Stop services: `docker-compose down` +- View logs: `docker-compose logs -f` +- Rebuild: `docker-compose up --build` \ No newline at end of file diff --git a/mineru/app/api/endpoints/mineru.py b/mineru/app/api/endpoints/mineru.py new file mode 100644 index 0000000..73182ab --- /dev/null +++ b/mineru/app/api/endpoints/mineru.py @@ -0,0 +1,329 @@ +from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks +from fastapi.responses import FileResponse +from typing import List, Optional +import os +import tempfile +import shutil +import json +from pathlib import Path +import uuid +from loguru import logger + +from ...core.config import settings + +# Import mineru functions +from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn +from mineru.data.data_reader_writer import FileBasedDataWriter +from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox +from mineru.utils.enum_class import MakeMode +from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze +from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze +from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make +from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json +from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make + +router = APIRouter() + +class MineruParseRequest: + def __init__( + self, + lang: str = "ch", + backend: str = "pipeline", + method: str = "auto", + server_url: Optional[str] = None, + start_page_id: int = 0, + end_page_id: Optional[int] = None, + formula_enable: bool = True, + table_enable: bool = True, + draw_layout_bbox: bool = True, + draw_span_bbox: bool = True, + dump_md: bool = True, + dump_middle_json: bool = True, + dump_model_output: bool = True, + dump_orig_pdf: bool = True, + dump_content_list: bool = True, + make_md_mode: str = "MM_MD" + ): + self.lang = lang + self.backend = backend + self.method = method + self.server_url = server_url + self.start_page_id = start_page_id + self.end_page_id = end_page_id + self.formula_enable = formula_enable + self.table_enable = table_enable + self.draw_layout_bbox = draw_layout_bbox + self.draw_span_bbox = draw_span_bbox + self.dump_md = dump_md + self.dump_middle_json = dump_middle_json + self.dump_model_output = dump_model_output + self.dump_orig_pdf = dump_orig_pdf + self.dump_content_list = dump_content_list + self.make_md_mode = MakeMode.MM_MD if make_md_mode == "MM_MD" else MakeMode.CONTENT_LIST + +async def process_mineru_document( + file: UploadFile, + request: MineruParseRequest, + output_dir: Path +) -> dict: + """Process a single document using Mineru""" + try: + # Read file content + content = await file.read() + + # Create temporary file + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix) as temp_file: + temp_file.write(content) + temp_file_path = Path(temp_file.name) + + try: + # Prepare environment + file_name = Path(file.filename).stem + local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method) + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir) + + # Convert PDF bytes if needed + if request.backend == "pipeline": + new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2( + content, 
request.start_page_id, request.end_page_id + ) + + # Analyze document + infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze( + [new_pdf_bytes], [request.lang], + parse_method=request.method, + formula_enable=request.formula_enable, + table_enable=request.table_enable + ) + + # Process results + model_list = infer_results[0] + images_list = all_image_lists[0] + pdf_doc = all_pdf_docs[0] + _lang = lang_list[0] + _ocr_enable = ocr_enabled_list[0] + + middle_json = pipeline_result_to_middle_json( + model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, request.formula_enable + ) + + pdf_info = middle_json["pdf_info"] + + # Generate outputs + outputs = {} + + if request.draw_layout_bbox: + draw_layout_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_layout.pdf") + outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf") + + if request.draw_span_bbox: + draw_span_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_span.pdf") + outputs["span_pdf"] = str(local_md_dir / f"{file_name}_span.pdf") + + if request.dump_orig_pdf: + md_writer.write(f"{file_name}_origin.pdf", new_pdf_bytes) + outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf") + + if request.dump_md: + image_dir = str(os.path.basename(local_image_dir)) + md_content_str = pipeline_union_make(pdf_info, request.make_md_mode, image_dir) + md_writer.write_string(f"{file_name}.md", md_content_str) + outputs["markdown"] = str(local_md_dir / f"{file_name}.md") + + if request.dump_content_list: + image_dir = str(os.path.basename(local_image_dir)) + content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) + md_writer.write_string( + f"{file_name}_content_list.json", + json.dumps(content_list, ensure_ascii=False, indent=4) + ) + outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json") + + if request.dump_middle_json: + md_writer.write_string( + f"{file_name}_middle.json", + json.dumps(middle_json, ensure_ascii=False, indent=4) + ) + outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json") + + if request.dump_model_output: + md_writer.write_string( + f"{file_name}_model.json", + json.dumps(model_list, ensure_ascii=False, indent=4) + ) + outputs["model_output"] = str(local_md_dir / f"{file_name}_model.json") + + else: + # VLM backend + if request.backend.startswith("vlm-"): + backend = request.backend[4:] + + middle_json, infer_result = vlm_doc_analyze( + content, image_writer=image_writer, + backend=backend, server_url=request.server_url + ) + + pdf_info = middle_json["pdf_info"] + + # Generate outputs for VLM + outputs = {} + + if request.draw_layout_bbox: + draw_layout_bbox(pdf_info, content, local_md_dir, f"{file_name}_layout.pdf") + outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf") + + if request.dump_orig_pdf: + md_writer.write(f"{file_name}_origin.pdf", content) + outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf") + + if request.dump_md: + image_dir = str(os.path.basename(local_image_dir)) + md_content_str = vlm_union_make(pdf_info, request.make_md_mode, image_dir) + md_writer.write_string(f"{file_name}.md", md_content_str) + outputs["markdown"] = str(local_md_dir / f"{file_name}.md") + + if request.dump_content_list: + image_dir = str(os.path.basename(local_image_dir)) + content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir) + md_writer.write_string( + f"{file_name}_content_list.json", + json.dumps(content_list, 
ensure_ascii=False, indent=4) + ) + outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json") + + if request.dump_middle_json: + md_writer.write_string( + f"{file_name}_middle.json", + json.dumps(middle_json, ensure_ascii=False, indent=4) + ) + outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json") + + if request.dump_model_output: + model_output = ("\n" + "-" * 50 + "\n").join(infer_result) + md_writer.write_string(f"{file_name}_model_output.txt", model_output) + outputs["model_output"] = str(local_md_dir / f"{file_name}_model_output.txt") + + return { + "status": "success", + "file_name": file_name, + "outputs": outputs, + "output_directory": str(local_md_dir) + } + + finally: + # Clean up temporary file + if temp_file_path.exists(): + temp_file_path.unlink() + + except Exception as e: + logger.exception(f"Error processing document: {e}") + raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}") + +@router.post("/parse") +async def parse_document( + file: UploadFile = File(...), + lang: str = "ch", + backend: str = "pipeline", + method: str = "auto", + server_url: Optional[str] = None, + start_page_id: int = 0, + end_page_id: Optional[int] = None, + formula_enable: bool = True, + table_enable: bool = True, + draw_layout_bbox: bool = True, + draw_span_bbox: bool = True, + dump_md: bool = True, + dump_middle_json: bool = True, + dump_model_output: bool = True, + dump_orig_pdf: bool = True, + dump_content_list: bool = True, + make_md_mode: str = "MM_MD" +): + """ + Parse a document using Mineru API + + Parameters: + - file: The document file to parse (PDF, image, etc.) + - lang: Language option (default: 'ch') + - backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client') + - method: Method for parsing ('auto', 'txt', 'ocr') + - server_url: Server URL for vlm-sglang-client backend + - start_page_id: Start page ID for parsing + - end_page_id: End page ID for parsing + - formula_enable: Enable formula parsing + - table_enable: Enable table parsing + - draw_layout_bbox: Whether to draw layout bounding boxes + - draw_span_bbox: Whether to draw span bounding boxes + - dump_md: Whether to dump markdown files + - dump_middle_json: Whether to dump middle JSON files + - dump_model_output: Whether to dump model output files + - dump_orig_pdf: Whether to dump original PDF files + - dump_content_list: Whether to dump content list files + - make_md_mode: The mode for making markdown content + """ + + # Validate file type + allowed_extensions = {".pdf", ".png", ".jpeg", ".jpg"} + file_extension = Path(file.filename).suffix.lower() + if file_extension not in allowed_extensions: + raise HTTPException( + status_code=400, + detail=f"File type not allowed. 
Allowed types: {', '.join(allowed_extensions)}"
+        )
+
+    # Validate backend early so an unknown value fails with a 400 rather
+    # than an internal error later in processing
+    allowed_backends = {"pipeline", "vlm-transformers", "vlm-sglang-engine", "vlm-sglang-client"}
+    if backend not in allowed_backends:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unknown backend '{backend}'. Allowed backends: {', '.join(sorted(allowed_backends))}"
+        )
+
+    # Create request object
+    request = MineruParseRequest(
+        lang=lang,
+        backend=backend,
+        method=method,
+        server_url=server_url,
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
+        formula_enable=formula_enable,
+        table_enable=table_enable,
+        draw_layout_bbox=draw_layout_bbox,
+        draw_span_bbox=draw_span_bbox,
+        dump_md=dump_md,
+        dump_middle_json=dump_middle_json,
+        dump_model_output=dump_model_output,
+        dump_orig_pdf=dump_orig_pdf,
+        dump_content_list=dump_content_list,
+        make_md_mode=make_md_mode
+    )
+
+    # Create output directory
+    output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Process document
+    result = await process_mineru_document(file, request, output_dir)
+
+    return result
+
+@router.get("/download/{file_path:path}")
+async def download_processed_file(file_path: str):
+    """Download a processed file from the mineru output directory"""
+    try:
+        # Construct the full path
+        root = (settings.PROCESSED_FOLDER / "mineru").resolve()
+        full_path = (root / file_path).resolve()
+
+        # Security check: resolve() folds away any ".." segments, so the
+        # prefix comparison cannot be bypassed with path traversal
+        if not str(full_path).startswith(str(root) + os.sep):
+            raise HTTPException(status_code=400, detail="Invalid file path")
+
+        if not full_path.exists():
+            raise HTTPException(status_code=404, detail="File not found")
+
+        return FileResponse(
+            path=str(full_path),
+            filename=full_path.name,
+            media_type="application/octet-stream"
+        )
+
+    except HTTPException:
+        # Re-raise as-is so the 400/404 above are not masked as 500s
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}")
+
+@router.get("/health")
+async def health_check():
+    """Health check endpoint for mineru service"""
+    return {"status": "healthy", "service": "mineru"}
diff --git a/mineru/app/core/config.py b/mineru/app/core/config.py
new file mode 100644
index 0000000..56c8bff
--- /dev/null
+++ b/mineru/app/core/config.py
@@ -0,0 +1,54 @@
+from pydantic_settings import BaseSettings
+from pathlib import Path
+
+class Settings(BaseSettings):
+    # API Settings
+    API_V1_STR: str = "/api/v1"
+    PROJECT_NAME: str = "Legal Document Masker API"
+
+    # Security
+    SECRET_KEY: str = "your-secret-key-here"  # Change in production
+    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days
+
+    # Database
+    BASE_DIR: Path = Path(__file__).parent.parent.parent
+    DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"
+
+    # File Storage
+    UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
+    PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
+    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
+    ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}
+
+    # Celery
+    CELERY_BROKER_URL: str = "redis://redis:6379/0"
+    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
+
+    # Ollama API settings
+    OLLAMA_API_URL: str = "https://api.ollama.com"
+    OLLAMA_API_KEY: str = ""
+    OLLAMA_MODEL: str = "llama2"
+
+    # Logging settings
+    LOG_LEVEL: str = "INFO"
+    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
+    LOG_FILE: str = "app.log"
+
+    class Config:
+        case_sensitive = True
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+        extra = "allow"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Create storage directories if they don't exist
+        self.UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
+        self.PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True)
+        # Create storage directory for database
+        
(self.BASE_DIR / "storage").mkdir(parents=True, exist_ok=True) + +settings = Settings() \ No newline at end of file diff --git a/mineru/app/core/config/logging_config.py b/mineru/app/core/config/logging_config.py new file mode 100644 index 0000000..3e3c7a3 --- /dev/null +++ b/mineru/app/core/config/logging_config.py @@ -0,0 +1,40 @@ +import logging.config +# from config.settings import settings +from .settings import settings + +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": settings.LOG_FORMAT, + "datefmt": settings.LOG_DATE_FORMAT + }, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "standard", + "level": settings.LOG_LEVEL, + "stream": "ext://sys.stdout" + }, + "file": { + "class": "logging.FileHandler", + "formatter": "standard", + "level": settings.LOG_LEVEL, + "filename": settings.LOG_FILE, + "mode": "a", + } + }, + "loggers": { + "": { # root logger + "handlers": ["console", "file"], + "level": settings.LOG_LEVEL, + "propagate": True + } + } +} + +def setup_logging(): + """Initialize logging configuration""" + logging.config.dictConfig(LOGGING_CONFIG) \ No newline at end of file diff --git a/mineru/app/core/config/settings.py b/mineru/app/core/config/settings.py new file mode 100644 index 0000000..e69de29 diff --git a/mineru/app/core/database.py b/mineru/app/core/database.py new file mode 100644 index 0000000..e1dca40 --- /dev/null +++ b/mineru/app/core/database.py @@ -0,0 +1,21 @@ +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from .config import settings + +# Create SQLite engine with check_same_thread=False for FastAPI +engine = create_engine( + settings.DATABASE_URL, + connect_args={"check_same_thread": False} +) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +Base = declarative_base() + +# Dependency +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() \ No newline at end of file diff --git a/mineru/app/core/document_handlers/document.py b/mineru/app/core/document_handlers/document.py new file mode 100644 index 0000000..d68b501 --- /dev/null +++ b/mineru/app/core/document_handlers/document.py @@ -0,0 +1,12 @@ +class Document: + def __init__(self, file_path): + self.file_path = file_path + self.content = "" + + def load(self): + with open(self.file_path, 'r') as file: + self.content = file.read() + + def save(self, target_path): + with open(target_path, 'w') as file: + file.write(self.content) \ No newline at end of file diff --git a/mineru/app/core/document_handlers/document_factory.py b/mineru/app/core/document_handlers/document_factory.py new file mode 100644 index 0000000..cb2a73f --- /dev/null +++ b/mineru/app/core/document_handlers/document_factory.py @@ -0,0 +1,28 @@ +import os +from typing import Optional +from .document_processor import DocumentProcessor +from .processors import ( + TxtDocumentProcessor, + DocxDocumentProcessor, + PdfDocumentProcessor, + MarkdownDocumentProcessor +) + +class DocumentProcessorFactory: + @staticmethod + def create_processor(input_path: str, output_path: str) -> Optional[DocumentProcessor]: + file_extension = os.path.splitext(input_path)[1].lower() + + processors = { + '.txt': TxtDocumentProcessor, + '.docx': DocxDocumentProcessor, + '.doc': DocxDocumentProcessor, + '.pdf': PdfDocumentProcessor, + '.md': MarkdownDocumentProcessor, + '.markdown': MarkdownDocumentProcessor + } + + 
processor_class = processors.get(file_extension) + if processor_class: + return processor_class(input_path, output_path) + return None \ No newline at end of file diff --git a/mineru/app/core/document_handlers/document_processor.py b/mineru/app/core/document_handlers/document_processor.py new file mode 100644 index 0000000..8fef8e0 --- /dev/null +++ b/mineru/app/core/document_handlers/document_processor.py @@ -0,0 +1,192 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict +from ..prompts.masking_prompts import get_masking_mapping_prompt +import logging +import json +from ..services.ollama_client import OllamaClient +from ...core.config import settings +from ..utils.json_extractor import LLMJsonExtractor + + + +logger = logging.getLogger(__name__) + +class DocumentProcessor(ABC): + def __init__(self): + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + self.max_chunk_size = 1000 # Maximum number of characters per chunk + self.max_retries = 3 # Maximum number of retries for mapping generation + + @abstractmethod + def read_content(self) -> str: + """Read document content""" + pass + + def _split_into_chunks(self, sentences: list[str]) -> list[str]: + """Split sentences into chunks that don't exceed max_chunk_size""" + chunks = [] + current_chunk = "" + + for sentence in sentences: + if not sentence.strip(): + continue + + # If adding this sentence would exceed the limit, save current chunk and start new one + if len(current_chunk) + len(sentence) > self.max_chunk_size and current_chunk: + chunks.append(current_chunk) + current_chunk = sentence + else: + if current_chunk: + current_chunk += "。" + sentence + else: + current_chunk = sentence + + # Add the last chunk if it's not empty + if current_chunk: + chunks.append(current_chunk) + + return chunks + + def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool: + """ + Validate that the mapping follows the required format: + { + "原文1": "脱敏后1", + "原文2": "脱敏后2", + ... 
+        }
+        """
+        if not isinstance(mapping, dict):
+            logger.warning("Mapping is not a dictionary")
+            return False
+
+        # Check if any key or value is not a string
+        for key, value in mapping.items():
+            if not isinstance(key, str) or not isinstance(value, str):
+                logger.warning(f"Invalid mapping format - key or value is not a string: {key}: {value}")
+                return False
+
+        # Check if the mapping has any nested structures
+        if any(isinstance(v, (dict, list)) for v in mapping.values()):
+            logger.warning("Invalid mapping format - contains nested structures")
+            return False
+
+        return True
+
+    def _build_mapping(self, chunk: str) -> Dict[str, str]:
+        """Build mapping for a single chunk of text with retry logic"""
+        for attempt in range(self.max_retries):
+            try:
+                formatted_prompt = get_masking_mapping_prompt(chunk)
+                logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
+                response = self.ollama_client.generate(formatted_prompt)
+                logger.info(f"Raw response from LLM: {response}")
+
+                # Parse the JSON response into a dictionary
+                mapping = LLMJsonExtractor.parse_raw_json_str(response)
+                logger.info(f"Parsed mapping: {mapping}")
+
+                if mapping and self._validate_mapping_format(mapping):
+                    return mapping
+                logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
+            except Exception as e:
+                logger.error(f"Error generating mapping on attempt {attempt + 1}: {e}")
+
+        # Every attempt either raised or produced an invalid mapping; return an
+        # empty dict rather than falling through and implicitly returning None
+        logger.error("Max retries reached, returning empty mapping")
+        return {}
+
+    def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
+        """Apply the mapping to replace sensitive information"""
+        masked_text = text
+        for original, masked in mapping.items():
+            # Ensure masked value is a string
+            if isinstance(masked, dict):
+                # If it's a dict, use the first value or a default
+                masked = next(iter(masked.values()), "某")
+            elif not isinstance(masked, str):
+                # If it's not a string, convert to string or use default
+                masked = str(masked) if masked is not None else "某"
+            masked_text = masked_text.replace(original, masked)
+        return masked_text
+
+    def _get_next_suffix(self, value: str) -> str:
+        """Get the next available suffix for a value that already has a suffix"""
+        # Define the sequence of suffixes
+        suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
+
+        # Check if the value already has a suffix
+        for suffix in suffixes:
+            if value.endswith(suffix):
+                # Find the next suffix in the sequence
+                current_index = suffixes.index(suffix)
+                if current_index + 1 < len(suffixes):
+                    return value[:-1] + suffixes[current_index + 1]
+                else:
+                    # If we've used all suffixes, start over with the first one
+                    return value[:-1] + suffixes[0]
+
+        # If no suffix found, return the value with the first suffix
+        return value + '甲'
+
+    def _merge_mappings(self, existing: Dict[str, str], new: Dict[str, str]) -> Dict[str, str]:
+        """
+        Merge two mappings following the rules:
+        1. If key exists in existing, keep existing value
+        2.
If value exists in existing: + - If value ends with a suffix (甲乙丙丁...), add next suffix + - If no suffix, add '甲' + """ + result = existing.copy() + + # Get all existing values + existing_values = set(result.values()) + + for key, value in new.items(): + if key in result: + # Rule 1: Keep existing value if key exists + continue + + if value in existing_values: + # Rule 2: Handle duplicate values + new_value = self._get_next_suffix(value) + result[key] = new_value + existing_values.add(new_value) + else: + # No conflict, add as is + result[key] = value + existing_values.add(value) + + return result + + def process_content(self, content: str) -> str: + """Process document content by masking sensitive information""" + # Split content into sentences + sentences = content.split("。") + + # Split sentences into manageable chunks + chunks = self._split_into_chunks(sentences) + logger.info(f"Split content into {len(chunks)} chunks") + + # Build mapping for each chunk + combined_mapping = {} + for i, chunk in enumerate(chunks): + logger.info(f"Processing chunk {i+1}/{len(chunks)}") + chunk_mapping = self._build_mapping(chunk) + if chunk_mapping: # Only update if we got a valid mapping + combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping) + else: + logger.warning(f"Failed to generate mapping for chunk {i+1}") + + # Apply the combined mapping to the entire content + masked_content = self._apply_mapping(content, combined_mapping) + logger.info("Successfully masked content") + + return masked_content + + @abstractmethod + def save_content(self, content: str) -> None: + """Save processed content""" + pass \ No newline at end of file diff --git a/mineru/app/core/document_handlers/processors/__init__.py b/mineru/app/core/document_handlers/processors/__init__.py new file mode 100644 index 0000000..d8d35f0 --- /dev/null +++ b/mineru/app/core/document_handlers/processors/__init__.py @@ -0,0 +1,6 @@ +from .txt_processor import TxtDocumentProcessor +from .docx_processor import DocxDocumentProcessor +from .pdf_processor import PdfDocumentProcessor +from .md_processor import MarkdownDocumentProcessor + +__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor'] \ No newline at end of file diff --git a/mineru/app/core/document_handlers/processors/docx_processor.py b/mineru/app/core/document_handlers/processors/docx_processor.py new file mode 100644 index 0000000..598ba09 --- /dev/null +++ b/mineru/app/core/document_handlers/processors/docx_processor.py @@ -0,0 +1,77 @@ +import os +import docx +from ...document_handlers.document_processor import DocumentProcessor +from magic_pdf.data.data_reader_writer import FileBasedDataWriter +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.data.read_api import read_local_office +import logging +from ...services.ollama_client import OllamaClient +from ...config import settings +from ...prompts.masking_prompts import get_masking_mapping_prompt + +logger = logging.getLogger(__name__) + +class DocxDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + super().__init__() # Call parent class's __init__ + self.input_path = input_path + self.output_path = output_path + self.output_dir = os.path.dirname(output_path) + self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0] + + # Setup output directories + self.local_image_dir = os.path.join(self.output_dir, "images") + self.image_dir = 
os.path.basename(self.local_image_dir) + os.makedirs(self.local_image_dir, exist_ok=True) + + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + + def read_content(self) -> str: + try: + # Initialize writers + image_writer = FileBasedDataWriter(self.local_image_dir) + md_writer = FileBasedDataWriter(self.output_dir) + + # Create Dataset Instance and process + ds = read_local_office(self.input_path)[0] + pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer) + + # Generate markdown + md_content = pipe_result.get_markdown(self.image_dir) + pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir) + + return md_content + except Exception as e: + logger.error(f"Error converting DOCX to MD: {e}") + raise + + # def process_content(self, content: str) -> str: + # logger.info("Processing DOCX content") + + # # Split content into sentences and apply masking + # sentences = content.split("。") + # final_md = "" + # for sentence in sentences: + # if sentence.strip(): # Only process non-empty sentences + # formatted_prompt = get_masking_mapping_prompt(sentence) + # logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt) + # response = self.ollama_client.generate(formatted_prompt) + # logger.info(f"Response generated: {response}") + # final_md += response + "。" + + # return final_md + + def save_content(self, content: str) -> None: + # Ensure output path has .md extension + output_dir = os.path.dirname(self.output_path) + base_name = os.path.splitext(os.path.basename(self.output_path))[0] + md_output_path = os.path.join(output_dir, f"{base_name}.md") + + logger.info(f"Saving masked content to: {md_output_path}") + try: + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(content) + logger.info(f"Successfully saved content to {md_output_path}") + except Exception as e: + logger.error(f"Error saving content: {e}") + raise \ No newline at end of file diff --git a/mineru/app/core/document_handlers/processors/md_processor.py b/mineru/app/core/document_handlers/processors/md_processor.py new file mode 100644 index 0000000..4d48fc5 --- /dev/null +++ b/mineru/app/core/document_handlers/processors/md_processor.py @@ -0,0 +1,39 @@ +import os +from ...document_handlers.document_processor import DocumentProcessor +from ...services.ollama_client import OllamaClient +import logging +from ...config import settings + +logger = logging.getLogger(__name__) + +class MarkdownDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + super().__init__() # Call parent class's __init__ + self.input_path = input_path + self.output_path = output_path + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + + def read_content(self) -> str: + """Read markdown content from file""" + try: + with open(self.input_path, 'r', encoding='utf-8') as file: + content = file.read() + logger.info(f"Successfully read markdown content from {self.input_path}") + return content + except Exception as e: + logger.error(f"Error reading markdown file {self.input_path}: {e}") + raise + + def save_content(self, content: str) -> None: + """Save processed markdown content""" + try: + # Ensure output directory exists + output_dir = os.path.dirname(self.output_path) + os.makedirs(output_dir, exist_ok=True) + + with open(self.output_path, 'w', encoding='utf-8') as file: + file.write(content) + logger.info(f"Successfully saved masked 
content to {self.output_path}") + except Exception as e: + logger.error(f"Error saving content to {self.output_path}: {e}") + raise diff --git a/mineru/app/core/document_handlers/processors/pdf_processor.py b/mineru/app/core/document_handlers/processors/pdf_processor.py new file mode 100644 index 0000000..d9bb881 --- /dev/null +++ b/mineru/app/core/document_handlers/processors/pdf_processor.py @@ -0,0 +1,105 @@ +import os +import PyPDF2 +from ...document_handlers.document_processor import DocumentProcessor +from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader +from magic_pdf.data.dataset import PymuDocDataset +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.config.enums import SupportedPdfParseMethod +from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt +import logging +from ...services.ollama_client import OllamaClient +from ...config import settings + +logger = logging.getLogger(__name__) + +class PdfDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + super().__init__() # Call parent class's __init__ + self.input_path = input_path + self.output_path = output_path + self.output_dir = os.path.dirname(output_path) + self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0] + + # Setup output directories + self.local_image_dir = os.path.join(self.output_dir, "images") + self.image_dir = os.path.basename(self.local_image_dir) + os.makedirs(self.local_image_dir, exist_ok=True) + + # Setup work directory under output directory + self.work_dir = os.path.join( + os.path.dirname(output_path), + ".work", + os.path.splitext(os.path.basename(input_path))[0] + ) + os.makedirs(self.work_dir, exist_ok=True) + + self.work_local_image_dir = os.path.join(self.work_dir, "images") + self.work_image_dir = os.path.basename(self.work_local_image_dir) + os.makedirs(self.work_local_image_dir, exist_ok=True) + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + + def read_content(self) -> str: + logger.info("Starting PDF content processing") + + # Read the PDF file + with open(self.input_path, 'rb') as file: + content = file.read() + + # Initialize writers + image_writer = FileBasedDataWriter(self.work_local_image_dir) + md_writer = FileBasedDataWriter(self.work_dir) + + # Create Dataset Instance + ds = PymuDocDataset(content) + + logger.info("Classifying PDF type: %s", ds.classify()) + # Process based on PDF type + if ds.classify() == SupportedPdfParseMethod.OCR: + infer_result = ds.apply(doc_analyze, ocr=True) + pipe_result = infer_result.pipe_ocr_mode(image_writer) + else: + infer_result = ds.apply(doc_analyze, ocr=False) + pipe_result = infer_result.pipe_txt_mode(image_writer) + + logger.info("Generating all outputs") + # Generate all outputs + infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf")) + model_inference_result = infer_result.get_infer_res() + + pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf")) + pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf")) + + md_content = pipe_result.get_markdown(self.work_image_dir) + pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir) + + content_list = pipe_result.get_content_list(self.work_image_dir) + pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir) 
+ + middle_json = pipe_result.get_middle_json() + pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json') + + return md_content + + # def process_content(self, content: str) -> str: + # logger.info("Starting content masking process") + # sentences = content.split("。") + # final_md = "" + # for sentence in sentences: + # if not sentence.strip(): # Skip empty sentences + # continue + # formatted_prompt = get_masking_mapping_prompt(sentence) + # logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt) + # response = self.ollama_client.generate(formatted_prompt) + # logger.info(f"Response generated: {response}") + # final_md += response + "。" + # return final_md + + def save_content(self, content: str) -> None: + # Ensure output path has .md extension + output_dir = os.path.dirname(self.output_path) + base_name = os.path.splitext(os.path.basename(self.output_path))[0] + md_output_path = os.path.join(output_dir, f"{base_name}.md") + + logger.info(f"Saving masked content to: {md_output_path}") + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(content) \ No newline at end of file diff --git a/mineru/app/core/document_handlers/processors/txt_processor.py b/mineru/app/core/document_handlers/processors/txt_processor.py new file mode 100644 index 0000000..65c16c0 --- /dev/null +++ b/mineru/app/core/document_handlers/processors/txt_processor.py @@ -0,0 +1,28 @@ +from ...document_handlers.document_processor import DocumentProcessor +from ...services.ollama_client import OllamaClient +import logging +from ...prompts.masking_prompts import get_masking_prompt +from ...config import settings + +logger = logging.getLogger(__name__) +class TxtDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + super().__init__() + self.input_path = input_path + self.output_path = output_path + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) + + def read_content(self) -> str: + with open(self.input_path, 'r', encoding='utf-8') as file: + return file.read() + + # def process_content(self, content: str) -> str: + + # formatted_prompt = get_masking_prompt(content) + # response = self.ollama_client.generate(formatted_prompt) + # logger.debug(f"Processed content: {response}") + # return response + + def save_content(self, content: str) -> None: + with open(self.output_path, 'w', encoding='utf-8') as file: + file.write(content) \ No newline at end of file diff --git a/mineru/app/core/services/document_service.py b/mineru/app/core/services/document_service.py new file mode 100644 index 0000000..c169bfa --- /dev/null +++ b/mineru/app/core/services/document_service.py @@ -0,0 +1,30 @@ +import logging +from ..document_handlers.document_factory import DocumentProcessorFactory +from ..services.ollama_client import OllamaClient + +logger = logging.getLogger(__name__) + +class DocumentService: + def __init__(self): + pass + + def process_document(self, input_path: str, output_path: str) -> bool: + try: + processor = DocumentProcessorFactory.create_processor(input_path, output_path) + if not processor: + logger.error(f"Unsupported file format: {input_path}") + return False + + # Read content + content = processor.read_content() + + # Process with Ollama + masked_content = processor.process_content(content) + + # Save processed content + processor.save_content(masked_content) + return True + + except Exception as e: + logger.error(f"Error processing document {input_path}: 
{str(e)}") + return False \ No newline at end of file diff --git a/mineru/app/core/utils/file_utils.py b/mineru/app/core/utils/file_utils.py new file mode 100644 index 0000000..f2c6935 --- /dev/null +++ b/mineru/app/core/utils/file_utils.py @@ -0,0 +1,20 @@ +def read_file(file_path): + with open(file_path, 'r') as file: + return file.read() + +def write_file(file_path, content): + with open(file_path, 'w') as file: + file.write(content) + +def file_exists(file_path): + import os + return os.path.isfile(file_path) + +def delete_file(file_path): + import os + if file_exists(file_path): + os.remove(file_path) + +def list_files_in_directory(directory_path): + import os + return [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))] \ No newline at end of file diff --git a/mineru/app/main.py b/mineru/app/main.py new file mode 100644 index 0000000..d5a2954 --- /dev/null +++ b/mineru/app/main.py @@ -0,0 +1,39 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from .core.config import settings +from .api.endpoints import mineru +from .core.database import engine, Base + +# Create database tables +Base.metadata.create_all(bind=engine) + +app = FastAPI( + title=settings.PROJECT_NAME, + openapi_url=f"{settings.API_V1_STR}/openapi.json" +) + +# Set up CORS +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # In production, replace with specific origins + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Include routers +# app.include_router( +# files.router, +# prefix=f"{settings.API_V1_STR}/files", +# tags=["files"] +# ) + +app.include_router( + mineru.router, + prefix=f"{settings.API_V1_STR}/mineru", + tags=["mineru"] +) + +@app.get("/") +async def root(): + return {"message": "Welcome to Legal Document Masker API"} \ No newline at end of file diff --git a/mineru/app/models/file.py b/mineru/app/models/file.py new file mode 100644 index 0000000..2d01fa2 --- /dev/null +++ b/mineru/app/models/file.py @@ -0,0 +1,22 @@ +from sqlalchemy import Column, String, DateTime, Text +from datetime import datetime +import uuid +from ..core.database import Base + +class FileStatus(str): + NOT_STARTED = "not_started" + PROCESSING = "processing" + SUCCESS = "success" + FAILED = "failed" + +class File(Base): + __tablename__ = "files" + + id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4())) + filename = Column(String(255), nullable=False) + original_path = Column(String(255), nullable=False) + processed_path = Column(String(255)) + status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED) + error_message = Column(Text) + created_at = Column(DateTime, nullable=False, default=datetime.utcnow) + updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow) \ No newline at end of file diff --git a/mineru/app/schemas/file.py b/mineru/app/schemas/file.py new file mode 100644 index 0000000..61a84ed --- /dev/null +++ b/mineru/app/schemas/file.py @@ -0,0 +1,21 @@ +from pydantic import BaseModel +from datetime import datetime +from typing import Optional +from uuid import UUID + +class FileBase(BaseModel): + filename: str + status: str + error_message: Optional[str] = None + +class FileResponse(FileBase): + id: UUID + created_at: datetime + updated_at: datetime + + class Config: + from_attributes = True + +class FileList(BaseModel): + files: list[FileResponse] + total: int \ No newline at end of file diff --git a/mineru/docker-compose.yml 
b/mineru/docker-compose.yml
new file mode 100644
index 0000000..e6f878d
--- /dev/null
+++ b/mineru/docker-compose.yml
@@ -0,0 +1,37 @@
+version: '3.8'
+
+services:
+  api:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      # The SQLite database lives under storage/ (see app/core/config.py), so
+      # mounting ./storage also persists the database
+      - ./storage:/app/storage
+    env_file:
+      - .env
+    environment:
+      - CELERY_BROKER_URL=redis://redis:6379/0
+      - CELERY_RESULT_BACKEND=redis://redis:6379/0
+    depends_on:
+      - redis
+
+  celery_worker:
+    build: .
+    command: celery -A app.services.file_service worker --loglevel=info
+    volumes:
+      - ./storage:/app/storage
+    env_file:
+      - .env
+    environment:
+      - CELERY_BROKER_URL=redis://redis:6379/0
+      - CELERY_RESULT_BACKEND=redis://redis:6379/0
+    depends_on:
+      - redis
+      - api
+
+  redis:
+    image: redis:alpine
+    ports:
+      - "6379:6379"
\ No newline at end of file
diff --git a/mineru/package-lock.json b/mineru/package-lock.json
new file mode 100644
index 0000000..e3231d2
--- /dev/null
+++ b/mineru/package-lock.json
@@ -0,0 +1,6 @@
+{
+  "name": "mineru",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {}
+}
diff --git a/mineru/requirements.txt b/mineru/requirements.txt
new file mode 100644
index 0000000..fed1a26
--- /dev/null
+++ b/mineru/requirements.txt
@@ -0,0 +1,30 @@
+# FastAPI and server
+fastapi>=0.104.0
+uvicorn>=0.24.0
+python-multipart>=0.0.6
+websockets>=12.0
+
+# Database
+sqlalchemy>=2.0.0
+alembic>=1.12.0
+
+# Background tasks
+celery>=5.3.0
+redis>=5.0.0
+
+# Security
+python-jose[cryptography]>=3.3.0
+passlib[bcrypt]>=1.7.4
+python-dotenv>=1.0.0
+
+# Testing
+pytest>=7.4.0
+httpx>=0.25.0
+
+# Document parsing and processing
+watchdog==2.1.6
+requests==2.28.1
+mineru==2.0.6
+numpy==1.24.3
+scikit-learn==1.3.0
diff --git a/mineru/storage/legal_doc_masker.db b/mineru/storage/legal_doc_masker.db
new file mode 100644
index 0000000..e69de29
diff --git a/mineru/test_mineru_api.py b/mineru/test_mineru_api.py
new file mode 100644
index 0000000..40e179d
--- /dev/null
+++ b/mineru/test_mineru_api.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Test script for Mineru API endpoints
+"""
+
+import requests
+import json
+from pathlib import Path
+
+# API base URL
+BASE_URL = "http://localhost:8000/api/v1/mineru"
+
+def test_health_check():
+    """Test the health check endpoint"""
+    print("Testing health check...")
+    response = requests.get(f"{BASE_URL}/health")
+    print(f"Status: {response.status_code}")
+    print(f"Response: {response.json()}")
+    print()
+
+def test_parse_document(file_path: str):
+    """Test the document parsing endpoint"""
+    print(f"Testing document parsing with file: {file_path}")
+
+    # Check if the file exists
+    if not Path(file_path).exists():
+        print(f"Error: File {file_path} not found")
+        return
+
+    # Prepare the file upload
+    with open(file_path, 'rb') as f:
+        files = {'file': (Path(file_path).name, f, 'application/pdf')}
+
+        # Prepare parameters (sent as query parameters)
+        params = {
+            'lang': 'ch',
+            'backend': 'pipeline',
+            'method': 'auto',
+            'formula_enable': True,
+            'table_enable': True,
+            'draw_layout_bbox': True,
+            'draw_span_bbox': True,
+            'dump_md': True,
+            'dump_middle_json': True,
+            'dump_model_output': True,
+            'dump_orig_pdf': True,
+            'dump_content_list': True,
+            'make_md_mode': 'MM_MD'
+        }
+
+        # Make the request
+        response = requests.post(f"{BASE_URL}/parse", files=files, params=params)
+
+    print(f"Status: {response.status_code}")
+    if response.status_code == 200:
+        result = response.json()
+        print("Parse successful!")
+        print(f"File name: {result['file_name']}")
+        print(f"Output directory: 
{result['output_directory']}") + print("Generated outputs:") + for output_type, output_path in result['outputs'].items(): + print(f" - {output_type}: {output_path}") + else: + print(f"Error: {response.text}") + print() + +def test_download_file(file_path: str): + """Test file download endpoint""" + print(f"Testing file download: {file_path}") + + response = requests.get(f"{BASE_URL}/download/{file_path}") + print(f"Status: {response.status_code}") + + if response.status_code == 200: + # Save the downloaded file + output_filename = f"downloaded_{Path(file_path).name}" + with open(output_filename, 'wb') as f: + f.write(response.content) + print(f"File downloaded successfully as: {output_filename}") + else: + print(f"Error: {response.text}") + print() + +if __name__ == "__main__": + print("Mineru API Test Script") + print("=" * 50) + + # Test health check + test_health_check() + + # Test document parsing (you'll need to provide a PDF file) + # Uncomment and modify the path below to test with your own file + # test_parse_document("path/to/your/document.pdf") + + # Example of how to test file download (after parsing) + # test_download_file("some_uuid/document_name.md") + + print("Test completed!") + print("\nTo test document parsing:") + print("1. Uncomment the test_parse_document line above") + print("2. Provide a valid PDF file path") + print("3. Run the script again") + print("\nTo test file download:") + print("1. First run a parse operation to get file paths") + print("2. Use the output paths from the parse result") + print("3. Uncomment and modify the test_download_file line") \ No newline at end of file