WIP: add mineru section
parent 12c1b5f75e
commit fcf88e36d6
@@ -0,0 +1,34 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libreoffice \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .

# Download the MinerU model weights at build time
RUN pip install huggingface_hub
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
RUN python download_models_hf.py

RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -U magic-pdf[full]

# Copy the rest of the application
COPY . .

# Create storage directories
RUN mkdir -p storage/uploads storage/processed

# Expose the port the app runs on
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
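To try this image on its own (outside of docker-compose), a typical workflow is `docker build -t legal-doc-masker-api .` followed by `docker run -p 8000:8000 legal-doc-masker-api`; the image name here is only illustrative.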
@@ -0,0 +1,201 @@
# Mineru API Documentation

This document describes the FastAPI interface for the Mineru document parsing service.

## Overview

The Mineru API provides endpoints for parsing documents (PDFs, images) using advanced OCR and layout analysis. It supports both pipeline and VLM backends for different use cases.

## Base URL

```
http://localhost:8000/api/v1/mineru
```

## Endpoints

### 1. Health Check

**GET** `/health`

Check if the Mineru service is running.

**Response:**
```json
{
  "status": "healthy",
  "service": "mineru"
}
```

### 2. Parse Document

**POST** `/parse`

Parse a document using Mineru's advanced parsing capabilities.

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `file` | File | Required | The document file to parse (PDF, PNG, JPEG, JPG) |
| `lang` | string | "ch" | Language option ('ch', 'en', 'korean', 'japan', etc.) |
| `backend` | string | "pipeline" | Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client') |
| `method` | string | "auto" | Method for parsing ('auto', 'txt', 'ocr') |
| `server_url` | string | null | Server URL for vlm-sglang-client backend |
| `start_page_id` | int | 0 | Start page ID for parsing |
| `end_page_id` | int | null | End page ID for parsing |
| `formula_enable` | boolean | true | Enable formula parsing |
| `table_enable` | boolean | true | Enable table parsing |
| `draw_layout_bbox` | boolean | true | Whether to draw layout bounding boxes |
| `draw_span_bbox` | boolean | true | Whether to draw span bounding boxes |
| `dump_md` | boolean | true | Whether to dump markdown files |
| `dump_middle_json` | boolean | true | Whether to dump middle JSON files |
| `dump_model_output` | boolean | true | Whether to dump model output files |
| `dump_orig_pdf` | boolean | true | Whether to dump original PDF files |
| `dump_content_list` | boolean | true | Whether to dump content list files |
| `make_md_mode` | string | "MM_MD" | The mode for making markdown content |
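All parameters other than `file` are plain query parameters in the current implementation, so pass them in the URL query string (or via `params=` with `requests`) rather than as form fields.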
**Response:**
```json
{
  "status": "success",
  "file_name": "document_name",
  "outputs": {
    "markdown": "/path/to/document_name.md",
    "middle_json": "/path/to/document_name_middle.json",
    "model_output": "/path/to/document_name_model.json",
    "content_list": "/path/to/document_name_content_list.json",
    "original_pdf": "/path/to/document_name_origin.pdf",
    "layout_pdf": "/path/to/document_name_layout.pdf",
    "span_pdf": "/path/to/document_name_span.pdf"
  },
  "output_directory": "/path/to/output/directory"
}
```

### 3. Download Processed File

**GET** `/download/{file_path}`

Download a processed file from the Mineru output directory.

**Parameters:**
- `file_path`: Path to the file relative to the mineru output directory

**Response:** File download

## Usage Examples

### Python Example

```python
import requests

# Parse a document
with open('document.pdf', 'rb') as f:
    files = {'file': ('document.pdf', f, 'application/pdf')}
    params = {
        'lang': 'ch',
        'backend': 'pipeline',
        'method': 'auto',
        'formula_enable': True,
        'table_enable': True
    }

    response = requests.post(
        'http://localhost:8000/api/v1/mineru/parse',
        files=files,
        params=params
    )

if response.status_code == 200:
    result = response.json()
    print(f"Parsed successfully: {result['file_name']}")

    # Download the markdown file
    md_path = result['outputs']['markdown']
    download_response = requests.get(
        f'http://localhost:8000/api/v1/mineru/download/{md_path}'
    )

    with open('output.md', 'wb') as f:
        f.write(download_response.content)
```

### cURL Example

```bash
# Parse a document (options go in the query string; only the file is form data)
curl -X POST "http://localhost:8000/api/v1/mineru/parse?lang=ch&backend=pipeline&method=auto" \
  -F "file=@document.pdf"

# Download a processed file
curl -X GET "http://localhost:8000/api/v1/mineru/download/path/to/file.md" \
  -o downloaded_file.md
```

## Backend Options

### Pipeline Backend
- **Use case**: General purpose, more robust
- **Advantages**: Better for complex layouts, supports multiple languages
- **Command**: `backend=pipeline`

### VLM Backends
- **vlm-transformers**: General purpose VLM
- **vlm-sglang-engine**: Faster engine-based approach
- **vlm-sglang-client**: Fastest client-based approach (requires `server_url`)
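A minimal sketch of a `vlm-sglang-client` request (the sglang server URL below is illustrative; point `server_url` at wherever your sglang server is running):

```python
import requests

# Parse with the sglang client backend; server_url must point at a running sglang server
with open('document.pdf', 'rb') as f:
    response = requests.post(
        'http://localhost:8000/api/v1/mineru/parse',
        files={'file': ('document.pdf', f, 'application/pdf')},
        params={
            'backend': 'vlm-sglang-client',
            'server_url': 'http://localhost:30000'
        }
    )
print(response.json())
```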
## Language Support

Supported languages for the pipeline backend:
- `ch`: Chinese (Simplified)
- `en`: English
- `korean`: Korean
- `japan`: Japanese
- `chinese_cht`: Chinese (Traditional)
- `ta`: Tamil
- `te`: Telugu
- `ka`: Kannada

## Output Files

The API generates various output files depending on the parameters:

1. **Markdown** (`.md`): Structured text content
2. **Middle JSON** (`.json`): Intermediate parsing results
3. **Model Output** (`.json` or `.txt`): Raw model predictions
4. **Content List** (`.json`): Structured content list
5. **Original PDF**: Copy of the input file
6. **Layout PDF**: PDF with layout bounding boxes
7. **Span PDF**: PDF with span bounding boxes

## Error Handling

The API returns appropriate HTTP status codes:

- `200`: Success
- `400`: Bad request (invalid parameters, unsupported file type)
- `404`: File not found
- `500`: Internal server error

Error responses include a detail message explaining the issue.
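For example, uploading an unsupported file type yields a 400 response with FastAPI's standard error body, along the lines of `{"detail": "File type not allowed. Allowed types: ..."}`.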
## Testing

Use the provided test script to verify the API:

```bash
python test_mineru_api.py
```

## Notes

- The API creates unique output directories for each request to avoid conflicts
- Temporary files are automatically cleaned up after processing
- File downloads are restricted to the processed folder for security
- Large files may take time to process depending on the backend and document complexity
@@ -0,0 +1,103 @@
# Legal Document Masker API

This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.

## Prerequisites

- Python 3.8+
- Redis (for Celery)

## File Storage

Files are stored in the following structure:
```
backend/
├── storage/
│   ├── uploads/      # Original uploaded files
│   └── processed/    # Masked/processed files
```

## Setup

### Option 1: Local Development

1. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Set up environment variables:
Create a `.env` file in the backend directory with the following variables:
```env
SECRET_KEY=your-secret-key-here
```
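Other settings defined in `app/core/config.py` (for example `OLLAMA_API_URL`, `OLLAMA_MODEL`, `CELERY_BROKER_URL`) can be overridden from the same `.env` file if needed.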
The database (SQLite) will be automatically created when you first run the application.

4. Start Redis (required for Celery):
```bash
redis-server
```

5. Start Celery worker:
```bash
celery -A app.services.file_service worker --loglevel=info
```

6. Start the FastAPI server:
```bash
uvicorn app.main:app --reload
```

### Option 2: Docker Deployment

1. Build and start the services:
```bash
docker-compose up --build
```

This will start:
- FastAPI server on port 8000
- Celery worker for background processing
- Redis for task queue

## API Documentation

Once the server is running, you can access:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`

## API Endpoints

- `POST /api/v1/files/upload` - Upload a new file
- `GET /api/v1/files` - List all files
- `GET /api/v1/files/{file_id}` - Get file details
- `GET /api/v1/files/{file_id}/download` - Download processed file
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
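For reference, a minimal client sketch against these endpoints (illustrative only: it assumes the files router is enabled in `app/main.py` and that the upload response includes the file `id`):

```python
import asyncio
import json

import requests
import websockets  # client library assumed for the WS status endpoint

BASE = "http://localhost:8000/api/v1/files"

async def main():
    # Upload a document for masking
    with open("contract.docx", "rb") as f:
        resp = requests.post(f"{BASE}/upload", files={"file": ("contract.docx", f)})
    file_id = resp.json()["id"]  # field name assumed from the File model

    # Follow processing status over the WebSocket endpoint
    async with websockets.connect(
        f"ws://localhost:8000/api/v1/files/ws/status/{file_id}"
    ) as ws:
        async for message in ws:
            status = json.loads(message)
            print(status)
            if status.get("status") in ("success", "failed"):
                break

asyncio.run(main())
```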
## Development

### Running Tests
```bash
pytest
```

### Code Style
The project uses Black for code formatting:
```bash
black .
```

### Docker Commands

- Start services: `docker-compose up`
- Start in background: `docker-compose up -d`
- Stop services: `docker-compose down`
- View logs: `docker-compose logs -f`
- Rebuild: `docker-compose up --build`
@ -0,0 +1,329 @@
|
|||
from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
|
||||
from fastapi.responses import FileResponse
|
||||
from typing import List, Optional
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
import json
|
||||
from pathlib import Path
|
||||
import uuid
|
||||
from loguru import logger
|
||||
|
||||
from ...core.config import settings
|
||||
|
||||
# Import mineru functions
|
||||
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
|
||||
from mineru.data.data_reader_writer import FileBasedDataWriter
|
||||
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
|
||||
from mineru.utils.enum_class import MakeMode
|
||||
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
||||
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
|
||||
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
|
||||
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
|
||||
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
class MineruParseRequest:
|
||||
def __init__(
|
||||
self,
|
||||
lang: str = "ch",
|
||||
backend: str = "pipeline",
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
start_page_id: int = 0,
|
||||
end_page_id: Optional[int] = None,
|
||||
formula_enable: bool = True,
|
||||
table_enable: bool = True,
|
||||
draw_layout_bbox: bool = True,
|
||||
draw_span_bbox: bool = True,
|
||||
dump_md: bool = True,
|
||||
dump_middle_json: bool = True,
|
||||
dump_model_output: bool = True,
|
||||
dump_orig_pdf: bool = True,
|
||||
dump_content_list: bool = True,
|
||||
make_md_mode: str = "MM_MD"
|
||||
):
|
||||
self.lang = lang
|
||||
self.backend = backend
|
||||
self.method = method
|
||||
self.server_url = server_url
|
||||
self.start_page_id = start_page_id
|
||||
self.end_page_id = end_page_id
|
||||
self.formula_enable = formula_enable
|
||||
self.table_enable = table_enable
|
||||
self.draw_layout_bbox = draw_layout_bbox
|
||||
self.draw_span_bbox = draw_span_bbox
|
||||
self.dump_md = dump_md
|
||||
self.dump_middle_json = dump_middle_json
|
||||
self.dump_model_output = dump_model_output
|
||||
self.dump_orig_pdf = dump_orig_pdf
|
||||
self.dump_content_list = dump_content_list
|
||||
self.make_md_mode = MakeMode.MM_MD if make_md_mode == "MM_MD" else MakeMode.CONTENT_LIST
|
||||
|
||||
async def process_mineru_document(
|
||||
file: UploadFile,
|
||||
request: MineruParseRequest,
|
||||
output_dir: Path
|
||||
) -> dict:
|
||||
"""Process a single document using Mineru"""
|
||||
try:
|
||||
# Read file content
|
||||
content = await file.read()
|
||||
|
||||
# Create temporary file
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix) as temp_file:
|
||||
temp_file.write(content)
|
||||
temp_file_path = Path(temp_file.name)
|
||||
|
||||
try:
|
||||
# Prepare environment
|
||||
file_name = Path(file.filename).stem
|
||||
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, request.method)
# prepare_env may return plain strings; normalize to Path so the `/` joins below work
local_image_dir, local_md_dir = Path(local_image_dir), Path(local_md_dir)
image_writer, md_writer = FileBasedDataWriter(str(local_image_dir)), FileBasedDataWriter(str(local_md_dir))
|
||||
|
||||
# Convert PDF bytes if needed
|
||||
if request.backend == "pipeline":
|
||||
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(
|
||||
content, request.start_page_id, request.end_page_id
|
||||
)
|
||||
|
||||
# Analyze document
|
||||
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(
|
||||
[new_pdf_bytes], [request.lang],
|
||||
parse_method=request.method,
|
||||
formula_enable=request.formula_enable,
|
||||
table_enable=request.table_enable
|
||||
)
|
||||
|
||||
# Process results
|
||||
model_list = infer_results[0]
|
||||
images_list = all_image_lists[0]
|
||||
pdf_doc = all_pdf_docs[0]
|
||||
_lang = lang_list[0]
|
||||
_ocr_enable = ocr_enabled_list[0]
|
||||
|
||||
middle_json = pipeline_result_to_middle_json(
|
||||
model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, request.formula_enable
|
||||
)
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# Generate outputs
|
||||
outputs = {}
|
||||
|
||||
if request.draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_layout.pdf")
|
||||
outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")
|
||||
|
||||
if request.draw_span_bbox:
|
||||
draw_span_bbox(pdf_info, new_pdf_bytes, local_md_dir, f"{file_name}_span.pdf")
|
||||
outputs["span_pdf"] = str(local_md_dir / f"{file_name}_span.pdf")
|
||||
|
||||
if request.dump_orig_pdf:
|
||||
md_writer.write(f"{file_name}_origin.pdf", new_pdf_bytes)
|
||||
outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")
|
||||
|
||||
if request.dump_md:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
md_content_str = pipeline_union_make(pdf_info, request.make_md_mode, image_dir)
|
||||
md_writer.write_string(f"{file_name}.md", md_content_str)
|
||||
outputs["markdown"] = str(local_md_dir / f"{file_name}.md")
|
||||
|
||||
if request.dump_content_list:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")
|
||||
|
||||
if request.dump_middle_json:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_middle.json",
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")
|
||||
|
||||
if request.dump_model_output:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_model.json",
|
||||
json.dumps(model_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["model_output"] = str(local_md_dir / f"{file_name}_model.json")
|
||||
|
||||
else:
# VLM backend: reject anything that is not one of the vlm-* options,
# otherwise `backend` would be undefined below
if not request.backend.startswith("vlm-"):
raise HTTPException(status_code=400, detail=f"Unsupported backend: {request.backend}")
backend = request.backend[4:]

middle_json, infer_result = vlm_doc_analyze(
content, image_writer=image_writer,
backend=backend, server_url=request.server_url
)
|
||||
|
||||
pdf_info = middle_json["pdf_info"]
|
||||
|
||||
# Generate outputs for VLM
|
||||
outputs = {}
|
||||
|
||||
if request.draw_layout_bbox:
|
||||
draw_layout_bbox(pdf_info, content, local_md_dir, f"{file_name}_layout.pdf")
|
||||
outputs["layout_pdf"] = str(local_md_dir / f"{file_name}_layout.pdf")
|
||||
|
||||
if request.dump_orig_pdf:
|
||||
md_writer.write(f"{file_name}_origin.pdf", content)
|
||||
outputs["original_pdf"] = str(local_md_dir / f"{file_name}_origin.pdf")
|
||||
|
||||
if request.dump_md:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
md_content_str = vlm_union_make(pdf_info, request.make_md_mode, image_dir)
|
||||
md_writer.write_string(f"{file_name}.md", md_content_str)
|
||||
outputs["markdown"] = str(local_md_dir / f"{file_name}.md")
|
||||
|
||||
if request.dump_content_list:
|
||||
image_dir = str(os.path.basename(local_image_dir))
|
||||
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
|
||||
md_writer.write_string(
|
||||
f"{file_name}_content_list.json",
|
||||
json.dumps(content_list, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["content_list"] = str(local_md_dir / f"{file_name}_content_list.json")
|
||||
|
||||
if request.dump_middle_json:
|
||||
md_writer.write_string(
|
||||
f"{file_name}_middle.json",
|
||||
json.dumps(middle_json, ensure_ascii=False, indent=4)
|
||||
)
|
||||
outputs["middle_json"] = str(local_md_dir / f"{file_name}_middle.json")
|
||||
|
||||
if request.dump_model_output:
|
||||
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
|
||||
md_writer.write_string(f"{file_name}_model_output.txt", model_output)
|
||||
outputs["model_output"] = str(local_md_dir / f"{file_name}_model_output.txt")
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"file_name": file_name,
|
||||
"outputs": outputs,
|
||||
"output_directory": str(local_md_dir)
|
||||
}
|
||||
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
if temp_file_path.exists():
|
||||
temp_file_path.unlink()
|
||||
|
||||
except HTTPException:
# Re-raise explicit HTTP errors (e.g. 400 for an unsupported backend) unchanged
raise
except Exception as e:
logger.exception(f"Error processing document: {e}")
raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
||||
|
||||
@router.post("/parse")
|
||||
async def parse_document(
|
||||
file: UploadFile = File(...),
|
||||
lang: str = "ch",
|
||||
backend: str = "pipeline",
|
||||
method: str = "auto",
|
||||
server_url: Optional[str] = None,
|
||||
start_page_id: int = 0,
|
||||
end_page_id: Optional[int] = None,
|
||||
formula_enable: bool = True,
|
||||
table_enable: bool = True,
|
||||
draw_layout_bbox: bool = True,
|
||||
draw_span_bbox: bool = True,
|
||||
dump_md: bool = True,
|
||||
dump_middle_json: bool = True,
|
||||
dump_model_output: bool = True,
|
||||
dump_orig_pdf: bool = True,
|
||||
dump_content_list: bool = True,
|
||||
make_md_mode: str = "MM_MD"
|
||||
):
|
||||
"""
|
||||
Parse a document using Mineru API
|
||||
|
||||
Parameters:
|
||||
- file: The document file to parse (PDF, image, etc.)
|
||||
- lang: Language option (default: 'ch')
|
||||
- backend: Backend for parsing ('pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client')
|
||||
- method: Method for parsing ('auto', 'txt', 'ocr')
|
||||
- server_url: Server URL for vlm-sglang-client backend
|
||||
- start_page_id: Start page ID for parsing
|
||||
- end_page_id: End page ID for parsing
|
||||
- formula_enable: Enable formula parsing
|
||||
- table_enable: Enable table parsing
|
||||
- draw_layout_bbox: Whether to draw layout bounding boxes
|
||||
- draw_span_bbox: Whether to draw span bounding boxes
|
||||
- dump_md: Whether to dump markdown files
|
||||
- dump_middle_json: Whether to dump middle JSON files
|
||||
- dump_model_output: Whether to dump model output files
|
||||
- dump_orig_pdf: Whether to dump original PDF files
|
||||
- dump_content_list: Whether to dump content list files
|
||||
- make_md_mode: The mode for making markdown content
|
||||
"""
|
||||
|
||||
# Validate file type
|
||||
allowed_extensions = {".pdf", ".png", ".jpeg", ".jpg"}
|
||||
file_extension = Path(file.filename).suffix.lower()
|
||||
if file_extension not in allowed_extensions:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"File type not allowed. Allowed types: {', '.join(allowed_extensions)}"
|
||||
)
|
||||
|
||||
# Create request object
|
||||
request = MineruParseRequest(
|
||||
lang=lang,
|
||||
backend=backend,
|
||||
method=method,
|
||||
server_url=server_url,
|
||||
start_page_id=start_page_id,
|
||||
end_page_id=end_page_id,
|
||||
formula_enable=formula_enable,
|
||||
table_enable=table_enable,
|
||||
draw_layout_bbox=draw_layout_bbox,
|
||||
draw_span_bbox=draw_span_bbox,
|
||||
dump_md=dump_md,
|
||||
dump_middle_json=dump_middle_json,
|
||||
dump_model_output=dump_model_output,
|
||||
dump_orig_pdf=dump_orig_pdf,
|
||||
dump_content_list=dump_content_list,
|
||||
make_md_mode=make_md_mode
|
||||
)
|
||||
|
||||
# Create output directory
|
||||
output_dir = settings.PROCESSED_FOLDER / "mineru" / str(uuid.uuid4())
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Process document
|
||||
result = await process_mineru_document(file, request, output_dir)
|
||||
|
||||
return result
|
||||
|
||||
@router.get("/download/{file_path:path}")
|
||||
async def download_processed_file(file_path: str):
|
||||
"""Download a processed file from the mineru output directory"""
|
||||
try:
|
||||
# Construct and resolve the full path so ".." segments cannot escape the output directory
full_path = (settings.PROCESSED_FOLDER / "mineru" / file_path).resolve()

# Security check: ensure the resolved path is within the processed folder
if not str(full_path).startswith(str(settings.PROCESSED_FOLDER.resolve())):
raise HTTPException(status_code=400, detail="Invalid file path")
|
||||
|
||||
if not full_path.exists():
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
return FileResponse(
|
||||
path=str(full_path),
|
||||
filename=full_path.name,
|
||||
media_type="application/octet-stream"
|
||||
)
|
||||
|
||||
except HTTPException:
# Preserve the 400/404 responses raised above instead of converting them to 500
raise
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error downloading file: {str(e)}")
|
||||
|
||||
@router.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint for mineru service"""
|
||||
return {"status": "healthy", "service": "mineru"}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
from pydantic_settings import BaseSettings
|
||||
from typing import Optional
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# API Settings
|
||||
API_V1_STR: str = "/api/v1"
|
||||
PROJECT_NAME: str = "Legal Document Masker API"
|
||||
|
||||
# Security
|
||||
SECRET_KEY: str = "your-secret-key-here" # Change in production
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8 # 8 days
|
||||
|
||||
# Database
|
||||
BASE_DIR: Path = Path(__file__).parent.parent.parent
|
||||
DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"
|
||||
|
||||
# File Storage
|
||||
UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
|
||||
PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
|
||||
MAX_FILE_SIZE: int = 50 * 1024 * 1024 # 50MB
|
||||
ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}
|
||||
|
||||
# Celery
|
||||
CELERY_BROKER_URL: str = "redis://redis:6379/0"
|
||||
CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
|
||||
|
||||
# Ollama API settings
|
||||
OLLAMA_API_URL: str = "https://api.ollama.com"
|
||||
OLLAMA_API_KEY: str = ""
|
||||
OLLAMA_MODEL: str = "llama2"
|
||||
|
||||
# Logging settings
|
||||
LOG_LEVEL: str = "INFO"
|
||||
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
|
||||
LOG_FILE: str = "app.log"
|
||||
|
||||
class Config:
|
||||
case_sensitive = True
|
||||
env_file = ".env"
|
||||
env_file_encoding = "utf-8"
|
||||
extra = "allow"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
# Create storage directories if they don't exist
|
||||
self.UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
self.PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
# Create storage directory for database
|
||||
(self.BASE_DIR / "storage").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
settings = Settings()
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
import logging.config
|
||||
# from config.settings import settings
|
||||
from .settings import settings
|
||||
|
||||
LOGGING_CONFIG = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
"formatters": {
|
||||
"standard": {
|
||||
"format": settings.LOG_FORMAT,
|
||||
"datefmt": settings.LOG_DATE_FORMAT
|
||||
},
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
"class": "logging.StreamHandler",
|
||||
"formatter": "standard",
|
||||
"level": settings.LOG_LEVEL,
|
||||
"stream": "ext://sys.stdout"
|
||||
},
|
||||
"file": {
|
||||
"class": "logging.FileHandler",
|
||||
"formatter": "standard",
|
||||
"level": settings.LOG_LEVEL,
|
||||
"filename": settings.LOG_FILE,
|
||||
"mode": "a",
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"": { # root logger
|
||||
"handlers": ["console", "file"],
|
||||
"level": settings.LOG_LEVEL,
|
||||
"propagate": True
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def setup_logging():
|
||||
"""Initialize logging configuration"""
|
||||
logging.config.dictConfig(LOGGING_CONFIG)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from .config import settings
|
||||
|
||||
# Create SQLite engine with check_same_thread=False for FastAPI
|
||||
engine = create_engine(
|
||||
settings.DATABASE_URL,
|
||||
connect_args={"check_same_thread": False}
|
||||
)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
# Dependency
|
||||
def get_db():
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
class Document:
|
||||
def __init__(self, file_path):
|
||||
self.file_path = file_path
|
||||
self.content = ""
|
||||
|
||||
def load(self):
|
||||
with open(self.file_path, 'r') as file:
|
||||
self.content = file.read()
|
||||
|
||||
def save(self, target_path):
|
||||
with open(target_path, 'w') as file:
|
||||
file.write(self.content)
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
import os
|
||||
from typing import Optional
|
||||
from .document_processor import DocumentProcessor
|
||||
from .processors import (
|
||||
TxtDocumentProcessor,
|
||||
DocxDocumentProcessor,
|
||||
PdfDocumentProcessor,
|
||||
MarkdownDocumentProcessor
|
||||
)
|
||||
|
||||
class DocumentProcessorFactory:
|
||||
@staticmethod
|
||||
def create_processor(input_path: str, output_path: str) -> Optional[DocumentProcessor]:
|
||||
file_extension = os.path.splitext(input_path)[1].lower()
|
||||
|
||||
processors = {
|
||||
'.txt': TxtDocumentProcessor,
|
||||
'.docx': DocxDocumentProcessor,
|
||||
'.doc': DocxDocumentProcessor,
|
||||
'.pdf': PdfDocumentProcessor,
|
||||
'.md': MarkdownDocumentProcessor,
|
||||
'.markdown': MarkdownDocumentProcessor
|
||||
}
|
||||
|
||||
processor_class = processors.get(file_extension)
|
||||
if processor_class:
|
||||
return processor_class(input_path, output_path)
|
||||
return None
|
||||
|
|
@ -0,0 +1,192 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
||||
import logging
|
||||
import json
|
||||
from ..services.ollama_client import OllamaClient
|
||||
from ...core.config import settings
|
||||
from ..utils.json_extractor import LLMJsonExtractor
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor(ABC):
|
||||
def __init__(self):
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
self.max_chunk_size = 1000 # Maximum number of characters per chunk
|
||||
self.max_retries = 3 # Maximum number of retries for mapping generation
|
||||
|
||||
@abstractmethod
|
||||
def read_content(self) -> str:
|
||||
"""Read document content"""
|
||||
pass
|
||||
|
||||
def _split_into_chunks(self, sentences: list[str]) -> list[str]:
|
||||
"""Split sentences into chunks that don't exceed max_chunk_size"""
|
||||
chunks = []
|
||||
current_chunk = ""
|
||||
|
||||
for sentence in sentences:
|
||||
if not sentence.strip():
|
||||
continue
|
||||
|
||||
# If adding this sentence would exceed the limit, save current chunk and start new one
|
||||
if len(current_chunk) + len(sentence) > self.max_chunk_size and current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
current_chunk = sentence
|
||||
else:
|
||||
if current_chunk:
|
||||
current_chunk += "。" + sentence
|
||||
else:
|
||||
current_chunk = sentence
|
||||
|
||||
# Add the last chunk if it's not empty
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
|
||||
return chunks
|
||||
|
||||
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Validate that the mapping follows the required format:
|
||||
{
|
||||
"原文1": "脱敏后1",
|
||||
"原文2": "脱敏后2",
|
||||
...
|
||||
}
|
||||
"""
|
||||
if not isinstance(mapping, dict):
|
||||
logger.warning("Mapping is not a dictionary")
|
||||
return False
|
||||
|
||||
# Check if any key or value is not a string
|
||||
for key, value in mapping.items():
|
||||
if not isinstance(key, str) or not isinstance(value, str):
|
||||
logger.warning(f"Invalid mapping format - key or value is not a string: {key}: {value}")
|
||||
return False
|
||||
|
||||
# Check if the mapping has any nested structures
|
||||
if any(isinstance(v, (dict, list)) for v in mapping.values()):
|
||||
logger.warning("Invalid mapping format - contains nested structures")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _build_mapping(self, chunk: str) -> Dict[str, str]:
|
||||
"""Build mapping for a single chunk of text with retry logic"""
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
formatted_prompt = get_masking_mapping_prompt(chunk)
|
||||
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
||||
response = self.ollama_client.generate(formatted_prompt)
|
||||
logger.info(f"Raw response from LLM: {response}")
|
||||
|
||||
# Parse the JSON response into a dictionary
|
||||
mapping = LLMJsonExtractor.parse_raw_json_str(response)
|
||||
logger.info(f"Parsed mapping: {mapping}")
|
||||
|
||||
if mapping and self._validate_mapping_format(mapping):
|
||||
return mapping
|
||||
else:
|
||||
logger.warning(f"Invalid mapping format received on attempt {attempt + 1}, retrying...")
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating mapping on attempt {attempt + 1}: {e}")
|
||||
if attempt < self.max_retries - 1:
|
||||
logger.info("Retrying...")
|
||||
else:
|
||||
logger.error("Max retries reached, returning empty mapping")
|
||||
return {}
|
||||
|
||||
def _apply_mapping(self, text: str, mapping: Dict[str, str]) -> str:
|
||||
"""Apply the mapping to replace sensitive information"""
|
||||
masked_text = text
|
||||
for original, masked in mapping.items():
|
||||
# Ensure masked value is a string
|
||||
if isinstance(masked, dict):
|
||||
# If it's a dict, use the first value or a default
|
||||
masked = next(iter(masked.values()), "某")
|
||||
elif not isinstance(masked, str):
|
||||
# If it's not a string, convert to string or use default
|
||||
masked = str(masked) if masked is not None else "某"
|
||||
masked_text = masked_text.replace(original, masked)
|
||||
return masked_text
|
||||
|
||||
def _get_next_suffix(self, value: str) -> str:
|
||||
"""Get the next available suffix for a value that already has a suffix"""
|
||||
# Define the sequence of suffixes
|
||||
suffixes = ['甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸']
|
||||
|
||||
# Check if the value already has a suffix
|
||||
for suffix in suffixes:
|
||||
if value.endswith(suffix):
|
||||
# Find the next suffix in the sequence
|
||||
current_index = suffixes.index(suffix)
|
||||
if current_index + 1 < len(suffixes):
|
||||
return value[:-1] + suffixes[current_index + 1]
|
||||
else:
|
||||
# If we've used all suffixes, start over with the first one
|
||||
return value[:-1] + suffixes[0]
|
||||
|
||||
# If no suffix found, return the value with the first suffix
|
||||
return value + '甲'
|
||||
|
||||
def _merge_mappings(self, existing: Dict[str, str], new: Dict[str, str]) -> Dict[str, str]:
|
||||
"""
|
||||
Merge two mappings following the rules:
|
||||
1. If key exists in existing, keep existing value
|
||||
2. If value exists in existing:
|
||||
- If value ends with a suffix (甲乙丙丁...), add next suffix
|
||||
- If no suffix, add '甲'
|
||||
"""
|
||||
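# Illustrative example: if existing = {"原告张三": "张某"} and new = {"被告李四": "张某"},
# the value "张某" is already used, so the merged mapping becomes
# {"原告张三": "张某", "被告李四": "张某甲"} (the next 甲乙丙… suffix is appended).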
result = existing.copy()
|
||||
|
||||
# Get all existing values
|
||||
existing_values = set(result.values())
|
||||
|
||||
for key, value in new.items():
|
||||
if key in result:
|
||||
# Rule 1: Keep existing value if key exists
|
||||
continue
|
||||
|
||||
if value in existing_values:
|
||||
# Rule 2: Handle duplicate values
|
||||
new_value = self._get_next_suffix(value)
|
||||
result[key] = new_value
|
||||
existing_values.add(new_value)
|
||||
else:
|
||||
# No conflict, add as is
|
||||
result[key] = value
|
||||
existing_values.add(value)
|
||||
|
||||
return result
|
||||
|
||||
def process_content(self, content: str) -> str:
|
||||
"""Process document content by masking sensitive information"""
|
||||
# Split content into sentences
|
||||
sentences = content.split("。")
|
||||
|
||||
# Split sentences into manageable chunks
|
||||
chunks = self._split_into_chunks(sentences)
|
||||
logger.info(f"Split content into {len(chunks)} chunks")
|
||||
|
||||
# Build mapping for each chunk
|
||||
combined_mapping = {}
|
||||
for i, chunk in enumerate(chunks):
|
||||
logger.info(f"Processing chunk {i+1}/{len(chunks)}")
|
||||
chunk_mapping = self._build_mapping(chunk)
|
||||
if chunk_mapping: # Only update if we got a valid mapping
|
||||
combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
||||
else:
|
||||
logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
||||
|
||||
# Apply the combined mapping to the entire content
|
||||
masked_content = self._apply_mapping(content, combined_mapping)
|
||||
logger.info("Successfully masked content")
|
||||
|
||||
return masked_content
|
||||
|
||||
@abstractmethod
|
||||
def save_content(self, content: str) -> None:
|
||||
"""Save processed content"""
|
||||
pass
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
from .txt_processor import TxtDocumentProcessor
|
||||
from .docx_processor import DocxDocumentProcessor
|
||||
from .pdf_processor import PdfDocumentProcessor
|
||||
from .md_processor import MarkdownDocumentProcessor
|
||||
|
||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
import os
|
||||
import docx
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.data.read_api import read_local_office
|
||||
import logging
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
from ...prompts.masking_prompts import get_masking_mapping_prompt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocxDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.output_dir = os.path.dirname(output_path)
|
||||
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||
|
||||
# Setup output directories
|
||||
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||
self.image_dir = os.path.basename(self.local_image_dir)
|
||||
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
try:
|
||||
# Initialize writers
|
||||
image_writer = FileBasedDataWriter(self.local_image_dir)
|
||||
md_writer = FileBasedDataWriter(self.output_dir)
|
||||
|
||||
# Create Dataset Instance and process
|
||||
ds = read_local_office(self.input_path)[0]
|
||||
pipe_result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)
|
||||
|
||||
# Generate markdown
|
||||
md_content = pipe_result.get_markdown(self.image_dir)
|
||||
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.image_dir)
|
||||
|
||||
return md_content
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting DOCX to MD: {e}")
|
||||
raise
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
# logger.info("Processing DOCX content")
|
||||
|
||||
# # Split content into sentences and apply masking
|
||||
# sentences = content.split("。")
|
||||
# final_md = ""
|
||||
# for sentence in sentences:
|
||||
# if sentence.strip(): # Only process non-empty sentences
|
||||
# formatted_prompt = get_masking_mapping_prompt(sentence)
|
||||
# logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.info(f"Response generated: {response}")
|
||||
# final_md += response + "。"
|
||||
|
||||
# return final_md
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
# Ensure output path has .md extension
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
base_name = os.path.splitext(os.path.basename(self.output_path))[0]
|
||||
md_output_path = os.path.join(output_dir, f"{base_name}.md")
|
||||
|
||||
logger.info(f"Saving masked content to: {md_output_path}")
|
||||
try:
|
||||
with open(md_output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
logger.info(f"Successfully saved content to {md_output_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving content: {e}")
|
||||
raise
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
import os
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MarkdownDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
"""Read markdown content from file"""
|
||||
try:
|
||||
with open(self.input_path, 'r', encoding='utf-8') as file:
|
||||
content = file.read()
|
||||
logger.info(f"Successfully read markdown content from {self.input_path}")
|
||||
return content
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading markdown file {self.input_path}: {e}")
|
||||
raise
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
"""Save processed markdown content"""
|
||||
try:
|
||||
# Ensure output directory exists
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
logger.info(f"Successfully saved masked content to {self.output_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving content to {self.output_path}: {e}")
|
||||
raise
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import os
|
||||
import PyPDF2
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||
from magic_pdf.data.dataset import PymuDocDataset
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||
from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
||||
import logging
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__() # Call parent class's __init__
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.output_dir = os.path.dirname(output_path)
|
||||
self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]
|
||||
|
||||
# Setup output directories
|
||||
self.local_image_dir = os.path.join(self.output_dir, "images")
|
||||
self.image_dir = os.path.basename(self.local_image_dir)
|
||||
os.makedirs(self.local_image_dir, exist_ok=True)
|
||||
|
||||
# Setup work directory under output directory
|
||||
self.work_dir = os.path.join(
|
||||
os.path.dirname(output_path),
|
||||
".work",
|
||||
os.path.splitext(os.path.basename(input_path))[0]
|
||||
)
|
||||
os.makedirs(self.work_dir, exist_ok=True)
|
||||
|
||||
self.work_local_image_dir = os.path.join(self.work_dir, "images")
|
||||
self.work_image_dir = os.path.basename(self.work_local_image_dir)
|
||||
os.makedirs(self.work_local_image_dir, exist_ok=True)
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
logger.info("Starting PDF content processing")
|
||||
|
||||
# Read the PDF file
|
||||
with open(self.input_path, 'rb') as file:
|
||||
content = file.read()
|
||||
|
||||
# Initialize writers
|
||||
image_writer = FileBasedDataWriter(self.work_local_image_dir)
|
||||
md_writer = FileBasedDataWriter(self.work_dir)
|
||||
|
||||
# Create Dataset Instance
|
||||
ds = PymuDocDataset(content)
|
||||
|
||||
logger.info("Classifying PDF type: %s", ds.classify())
|
||||
# Process based on PDF type
|
||||
if ds.classify() == SupportedPdfParseMethod.OCR:
|
||||
infer_result = ds.apply(doc_analyze, ocr=True)
|
||||
pipe_result = infer_result.pipe_ocr_mode(image_writer)
|
||||
else:
|
||||
infer_result = ds.apply(doc_analyze, ocr=False)
|
||||
pipe_result = infer_result.pipe_txt_mode(image_writer)
|
||||
|
||||
logger.info("Generating all outputs")
|
||||
# Generate all outputs
|
||||
infer_result.draw_model(os.path.join(self.work_dir, f"{self.name_without_suff}_model.pdf"))
|
||||
model_inference_result = infer_result.get_infer_res()
|
||||
|
||||
pipe_result.draw_layout(os.path.join(self.work_dir, f"{self.name_without_suff}_layout.pdf"))
|
||||
pipe_result.draw_span(os.path.join(self.work_dir, f"{self.name_without_suff}_spans.pdf"))
|
||||
|
||||
md_content = pipe_result.get_markdown(self.work_image_dir)
|
||||
pipe_result.dump_md(md_writer, f"{self.name_without_suff}.md", self.work_image_dir)
|
||||
|
||||
content_list = pipe_result.get_content_list(self.work_image_dir)
|
||||
pipe_result.dump_content_list(md_writer, f"{self.name_without_suff}_content_list.json", self.work_image_dir)
|
||||
|
||||
middle_json = pipe_result.get_middle_json()
|
||||
pipe_result.dump_middle_json(md_writer, f'{self.name_without_suff}_middle.json')
|
||||
|
||||
return md_content
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
# logger.info("Starting content masking process")
|
||||
# sentences = content.split("。")
|
||||
# final_md = ""
|
||||
# for sentence in sentences:
|
||||
# if not sentence.strip(): # Skip empty sentences
|
||||
# continue
|
||||
# formatted_prompt = get_masking_mapping_prompt(sentence)
|
||||
# logger.info("Calling ollama to generate response, prompt: %s", formatted_prompt)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.info(f"Response generated: {response}")
|
||||
# final_md += response + "。"
|
||||
# return final_md
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
# Ensure output path has .md extension
|
||||
output_dir = os.path.dirname(self.output_path)
|
||||
base_name = os.path.splitext(os.path.basename(self.output_path))[0]
|
||||
md_output_path = os.path.join(output_dir, f"{base_name}.md")
|
||||
|
||||
logger.info(f"Saving masked content to: {md_output_path}")
|
||||
with open(md_output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from ...prompts.masking_prompts import get_masking_prompt
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
class TxtDocumentProcessor(DocumentProcessor):
|
||||
def __init__(self, input_path: str, output_path: str):
|
||||
super().__init__()
|
||||
self.input_path = input_path
|
||||
self.output_path = output_path
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
|
||||
def read_content(self) -> str:
|
||||
with open(self.input_path, 'r', encoding='utf-8') as file:
|
||||
return file.read()
|
||||
|
||||
# def process_content(self, content: str) -> str:
|
||||
|
||||
# formatted_prompt = get_masking_prompt(content)
|
||||
# response = self.ollama_client.generate(formatted_prompt)
|
||||
# logger.debug(f"Processed content: {response}")
|
||||
# return response
|
||||
|
||||
def save_content(self, content: str) -> None:
|
||||
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||
file.write(content)
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
import logging
|
||||
from ..document_handlers.document_factory import DocumentProcessorFactory
|
||||
from ..services.ollama_client import OllamaClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentService:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def process_document(self, input_path: str, output_path: str) -> bool:
|
||||
try:
|
||||
processor = DocumentProcessorFactory.create_processor(input_path, output_path)
|
||||
if not processor:
|
||||
logger.error(f"Unsupported file format: {input_path}")
|
||||
return False
|
||||
|
||||
# Read content
|
||||
content = processor.read_content()
|
||||
|
||||
# Process with Ollama
|
||||
masked_content = processor.process_content(content)
|
||||
|
||||
# Save processed content
|
||||
processor.save_content(masked_content)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing document {input_path}: {str(e)}")
|
||||
return False
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
def read_file(file_path):
|
||||
with open(file_path, 'r') as file:
|
||||
return file.read()
|
||||
|
||||
def write_file(file_path, content):
|
||||
with open(file_path, 'w') as file:
|
||||
file.write(content)
|
||||
|
||||
def file_exists(file_path):
|
||||
import os
|
||||
return os.path.isfile(file_path)
|
||||
|
||||
def delete_file(file_path):
|
||||
import os
|
||||
if file_exists(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
def list_files_in_directory(directory_path):
|
||||
import os
|
||||
return [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from .core.config import settings
|
||||
from .api.endpoints import mineru
|
||||
from .core.database import engine, Base
|
||||
|
||||
# Create database tables
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
||||
app = FastAPI(
|
||||
title=settings.PROJECT_NAME,
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json"
|
||||
)
|
||||
|
||||
# Set up CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # In production, replace with specific origins
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Include routers
|
||||
# app.include_router(
|
||||
# files.router,
|
||||
# prefix=f"{settings.API_V1_STR}/files",
|
||||
# tags=["files"]
|
||||
# )
|
||||
|
||||
app.include_router(
|
||||
mineru.router,
|
||||
prefix=f"{settings.API_V1_STR}/mineru",
|
||||
tags=["mineru"]
|
||||
)
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {"message": "Welcome to Legal Document Masker API"}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
from sqlalchemy import Column, String, DateTime, Text
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
from ..core.database import Base
|
||||
|
||||
class FileStatus(str):
|
||||
NOT_STARTED = "not_started"
|
||||
PROCESSING = "processing"
|
||||
SUCCESS = "success"
|
||||
FAILED = "failed"
|
||||
|
||||
class File(Base):
|
||||
__tablename__ = "files"
|
||||
|
||||
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
|
||||
filename = Column(String(255), nullable=False)
|
||||
original_path = Column(String(255), nullable=False)
|
||||
processed_path = Column(String(255))
|
||||
status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
|
||||
error_message = Column(Text)
|
||||
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
class FileBase(BaseModel):
|
||||
filename: str
|
||||
status: str
|
||||
error_message: Optional[str] = None
|
||||
|
||||
class FileResponse(FileBase):
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
class FileList(BaseModel):
|
||||
files: list[FileResponse]
|
||||
total: int
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
version: '3.8'
|
||||
|
||||
services:
|
||||
api:
|
||||
build: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
celery_worker:
|
||||
build: .
|
||||
command: celery -A app.services.file_service worker --loglevel=info
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
- api
|
||||
|
||||
redis:
|
||||
image: redis:alpine
|
||||
ports:
|
||||
- "6379:6379"
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"name": "mineru",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {}
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
# FastAPI and server
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
python-multipart>=0.0.6
|
||||
websockets>=12.0
|
||||
|
||||
# Database
|
||||
sqlalchemy>=2.0.0
|
||||
alembic>=1.12.0
|
||||
|
||||
# Background tasks
|
||||
celery>=5.3.0
|
||||
redis>=5.0.0
|
||||
|
||||
# Security
|
||||
python-jose[cryptography]>=3.3.0
|
||||
passlib[bcrypt]>=1.7.4
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
httpx>=0.25.0
|
||||
|
||||
|
||||
watchdog==2.1.6
|
||||
requests==2.28.1
|
||||
mineru==2.0.6
|
||||
numpy==1.24.3
|
||||
scikit-learn==1.3.0
|
||||
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for Mineru API endpoints
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# API base URL
|
||||
BASE_URL = "http://localhost:8000/api/v1/mineru"
|
||||
|
||||
def test_health_check():
|
||||
"""Test the health check endpoint"""
|
||||
print("Testing health check...")
|
||||
response = requests.get(f"{BASE_URL}/health")
|
||||
print(f"Status: {response.status_code}")
|
||||
print(f"Response: {response.json()}")
|
||||
print()
|
||||
|
||||
def test_parse_document(file_path: str):
|
||||
"""Test document parsing endpoint"""
|
||||
print(f"Testing document parsing with file: {file_path}")
|
||||
|
||||
# Check if file exists
|
||||
if not Path(file_path).exists():
|
||||
print(f"Error: File {file_path} not found")
|
||||
return
|
||||
|
||||
# Prepare the file upload
|
||||
with open(file_path, 'rb') as f:
|
||||
files = {'file': (Path(file_path).name, f, 'application/pdf')}
|
||||
|
||||
# Prepare parameters
|
||||
params = {
|
||||
'lang': 'ch',
|
||||
'backend': 'pipeline',
|
||||
'method': 'auto',
|
||||
'formula_enable': True,
|
||||
'table_enable': True,
|
||||
'draw_layout_bbox': True,
|
||||
'draw_span_bbox': True,
|
||||
'dump_md': True,
|
||||
'dump_middle_json': True,
|
||||
'dump_model_output': True,
|
||||
'dump_orig_pdf': True,
|
||||
'dump_content_list': True,
|
||||
'make_md_mode': 'MM_MD'
|
||||
}
|
||||
|
||||
# Make the request
|
||||
response = requests.post(f"{BASE_URL}/parse", files=files, params=params)
|
||||
|
||||
print(f"Status: {response.status_code}")
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
print("Parse successful!")
|
||||
print(f"File name: {result['file_name']}")
|
||||
print(f"Output directory: {result['output_directory']}")
|
||||
print("Generated outputs:")
|
||||
for output_type, output_path in result['outputs'].items():
|
||||
print(f" - {output_type}: {output_path}")
|
||||
else:
|
||||
print(f"Error: {response.text}")
|
||||
print()
|
||||
|
||||
def test_download_file(file_path: str):
|
||||
"""Test file download endpoint"""
|
||||
print(f"Testing file download: {file_path}")
|
||||
|
||||
response = requests.get(f"{BASE_URL}/download/{file_path}")
|
||||
print(f"Status: {response.status_code}")
|
||||
|
||||
if response.status_code == 200:
|
||||
# Save the downloaded file
|
||||
output_filename = f"downloaded_{Path(file_path).name}"
|
||||
with open(output_filename, 'wb') as f:
|
||||
f.write(response.content)
|
||||
print(f"File downloaded successfully as: {output_filename}")
|
||||
else:
|
||||
print(f"Error: {response.text}")
|
||||
print()
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Mineru API Test Script")
|
||||
print("=" * 50)
|
||||
|
||||
# Test health check
|
||||
test_health_check()
|
||||
|
||||
# Test document parsing (you'll need to provide a PDF file)
|
||||
# Uncomment and modify the path below to test with your own file
|
||||
# test_parse_document("path/to/your/document.pdf")
|
||||
|
||||
# Example of how to test file download (after parsing)
|
||||
# test_download_file("some_uuid/document_name.md")
|
||||
|
||||
print("Test completed!")
|
||||
print("\nTo test document parsing:")
|
||||
print("1. Uncomment the test_parse_document line above")
|
||||
print("2. Provide a valid PDF file path")
|
||||
print("3. Run the script again")
|
||||
print("\nTo test file download:")
|
||||
print("1. First run a parse operation to get file paths")
|
||||
print("2. Use the output paths from the parse result")
|
||||
print("3. Uncomment and modify the test_download_file line")
|
||||