feat: 增加backend

This commit is contained in:
oliviamn 2025-05-24 22:06:28 +08:00
parent 47e78c35bb
commit 76b0351f8f
11 changed files with 485 additions and 0 deletions

24
backend/Dockerfile Normal file
View File

@ -0,0 +1,24 @@
# Image shared by the API server and the Celery worker (docker-compose builds both from it).
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
# build-essential supplies a compiler toolchain (presumably for Python packages that
# build native extensions — TODO confirm it is still required). The apt package lists
# are removed in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first to leverage Docker cache:
# the dependency layer is rebuilt only when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application
# NOTE(review): app/services/file_service.py imports `src.main` via a sys.path entry four
# directories above itself — confirm that package is importable inside this image.
COPY . .
# Create storage directories (relative to WORKDIR, i.e. /app/storage/...)
RUN mkdir -p storage/uploads storage/processed
# Expose the port the app runs on
EXPOSE 8000
# Command to run the application; binds to all interfaces so the published port is reachable
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

103
backend/README.md Normal file
View File

@ -0,0 +1,103 @@
# Legal Document Masker API
This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
## Prerequisites
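- Python 3.9+ (the code uses built-in generic annotations such as `list[...]`; the Docker image runs Python 3.11)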
- Redis (for Celery)
## File Storage
Files are stored in the following structure:
```
backend/
├── storage/
│ ├── uploads/ # Original uploaded files
│ └── processed/ # Masked/processed files
```
## Setup
### Option 1: Local Development
1. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Set up environment variables:
Create a `.env` file in the backend directory with the following variables:
```env
SECRET_KEY=your-secret-key-here
```
The database (SQLite) will be automatically created when you first run the application.
4. Start Redis (required for Celery):
```bash
redis-server
```
5. Start Celery worker:
```bash
celery -A app.services.file_service worker --loglevel=info
```
6. Start the FastAPI server:
```bash
uvicorn app.main:app --reload
```
### Option 2: Docker Deployment
1. Build and start the services:
```bash
docker-compose up --build
```
This will start:
- FastAPI server on port 8000
- Celery worker for background processing
- Redis for task queue
## API Documentation
Once the server is running, you can access:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`
## API Endpoints
- `POST /api/v1/files/upload` - Upload a new file
- `GET /api/v1/files/files` - List all files (supports `skip` and `limit` query parameters)
- `GET /api/v1/files/files/{file_id}` - Get file details
- `GET /api/v1/files/files/{file_id}/download` - Download processed file
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
## Development
### Running Tests
```bash
pytest
```
### Code Style
The project uses Black for code formatting:
```bash
black .
```
### Docker Commands
- Start services: `docker-compose up`
- Start in background: `docker-compose up -d`
- Stop services: `docker-compose down`
- View logs: `docker-compose logs -f`
- Rebuild: `docker-compose up --build`

View File

@ -0,0 +1,111 @@
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, WebSocket, Response
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import List
import os
from ...core.config import settings
from ...core.database import get_db
from ...models.file import File as FileModel, FileStatus
from ...services.file_service import process_file
from ...schemas.file import FileResponse as FileResponseSchema, FileList
import asyncio
from fastapi import WebSocketDisconnect
# Router holding the file endpoints; app/main.py mounts it under f"{settings.API_V1_STR}/files".
router = APIRouter()
# Store an uploaded document, record it in the database and queue it for masking.
#
# Parameters:
#   file - multipart upload; its filename comes from the client and is untrusted.
#   db   - database session supplied by get_db.
# Returns the newly created FileModel row (serialized via FileResponseSchema).
# Raises HTTPException:
#   400 - no usable filename, or extension not in settings.ALLOWED_EXTENSIONS
#   413 - payload larger than settings.MAX_FILE_SIZE
@router.post("/upload", response_model=FileResponseSchema)
async def upload_file(
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Keep only the final path component so names such as "../../etc/passwd"
    # (or their backslash form) cannot escape the upload folder.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    if not safe_name or safe_name in {".", ".."}:
        raise HTTPException(status_code=400, detail="Invalid filename")

    # Compare the real extension instead of a bare suffix match, which would
    # also accept names like "notapdf". Tolerates entries configured with or
    # without a leading dot.
    extension = os.path.splitext(safe_name)[1].lower().lstrip(".")
    allowed_extensions = {ext.lower().lstrip(".") for ext in settings.ALLOWED_EXTENSIONS}
    if extension not in allowed_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without buffering an arbitrarily large body.
    content = await file.read(settings.MAX_FILE_SIZE + 1)
    if len(content) > settings.MAX_FILE_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {settings.MAX_FILE_SIZE} bytes"
        )

    # Save file (only after validation, so rejected uploads leave nothing on disk)
    # NOTE(review): two uploads with the same name still share one path on disk;
    # consider a per-upload unique name if that matters.
    file_path = settings.UPLOAD_FOLDER / safe_name
    with open(file_path, "wb") as buffer:
        buffer.write(content)

    # Create database entry
    db_file = FileModel(
        filename=safe_name,
        original_path=str(file_path),
        status=FileStatus.NOT_STARTED
    )
    db.add(db_file)
    db.commit()
    db.refresh(db_file)

    # Start processing in the Celery worker
    process_file.delay(str(db_file.id))
    return db_file
# Return one page of file records.
#   skip  - number of rows to skip from the start of the result set
#   limit - maximum number of rows to return
# The query applies no explicit ordering.
@router.get("/files", response_model=List[FileResponseSchema])
def list_files(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    page_query = db.query(FileModel).offset(skip).limit(limit)
    return page_query.all()
# Look up a single file record by its id.
# Responds with 404 ("File not found") when no row matches.
@router.get("/files/{file_id}", response_model=FileResponseSchema)
def get_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    matching_rows = db.query(FileModel).filter(FileModel.id == file_id)
    record = matching_rows.first()
    if record:
        return record
    raise HTTPException(status_code=404, detail="File not found")
# Send the processed (masked) version of a document to the client.
#
# Raises HTTPException:
#   404 - unknown file id, or the processed file is missing
#   400 - the file has not reached FileStatus.SUCCESS yet
# The response is served as application/octet-stream under the stored filename.
@router.get("/files/{file_id}/download")
async def download_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    file = db.query(FileModel).filter(FileModel.id == file_id).first()
    if not file:
        raise HTTPException(status_code=404, detail="File not found")
    if file.status != FileStatus.SUCCESS:
        raise HTTPException(status_code=400, detail="File is not ready for download")
    # processed_path is a nullable column; os.path.exists(None) raises TypeError,
    # so an unset path is reported as a 404 instead of an internal error.
    if not file.processed_path or not os.path.exists(file.processed_path):
        raise HTTPException(status_code=404, detail="Processed file not found")
    return FileResponse(
        path=file.processed_path,
        filename=file.filename,
        media_type="application/octet-stream"
    )
# Push the processing status of one file over a WebSocket, once per second.
#
# Each message is {"status": ..., "error": ...}. The loop stops after sending
# {"error": "File not found"} for an unknown id, or once the status reaches
# FileStatus.SUCCESS / FileStatus.FAILED. A client disconnect ends it silently.
@router.websocket("/ws/status/{file_id}")
async def websocket_endpoint(websocket: WebSocket, file_id: str, db: Session = Depends(get_db)):
    await websocket.accept()
    try:
        while True:
            # The session keeps loaded objects in its identity map and does not
            # overwrite their attributes on a repeated query. Expire them so
            # each poll sees the status written by the Celery worker.
            db.expire_all()
            file = db.query(FileModel).filter(FileModel.id == file_id).first()
            if not file:
                await websocket.send_json({"error": "File not found"})
                break
            await websocket.send_json({
                "status": file.status,
                "error": file.error_message
            })
            if file.status in [FileStatus.SUCCESS, FileStatus.FAILED]:
                break
            await asyncio.sleep(1)
        # Finish with an explicit close instead of relying on the server to
        # drop the connection when the handler returns.
        await websocket.close()
    except WebSocketDisconnect:
        # Client went away; nothing left to report.
        pass

View File

@ -0,0 +1,39 @@
from pydantic_settings import BaseSettings
from typing import Optional
import os
from pathlib import Path
class Settings(BaseSettings):
    """Application configuration.

    Every field has a default and is declared on a ``BaseSettings`` subclass;
    the inner ``Config`` points at a ``.env`` file and makes setting names
    case-sensitive. Instantiating the class also creates the upload and
    processed storage directories if they are missing.
    """

    # API Settings
    API_V1_STR: str = "/api/v1"
    PROJECT_NAME: str = "Legal Document Masker API"

    # Security
    SECRET_KEY: str = "your-secret-key-here"  # Placeholder default: change in production
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

    # Database
    DATABASE_URL: str = "sqlite:///./legal_doc_masker.db"

    # File Storage (BASE_DIR is three directory levels above this module)
    BASE_DIR: Path = Path(__file__).parent.parent.parent
    UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
    PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc"}  # stored without a leading dot

    # Celery
    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/0"

    class Config:
        case_sensitive = True
        env_file = ".env"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Make sure both storage directories exist before anything writes to them.
        for storage_dir in (self.UPLOAD_FOLDER, self.PROCESSED_FOLDER):
            storage_dir.mkdir(parents=True, exist_ok=True)


# Module-level instance imported by the rest of the application.
settings = Settings()

View File

@ -0,0 +1,21 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from .config import settings
# Create the database engine.
# `check_same_thread` is an option of the SQLite driver only: it lets a connection
# be used from a thread other than the one that created it, which FastAPI needs.
# Other drivers reject the option, so it is passed only when DATABASE_URL points
# at SQLite. This keeps DATABASE_URL overridable through the environment.
_connect_args = (
    {"check_same_thread": False}
    if settings.DATABASE_URL.startswith("sqlite")
    else {}
)
engine = create_engine(
    settings.DATABASE_URL,
    connect_args=_connect_args
)
# Session factory: sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class that the ORM models inherit from.
Base = declarative_base()
# FastAPI dependency
def get_db():
    """Yield a new database session and close it once the caller is done.

    The session is closed in a ``finally`` clause, so it is released even when
    the code consuming the generator raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()

33
backend/app/main.py Normal file
View File

@ -0,0 +1,33 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .core.config import settings
from .api.endpoints import files
from .core.database import engine, Base
# Create database tables
# Runs at import time, against the metadata of the shared declarative Base.
Base.metadata.create_all(bind=engine)

# The OpenAPI document is served under the versioned API prefix.
app = FastAPI(
    title=settings.PROJECT_NAME,
    openapi_url=f"{settings.API_V1_STR}/openapi.json"
)

# Set up CORS
# NOTE(review): FastAPI's CORS documentation says wildcard origins cannot be combined
# with allow_credentials=True for credentialed requests — confirm whether credentials
# are really needed, and list explicit origins if so.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers
# Every path declared on files.router is served below f"{settings.API_V1_STR}/files".
app.include_router(
    files.router,
    prefix=f"{settings.API_V1_STR}/files",
    tags=["files"]
)
# Landing endpoint: returns a fixed welcome message.
@app.get("/")
async def root():
    welcome = {"message": "Welcome to Legal Document Masker API"}
    return welcome

View File

@ -0,0 +1,22 @@
from sqlalchemy import Column, String, DateTime, Text
from datetime import datetime
import uuid
from ..core.database import Base
class FileStatus(str):
    """Holder for the status strings stored in ``File.status``.

    The class subclasses ``str`` but is used only as a namespace: each
    attribute below is a plain string constant.
    """

    NOT_STARTED = "not_started"  # default status of a new File row
    PROCESSING = "processing"
    SUCCESS = "success"
    FAILED = "failed"
def _new_file_id():
    """Return a freshly generated UUID4 rendered as a string."""
    return str(uuid.uuid4())


class File(Base):
    """ORM model for one uploaded document and its processing state."""

    __tablename__ = "files"

    # Identity: a UUID4 string generated on insert.
    id = Column(String(36), primary_key=True, default=_new_file_id)

    # Name and location of the upload, plus the processed output (unset until written).
    filename = Column(String(255), nullable=False)
    original_path = Column(String(255), nullable=False)
    processed_path = Column(String(255))

    # One of the FileStatus constants, and an optional failure description.
    status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
    error_message = Column(Text)

    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)

View File

@ -0,0 +1,21 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional
from uuid import UUID
# Fields shared by the file schemas: the file's name, its status string,
# and an optional error message (None when absent).
class FileBase(BaseModel):
    filename: str
    status: str
    error_message: Optional[str] = None
# API representation of a stored file record: FileBase plus id and timestamps.
class FileResponse(FileBase):
    id: UUID
    created_at: datetime
    updated_at: datetime

    class Config:
        # Lets the schema be populated from object attributes (e.g. an ORM row),
        # not only from dicts.
        from_attributes = True
# Envelope pairing a list of file records with a count.
# NOTE(review): `total` is presumably the overall number of files — confirm against
# whichever caller populates this schema.
class FileList(BaseModel):
    files: list[FileResponse]
    total: int

View File

@ -0,0 +1,47 @@
from celery import Celery
from ..core.config import settings
from ..models.file import File, FileStatus
from sqlalchemy.orm import Session
from ..core.database import SessionLocal
import sys
import os
# Make the masking system importable: the four dirname() calls climb from this file
# to the fourth ancestor directory of its path, which is appended to sys.path so
# that `src.main` can be found there.
# NOTE(review): confirm that directory (and the `src` package) exists wherever the
# worker runs, including inside the Docker image.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))
from src.main import process_document # Existing masking entry point, called with the uploaded file's path

# Celery application; broker and result backend come from the application settings.
celery = Celery(
    'file_service',
    broker=settings.CELERY_BROKER_URL,
    backend=settings.CELERY_RESULT_BACKEND
)
@celery.task
def process_file(file_id: str):
    """Run the masking pipeline for one uploaded file and record the outcome.

    The file row is marked PROCESSING, ``process_document`` is called with the
    stored upload path, and the row is then marked SUCCESS (with the returned
    output path) or FAILED (with the error text).

    Args:
        file_id: Primary key of the ``File`` row to process. An unknown id is
            ignored and the task returns without doing anything.

    Raises:
        Exception: Whatever processing (or persisting its result) raised,
            re-raised after the row has been marked FAILED so that Celery
            records the task as failed.
    """
    db = SessionLocal()
    try:
        file = db.query(File).filter(File.id == file_id).first()
        if not file:
            return

        # Update status to processing
        file.status = FileStatus.PROCESSING
        db.commit()

        try:
            # Process the file using the existing masking system
            output_path = process_document(file.original_path)

            # Update file record with processed path
            file.processed_path = output_path
            file.status = FileStatus.SUCCESS
            db.commit()
        except Exception as e:
            # The failure may have come from the commit above, which leaves the
            # session unusable until it is rolled back. Roll back first so the
            # FAILED status below can actually be persisted.
            db.rollback()
            file.status = FileStatus.FAILED
            file.error_message = str(e)
            db.commit()
            raise
    finally:
        db.close()

View File

@ -0,0 +1,33 @@
# Compose stack: FastAPI server, Celery worker and Redis.
version: '3.8'
services:
  # FastAPI server, built from the Dockerfile in this directory.
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Uploaded and processed files are kept on the host.
      - ./storage:/app/storage
      # SQLite database file.
      # NOTE(review): Docker creates a directory when a bind-mount source is missing —
      # confirm ./legal_doc_masker.db exists as a file before the first `up`.
      - ./legal_doc_masker.db:/app/legal_doc_masker.db
    environment:
      # Point Celery at the `redis` service instead of the localhost default.
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
  # Background worker: same build, but runs the Celery worker command.
  # It mounts the same storage directory and database file as the API.
  celery_worker:
    build: .
    command: celery -A app.services.file_service worker --loglevel=info
    volumes:
      - ./storage:/app/storage
      - ./legal_doc_masker.db:/app/legal_doc_masker.db
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
      - api
  # Redis serves as Celery broker and result backend; the port is also published to the host.
  redis:
    image: redis:alpine
    ports:
      - "6379:6379"

31
backend/requirements.txt Normal file
View File

@ -0,0 +1,31 @@
# Python dependencies for the backend (installed by the Dockerfile via `pip install -r`).
# FastAPI and server
fastapi>=0.104.0
uvicorn>=0.24.0
python-multipart>=0.0.6
websockets>=12.0
# Database
sqlalchemy>=2.0.0
alembic>=1.12.0
# Background tasks
celery>=5.3.0
redis>=5.0.0
# Security
# NOTE(review): python-jose and passlib are not referenced by the code in this commit —
# presumably reserved for upcoming authentication work; confirm.
python-jose[cryptography]>=3.3.0
passlib[bcrypt]>=1.7.4
python-dotenv>=1.0.0
# Testing
pytest>=7.4.0
httpx>=0.25.0
# Existing project dependencies
# (pydantic-settings is also imported by the backend's settings module)
pydantic-settings>=2.0.0
watchdog==2.1.6
requests==2.28.1
python-docx>=0.8.11
PyPDF2>=3.0.0
pandas>=2.0.0
# NOTE(review): unpinned, unlike the other entries — consider adding a version bound.
magic-pdf[full]