fix: 解决md不允许上传的问题

refine
refine: 删除文档数据文件夹，用sample_doc取代
2025-05-26 00:06:37 +08:00 · 2025-05-25 16:45:48 +08:00 · 2025-05-25 16:43:32 +08:00 · 2025-05-25 00:37:20 +08:00 · 2025-05-25 00:04:19 +08:00 · 2025-05-24 23:28:33 +08:00
48 changed files with 18025 additions and 142 deletions
--- a/.gitignore
+++ b/.gitignore
@ -71,3 +71,6 @@ __pycache__
 data/doc_dest
 data/doc_src
 data/doc_intermediate
 node_modules
 backend/storage/
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@ -0,0 +1,27 @@
 # Backend image: FastAPI API server (the Celery worker reuses this image
 # with a different command in docker-compose.yml).
 FROM python:3.11-slim
 WORKDIR /app
 # Install system dependencies (compiler toolchain and LibreOffice).
 # NOTE(review): LibreOffice is presumably needed for office-document
 # conversion by the document processors - confirm before removing.
 RUN apt-get update && apt-get install -y \
    build-essential \
    libreoffice \
    && rm -rf /var/lib/apt/lists/*
 # Copy requirements first to leverage Docker cache
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # NOTE(review): requirements.txt already lists magic-pdf[full]; this extra
 # step only forces an upgrade to the latest release.
 RUN pip install -U magic-pdf[full]
 # Copy the rest of the application
 COPY . .
 # Create storage directories
 RUN mkdir -p storage/uploads storage/processed
 # Expose the port the app runs on
 EXPOSE 8000
 # Command to run the application
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] 
--- a/backend/README.md
+++ b/backend/README.md
@ -0,0 +1,103 @@
 # Legal Document Masker API
 This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
 ## Prerequisites
 - Python 3.9+ (the code uses built-in generics such as `list[...]`; the Docker image uses Python 3.11)
 - Redis (for Celery)
 ## File Storage
 Files are stored in the following structure:
 ```
 backend/
 ├── storage/
 │   ├── uploads/     # Original uploaded files
 │   └── processed/   # Masked/processed files
 ```
 ## Setup
 ### Option 1: Local Development
 1. Create a virtual environment:
 ```bash
 python -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 ```
 2. Install dependencies:
 ```bash
 pip install -r requirements.txt
 ```
 3. Set up environment variables:
 Create a `.env` file in the backend directory with the following variables:
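 ```env
 SECRET_KEY=your-secret-key-here
 # The built-in defaults point at the Docker Compose host name `redis`.
 # Override them when Redis runs directly on your machine:
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/0
 ```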
 The database (SQLite) will be automatically created when you first run the application.
 4. Start Redis (required for Celery):
 ```bash
 redis-server
 ```
 5. Start Celery worker:
 ```bash
 celery -A app.services.file_service worker --loglevel=info
 ```
 6. Start the FastAPI server:
 ```bash
 uvicorn app.main:app --reload
 ```
 ### Option 2: Docker Deployment
 1. Build and start the services:
 ```bash
 docker-compose up --build
 ```
 This will start:
 - FastAPI server on port 8000
 - Celery worker for background processing
 - Redis for task queue
 ## API Documentation
 Once the server is running, you can access:
 - Swagger UI: `http://localhost:8000/docs`
 - ReDoc: `http://localhost:8000/redoc`
 ## API Endpoints
 - `POST /api/v1/files/upload` - Upload a new file
 - `GET /api/v1/files/files` - List all files
 - `GET /api/v1/files/files/{file_id}` - Get file details
 - `GET /api/v1/files/files/{file_id}/download` - Download processed file
 - `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
 ## Development
 ### Running Tests
 ```bash
 pytest
 ```
 ### Code Style
 The project uses Black for code formatting:
 ```bash
 black .
 ```
 ### Docker Commands
 - Start services: `docker-compose up`
 - Start in background: `docker-compose up -d`
 - Stop services: `docker-compose down`
 - View logs: `docker-compose logs -f`
 - Rebuild: `docker-compose up --build` 
--- a/backend/app/api/endpoints/files.py
+++ b/backend/app/api/endpoints/files.py
@ -0,0 +1,111 @@
 from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, WebSocket, Response
 from fastapi.responses import FileResponse
 from sqlalchemy.orm import Session
 from typing import List
 import os
 from ...core.config import settings
 from ...core.database import get_db
 from ...models.file import File as FileModel, FileStatus
 from ...services.file_service import process_file
 from ...schemas.file import FileResponse as FileResponseSchema, FileList
 import asyncio
 from fastapi import WebSocketDisconnect
 router = APIRouter()
@router.post("/upload", response_model=FileResponseSchema)
 async def upload_file(
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
 ):
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")
    if not any(file.filename.lower().endswith(ext) for ext in settings.ALLOWED_EXTENSIONS):
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
        )
    # Save file
    file_path = settings.UPLOAD_FOLDER / file.filename
    with open(file_path, "wb") as buffer:
        content = await file.read()
        buffer.write(content)
    # Create database entry
    db_file = FileModel(
        filename=file.filename,
        original_path=str(file_path),
        status=FileStatus.NOT_STARTED
    )
    db.add(db_file)
    db.commit()
    db.refresh(db_file)
    # Start processing
    process_file.delay(str(db_file.id))
    return db_file
@router.get("/files", response_model=List[FileResponseSchema])
 def list_files(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db)
 ):
    files = db.query(FileModel).offset(skip).limit(limit).all()
    return files
@router.get("/files/{file_id}", response_model=FileResponseSchema)
 def get_file(
    file_id: str,
    db: Session = Depends(get_db)
 ):
    file = db.query(FileModel).filter(FileModel.id == file_id).first()
    if not file:
        raise HTTPException(status_code=404, detail="File not found")
    return file
@router.get("/files/{file_id}/download")
 async def download_file(
    file_id: str,
    db: Session = Depends(get_db)
 ):
    file = db.query(FileModel).filter(FileModel.id == file_id).first()
    if not file:
        raise HTTPException(status_code=404, detail="File not found")
    if file.status != FileStatus.SUCCESS:
        raise HTTPException(status_code=400, detail="File is not ready for download")
    if not os.path.exists(file.processed_path):
        raise HTTPException(status_code=404, detail="Processed file not found")
    return FileResponse(
        path=file.processed_path,
        filename=file.filename,
        media_type="application/octet-stream"
    )
@router.websocket("/ws/status/{file_id}")
 async def websocket_endpoint(websocket: WebSocket, file_id: str, db: Session = Depends(get_db)):
    await websocket.accept()
    try:
        while True:
            file = db.query(FileModel).filter(FileModel.id == file_id).first()
            if not file:
                await websocket.send_json({"error": "File not found"})
                break
            await websocket.send_json({
                "status": file.status,
                "error": file.error_message
            })
            if file.status in [FileStatus.SUCCESS, FileStatus.FAILED]:
                break
            await asyncio.sleep(1)
    except WebSocketDisconnect:
        pass 
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -0,0 +1,54 @@
 from pydantic_settings import BaseSettings
 from typing import Optional
 import os
 from pathlib import Path
 class Settings(BaseSettings):
    """Application configuration, overridable through the environment or `.env`.
    Instantiating the class also creates the storage directories on disk.
    NOTE(review): the repository also contains a ``core/config/`` package next
    to this module - confirm which one ``from .config import settings`` picks up.
    """
    # --- API ---
    API_V1_STR: str = "/api/v1"
    PROJECT_NAME: str = "Legal Document Masker API"
    # --- Security (placeholder key: must be replaced in production) ---
    SECRET_KEY: str = "your-secret-key-here"  # Change in production
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days
    # --- Database (SQLite file under <BASE_DIR>/storage) ---
    BASE_DIR: Path = Path(__file__).parent.parent.parent
    DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"
    # --- File storage ---
    UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
    PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}
    # --- Celery (defaults address a host named "redis") ---
    CELERY_BROKER_URL: str = "redis://redis:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
    # --- Ollama API ---
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"
    # --- Logging ---
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"
    class Config:
        case_sensitive = True
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"
    def __init__(self, **kwargs):
        """Load the settings, then make sure every storage directory exists."""
        super().__init__(**kwargs)
        # Upload folder, processed folder and the folder holding the database.
        required_dirs = (
            self.UPLOAD_FOLDER,
            self.PROCESSED_FOLDER,
            self.BASE_DIR / "storage",
        )
        for directory in required_dirs:
            directory.mkdir(parents=True, exist_ok=True)
 # Shared settings instance imported by the rest of the application.
 settings = Settings()
--- a/backend/app/core/config/logging_config.py
+++ b/backend/app/core/config/logging_config.py
@ -1,5 +1,6 @@
 import logging.config
-from config.settings import settings
+# from config.settings import settings
 from .settings import settings
 LOGGING_CONFIG = {
    "version": 1,
--- a/backend/app/core/config/settings.py
+++ b/backend/app/core/config/settings.py
--- a/backend/app/core/database.py
+++ b/backend/app/core/database.py
@ -0,0 +1,21 @@
 from sqlalchemy import create_engine
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from .config import settings
 # check_same_thread is an option of the sqlite3 driver only (it lets FastAPI
 # use a connection from a thread other than the one that created it). Passing
 # it to another driver fails, so it is added only for SQLite URLs; this keeps
 # DATABASE_URL overridable with a different database.
 _connect_args = (
    {"check_same_thread": False}
    if settings.DATABASE_URL.startswith("sqlite")
    else {}
 )
 engine = create_engine(
    settings.DATABASE_URL,
    connect_args=_connect_args
 )
 # Factory for sessions; changes are committed and flushed explicitly.
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 # Declarative base class shared by all ORM models.
 Base = declarative_base()
 # Dependency
 def get_db():
    """Yield a new database session and close it once the caller is done.
    Written as a generator so it can be used with FastAPI's ``Depends``.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs on normal completion and when the caller raised.
        session.close()
--- a/backend/app/core/document_handlers/document.py
+++ b/backend/app/core/document_handlers/document.py
--- a/backend/app/core/document_handlers/document_factory.py
+++ b/backend/app/core/document_handlers/document_factory.py
@ -1,7 +1,7 @@
 import os
 from typing import Optional
-from document_handlers.document_processor import DocumentProcessor
+from .document_processor import DocumentProcessor
-from document_handlers.processors import (
+from .processors import (
    TxtDocumentProcessor,
    DocxDocumentProcessor,
    PdfDocumentProcessor,
--- a/backend/app/core/document_handlers/document_processor.py
+++ b/backend/app/core/document_handlers/document_processor.py
@ -1,11 +1,13 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict
-from prompts.masking_prompts import get_masking_mapping_prompt
+from ..prompts.masking_prompts import get_masking_mapping_prompt
 import logging
 import json
-from services.ollama_client import OllamaClient
+from ..services.ollama_client import OllamaClient
-from config.settings import settings
+from ...core.config import settings
-from utils.json_extractor import LLMJsonExtractor
+from ..utils.json_extractor import LLMJsonExtractor
 logger = logging.getLogger(__name__)
--- a/backend/app/core/document_handlers/processors/init.py
+++ b/backend/app/core/document_handlers/processors/init.py
@ -0,0 +1,6 @@
 from .txt_processor import TxtDocumentProcessor
 from .docx_processor import DocxDocumentProcessor
 from .pdf_processor import PdfDocumentProcessor
 from .md_processor import MarkdownDocumentProcessor
 __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
--- a/backend/app/core/document_handlers/processors/docx_processor.py
+++ b/backend/app/core/document_handlers/processors/docx_processor.py
@ -1,13 +1,13 @@
 import os
 import docx
-from document_handlers.document_processor import DocumentProcessor
+from ...document_handlers.document_processor import DocumentProcessor
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.data.read_api import read_local_office
 import logging
-from services.ollama_client import OllamaClient
+from ...services.ollama_client import OllamaClient
-from config.settings import settings
+from ...config import settings
-from prompts.masking_prompts import get_masking_mapping_prompt
+from ...prompts.masking_prompts import get_masking_mapping_prompt
 logger = logging.getLogger(__name__)
--- a/backend/app/core/document_handlers/processors/md_processor.py
+++ b/backend/app/core/document_handlers/processors/md_processor.py
@ -1,8 +1,8 @@
 import os
-from document_handlers.document_processor import DocumentProcessor
+from ...document_handlers.document_processor import DocumentProcessor
-from services.ollama_client import OllamaClient
+from ...services.ollama_client import OllamaClient
 import logging
-from config.settings import settings
+from ...config import settings
 logger = logging.getLogger(__name__)
--- a/backend/app/core/document_handlers/processors/pdf_processor.py
+++ b/backend/app/core/document_handlers/processors/pdf_processor.py
@ -1,14 +1,14 @@
 import os
 import PyPDF2
-from document_handlers.document_processor import DocumentProcessor
+from ...document_handlers.document_processor import DocumentProcessor
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.config.enums import SupportedPdfParseMethod
-from prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
+from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
 import logging
-from services.ollama_client import OllamaClient
+from ...services.ollama_client import OllamaClient
-from config.settings import settings
+from ...config import settings
 logger = logging.getLogger(__name__)
--- a/backend/app/core/document_handlers/processors/txt_processor.py
+++ b/backend/app/core/document_handlers/processors/txt_processor.py
@ -1,8 +1,8 @@
-from document_handlers.document_processor import DocumentProcessor
+from ...document_handlers.document_processor import DocumentProcessor
-from services.ollama_client import OllamaClient
+from ...services.ollama_client import OllamaClient
 import logging
-from prompts.masking_prompts import get_masking_prompt
+from ...prompts.masking_prompts import get_masking_prompt
-from config.settings import settings
+from ...config import settings
 logger = logging.getLogger(__name__)
 class TxtDocumentProcessor(DocumentProcessor):
--- a/backend/app/core/prompts/masking_prompts.py
+++ b/backend/app/core/prompts/masking_prompts.py
--- a/backend/app/core/services/document_service.py
+++ b/backend/app/core/services/document_service.py
@ -1,12 +1,12 @@
 import logging
-from document_handlers.document_factory import DocumentProcessorFactory
+from ..document_handlers.document_factory import DocumentProcessorFactory
-from services.ollama_client import OllamaClient
+from ..services.ollama_client import OllamaClient
 logger = logging.getLogger(__name__)
 class DocumentService:
-    def __init__(self, ollama_client: OllamaClient):
+    def __init__(self):
-        self.ollama_client = ollama_client
+        pass
    def process_document(self, input_path: str, output_path: str) -> bool:
        try:
--- a/backend/app/core/services/ollama_client.py
+++ b/backend/app/core/services/ollama_client.py
--- a/backend/app/core/utils/file_utils.py
+++ b/backend/app/core/utils/file_utils.py
--- a/backend/app/core/utils/json_extractor.py
+++ b/backend/app/core/utils/json_extractor.py
--- a/backend/app/main.py
+++ b/backend/app/main.py
@ -0,0 +1,33 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from .core.config import settings
 from .api.endpoints import files
 from .core.database import engine, Base
 # Create database tables for every model registered on Base.
 # This runs at import time, before the application object is built.
 Base.metadata.create_all(bind=engine)
 # The OpenAPI schema is served below the versioned API prefix.
 app = FastAPI(
    title=settings.PROJECT_NAME,
    openapi_url=f"{settings.API_V1_STR}/openapi.json"
 )
 # Set up CORS: any origin, method and header is currently accepted.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # Include routers. The files router is mounted under "<API_V1_STR>/files",
 # so its own "/files" routes end up at "<API_V1_STR>/files/files".
 app.include_router(
    files.router,
    prefix=f"{settings.API_V1_STR}/files",
    tags=["files"]
 )
@app.get("/")
 async def root():
    return {"message": "Welcome to Legal Document Masker API"} 
--- a/backend/app/models/file.py
+++ b/backend/app/models/file.py
@ -0,0 +1,22 @@
 from sqlalchemy import Column, String, DateTime, Text
 from datetime import datetime
 import uuid
 from ..core.database import Base
 class FileStatus(str):
    """Namespace of the status strings stored in ``File.status``.
    This is a plain ``str`` subclass used as a holder of constants, not an
    ``Enum``: each attribute is an ordinary string.
    """
    # Record created, processing not yet begun.
    NOT_STARTED = "not_started"
    # A worker is currently processing the file.
    PROCESSING = "processing"
    # Processing finished and a processed file is available.
    SUCCESS = "success"
    # Processing raised an error; see File.error_message.
    FAILED = "failed"
 class File(Base):
    """ORM model for one uploaded document and its processing state."""
    __tablename__ = "files"
    # Primary key: a random UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # Name of the uploaded file.
    filename = Column(String(255), nullable=False)
    # Path of the stored upload.
    original_path = Column(String(255), nullable=False)
    # Path of the processed output; empty until processing succeeds.
    processed_path = Column(String(255))
    # One of the FileStatus string constants.
    status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
    # Text of the error that made processing fail, if any.
    error_message = Column(Text)
    # Timestamps from datetime.utcnow (naive UTC); updated_at is refreshed on every update.
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow) 
--- a/backend/app/schemas/file.py
+++ b/backend/app/schemas/file.py
@ -0,0 +1,21 @@
 from pydantic import BaseModel
 from datetime import datetime
 from typing import Optional
 from uuid import UUID
 class FileBase(BaseModel):
    """Fields shared by the file schemas."""
    # Name of the file.
    filename: str
    # Processing status string.
    status: str
    # Error text; None when no error has been recorded.
    error_message: Optional[str] = None
 class FileResponse(FileBase):
    """File record as returned by the API."""
    # NOTE(review): the ORM model stores the id as a 36-character string;
    # this presumably relies on pydantic parsing it into a UUID - confirm.
    id: UUID
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the schema from attribute-bearing objects such as ORM rows.
        from_attributes = True
 class FileList(BaseModel):
    """A list of file records together with a total count."""
    files: list[FileResponse]
    total: int 
--- a/backend/app/services/file_service.py
+++ b/backend/app/services/file_service.py
@ -0,0 +1,54 @@
 from celery import Celery
 from ..core.config import settings
 from ..models.file import File, FileStatus
 from sqlalchemy.orm import Session
 from ..core.database import SessionLocal
 import sys
 import os
 from ..core.services.document_service import DocumentService
 from pathlib import Path
 # Celery application named 'file_service'; broker and result backend URLs
 # come from the application settings.
 celery = Celery(
    'file_service',
    broker=settings.CELERY_BROKER_URL,
    backend=settings.CELERY_RESULT_BACKEND
 )
@celery.task
 def process_file(file_id: str):
    db = SessionLocal()
    try:
        file = db.query(File).filter(File.id == file_id).first()
        if not file:
            return
        # Update status to processing
        file.status = FileStatus.PROCESSING
        db.commit()
        try:
            # Process the file using your existing masking system
            process_service = DocumentService()
            # Determine output path
            input_path = Path(file.original_path)
            output_filename = f"processed_{input_path.name}"
            output_path = str(settings.PROCESSED_FOLDER / output_filename)
            # Process document with both input and output paths
            process_service.process_document(file.original_path, output_path)
            # Update file record with processed path
            file.processed_path = output_path
            file.status = FileStatus.SUCCESS
            db.commit()
        except Exception as e:
            file.status = FileStatus.FAILED
            file.error_message = str(e)
            db.commit()
            raise
    finally:
        db.close() 
--- a/backend/docker-compose.yml
+++ b/backend/docker-compose.yml
@ -0,0 +1,37 @@
 # Three services: the FastAPI server, a Celery worker built from the same
 # image, and the Redis instance both use as broker and result backend.
 version: '3.8'
 services:
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Uploaded and processed files are shared with the worker via ./storage.
      - ./storage:/app/storage
      # NOTE(review): app/core/config.py places the SQLite file under
      # storage/, which the mount above already covers - confirm whether this
      # separate bind mount is still needed.
      - ./legal_doc_masker.db:/app/legal_doc_masker.db
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
  celery_worker:
    build: .
    # Same image as the API, started as a Celery worker instead of uvicorn.
    command: celery -A app.services.file_service worker --loglevel=info
    volumes:
      - ./storage:/app/storage
      - ./legal_doc_masker.db:/app/legal_doc_masker.db
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
      - api
  redis:
    image: redis:alpine
    ports:
      - "6379:6379" 
--- a/backend/package-lock.json
+++ b/backend/package-lock.json
@ -0,0 +1,6 @@
 {
  "name": "backend",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {}
 }
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -0,0 +1,31 @@
 # FastAPI and server
 fastapi>=0.104.0
 uvicorn>=0.24.0
 python-multipart>=0.0.6
 websockets>=12.0
 # Database
 sqlalchemy>=2.0.0
 alembic>=1.12.0
 # Background tasks
 celery>=5.3.0
 redis>=5.0.0
 # Security
 python-jose[cryptography]>=3.3.0
 passlib[bcrypt]>=1.7.4
 python-dotenv>=1.0.0
 # Testing
 pytest>=7.4.0
 httpx>=0.25.0
 # Existing project dependencies
 pydantic-settings>=2.0.0
 watchdog==2.1.6
 requests==2.28.1
 python-docx>=0.8.11
 PyPDF2>=3.0.0
 pandas>=2.0.0
 magic-pdf[full] 
--- a/data/test.sh
+++ b/data/test.sh
@ -1,2 +0,0 @@
 rm ./doc_src/*.md
 cp ./doc/*.md ./doc_src/
--- a/frontend/README.md
+++ b/frontend/README.md
@ -0,0 +1,55 @@
 # Legal Document Masker Frontend
 This is the frontend application for the Legal Document Masker service. It provides a user interface for uploading legal documents, monitoring their processing status, and downloading the masked versions.
 ## Features
 - Drag and drop file upload
 - Real-time status updates
 - File list with processing status
 - Multi-file selection and download
 - Modern Material-UI interface
 ## Prerequisites
 - Node.js (v14 or higher)
 - npm (v6 or higher)
 ## Installation
 1. Install dependencies:
 ```bash
 npm install
 ```
 2. Start the development server:
 ```bash
 npm start
 ```
 The application will be available at http://localhost:3000
 ## Development
 The frontend is built with:
 - React 18
 - TypeScript
 - Material-UI
 - React Query for data fetching
 - React Dropzone for file uploads
 ## Building for Production
 To create a production build:
 ```bash
 npm run build
 ```
 The build artifacts will be stored in the `build/` directory.
 ## Environment Variables
 The following environment variables can be configured:
 - `REACT_APP_API_URL`: The URL of the backend API (default: http://localhost:8000/api/v1) 
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@ -0,0 +1,50 @@
 {
  "name": "legal-doc-masker-frontend",
  "version": "0.1.0",
  "private": true,
  "dependencies": {
    "@emotion/react": "^11.11.3",
    "@emotion/styled": "^11.11.0",
    "@mui/icons-material": "^5.15.10",
    "@mui/material": "^5.15.10",
    "@testing-library/jest-dom": "^5.17.0",
    "@testing-library/react": "^13.4.0",
    "@testing-library/user-event": "^13.5.0",
    "@types/jest": "^27.5.2",
    "@types/node": "^16.18.80",
    "@types/react": "^18.2.55",
    "@types/react-dom": "^18.2.19",
    "axios": "^1.6.7",
    "react": "^18.2.0",
    "react-dom": "^18.2.0",
    "react-dropzone": "^14.2.3",
    "react-query": "^3.39.3",
    "react-scripts": "5.0.1",
    "typescript": "^4.9.5",
    "web-vitals": "^2.1.4"
  },
  "scripts": {
    "start": "react-scripts start",
    "build": "react-scripts build",
    "test": "react-scripts test",
    "eject": "react-scripts eject"
  },
  "eslintConfig": {
    "extends": [
      "react-app",
      "react-app/jest"
    ]
  },
  "browserslist": {
    "production": [
      ">0.2%",
      "not dead",
      "not op_mini all"
    ],
    "development": [
      "last 1 chrome version",
      "last 1 firefox version",
      "last 1 safari version"
    ]
  }
 } 
--- a/frontend/public/index.html
+++ b/frontend/public/index.html
@ -0,0 +1,20 @@
 <!DOCTYPE html>
 <html lang="en">
  <head>
    <meta charset="utf-8" />
    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <meta name="theme-color" content="#000000" />
    <meta
      name="description"
      content="Legal Document Masker - Upload and process legal documents"
    />
    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
    <title>Legal Document Masker</title>
  </head>
  <body>
    <noscript>You need to enable JavaScript to run this app.</noscript>
    <div id="root"></div>
  </body>
 </html> 
--- a/frontend/public/manifest.json
+++ b/frontend/public/manifest.json
@ -0,0 +1,15 @@
 {
  "short_name": "Legal Doc Masker",
  "name": "Legal Document Masker",
  "icons": [
    {
      "src": "favicon.ico",
      "sizes": "64x64 32x32 24x24 16x16",
      "type": "image/x-icon"
    }
  ],
  "start_url": ".",
  "display": "standalone",
  "theme_color": "#000000",
  "background_color": "#ffffff"
 } 
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@ -0,0 +1,58 @@
 import React, { useEffect, useState } from 'react';
 import { Container, Typography, Box } from '@mui/material';
 import { useQuery, useQueryClient } from 'react-query';
 import FileUpload from './components/FileUpload';
 import FileList from './components/FileList';
 import { File } from './types/file';
 import { api } from './services/api';
 /**
  * Root component: shows the upload area and the list of files.
  * The list is fetched with react-query and re-fetched every 5 seconds.
  */
 function App() {
  const queryClient = useQueryClient();
  const [files, setFiles] = useState<File[]>([]);
  const filesQuery = useQuery<File[]>('files', api.listFiles, {
    refetchInterval: 5000, // Poll every 5 seconds
  });
  const latestFiles = filesQuery.data;
  // Mirror the most recent successful response into local state.
  useEffect(() => {
    if (!latestFiles) {
      return;
    }
    setFiles(latestFiles);
  }, [latestFiles]);
  // Marks the cached list as stale so react-query fetches it again.
  const refreshFiles = () => {
    queryClient.invalidateQueries('files');
  };
  if (filesQuery.isLoading) {
    return (
      <Container>
        <Typography>Loading...</Typography>
      </Container>
    );
  }
  if (filesQuery.error) {
    return (
      <Container>
        <Typography color="error">Error loading files</Typography>
      </Container>
    );
  }
  return (
    <Container maxWidth="lg">
      <Box sx={{ my: 4 }}>
        <Typography variant="h4" component="h1" gutterBottom>
          Legal Document Masker
        </Typography>
        <Box sx={{ mb: 4 }}>
          <FileUpload onUploadComplete={refreshFiles} />
        </Box>
        <FileList files={files} onFileStatusChange={refreshFiles} />
      </Box>
    </Container>
  );
 }
 export default App; 
--- a/frontend/src/components/FileList.tsx
+++ b/frontend/src/components/FileList.tsx
@ -0,0 +1,144 @@
 import React, { useState } from 'react';
 import {
  Table,
  TableBody,
  TableCell,
  TableContainer,
  TableHead,
  TableRow,
  Paper,
  IconButton,
  Checkbox,
  Button,
  Chip,
 } from '@mui/material';
 import { Download as DownloadIcon } from '@mui/icons-material';
 import { File, FileStatus } from '../types/file';
 import { api } from '../services/api';
 /** Props accepted by the FileList component. */
 interface FileListProps {
  /** Files to show, one table row each. */
  files: File[];
  /** Callback supplied by the parent; takes no arguments. */
  onFileStatusChange: () => void;
 }
 /**
  * Table of files with per-row selection and download.
  *
  * Only files whose status is SUCCESS can be downloaded: the row button is
  * shown for those files only and "Download Selected" skips every other file.
  */
 const FileList: React.FC<FileListProps> = ({ files, onFileStatusChange }) => {
  const [selectedFiles, setSelectedFiles] = useState<string[]>([]);
  // Selected ids can outlive their file when the list is refreshed, so every
  // derived value is computed from the files that are actually displayed.
  const selectedVisible = files.filter((file) => selectedFiles.includes(file.id));
  // An empty table must not render the header checkbox as checked.
  const allSelected = files.length > 0 && selectedVisible.length === files.length;
  const someSelected = selectedVisible.length > 0 && !allSelected;
  // The backend rejects downloads of files that are not processed yet.
  const downloadable = selectedVisible.filter(
    (file) => file.status === FileStatus.SUCCESS
  );
  const handleSelectFile = (fileId: string) => {
    setSelectedFiles((prev) =>
      prev.includes(fileId)
        ? prev.filter((id) => id !== fileId)
        : [...prev, fileId]
    );
  };
  const handleSelectAll = () => {
    setSelectedFiles(allSelected ? [] : files.map((file) => file.id));
  };
  // Fetches the processed file and triggers a browser download through a
  // temporary link element.
  const handleDownload = async (fileId: string) => {
    try {
      const blob = await api.downloadFile(fileId);
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = files.find((f) => f.id === fileId)?.filename || 'downloaded-file';
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } catch (error) {
      console.error('Error downloading file:', error);
    }
  };
  // Downloads run one after another to avoid firing many requests at once.
  const handleDownloadSelected = async () => {
    for (const file of downloadable) {
      await handleDownload(file.id);
    }
  };
  const getStatusColor = (status: FileStatus) => {
    switch (status) {
      case FileStatus.SUCCESS:
        return 'success';
      case FileStatus.FAILED:
        return 'error';
      case FileStatus.PROCESSING:
        return 'warning';
      default:
        return 'default';
    }
  };
  return (
    <div>
      <div style={{ marginBottom: '1rem' }}>
        <Button
          variant="contained"
          color="primary"
          onClick={handleDownloadSelected}
          disabled={downloadable.length === 0}
        >
          Download Selected
        </Button>
      </div>
      <TableContainer component={Paper}>
        <Table>
          <TableHead>
            <TableRow>
              <TableCell padding="checkbox">
                <Checkbox
                  checked={allSelected}
                  indeterminate={someSelected}
                  onChange={handleSelectAll}
                />
              </TableCell>
              <TableCell>Filename</TableCell>
              <TableCell>Status</TableCell>
              <TableCell>Created At</TableCell>
              <TableCell>Actions</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {files.map((file) => (
              <TableRow key={file.id}>
                <TableCell padding="checkbox">
                  <Checkbox
                    checked={selectedFiles.includes(file.id)}
                    onChange={() => handleSelectFile(file.id)}
                  />
                </TableCell>
                <TableCell>{file.filename}</TableCell>
                <TableCell>
                  <Chip
                    label={file.status}
                    color={getStatusColor(file.status) as any}
                    size="small"
                  />
                </TableCell>
                <TableCell>
                  {new Date(file.created_at).toLocaleString()}
                </TableCell>
                <TableCell>
                  {file.status === FileStatus.SUCCESS && (
                    <IconButton
                      onClick={() => handleDownload(file.id)}
                      size="small"
                    >
                      <DownloadIcon />
                    </IconButton>
                  )}
                </TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    </div>
  );
 };
 export default FileList; 
--- a/frontend/src/components/FileUpload.tsx
+++ b/frontend/src/components/FileUpload.tsx
@ -0,0 +1,66 @@
 import React, { useCallback } from 'react';
 import { useDropzone } from 'react-dropzone';
 import { Box, Typography, CircularProgress } from '@mui/material';
 import { api } from '../services/api';
 /** Props accepted by the FileUpload component. */
 interface FileUploadProps {
  /** Called once after every dropped file has been uploaded successfully. */
  onUploadComplete: () => void;
 }
 /**
  * Drag-and-drop upload area for PDF, Word and Markdown documents.
  * Dropped files are uploaded one at a time; a spinner replaces the prompt
  * while an upload is running.
  */
 const FileUpload: React.FC<FileUploadProps> = ({ onUploadComplete }) => {
  const [isUploading, setIsUploading] = React.useState(false);
  const onDrop = useCallback(
    async (droppedFiles: File[]) => {
      setIsUploading(true);
      try {
        // Sequential on purpose: the next upload starts after the previous one ends.
        for (const dropped of droppedFiles) {
          await api.uploadFile(dropped);
        }
        onUploadComplete();
      } catch (error) {
        console.error('Error uploading files:', error);
      } finally {
        setIsUploading(false);
      }
    },
    [onUploadComplete]
  );
  const dropzone = useDropzone({
    onDrop,
    accept: {
      'application/pdf': ['.pdf'],
      'application/msword': ['.doc'],
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
      'text/markdown': ['.md'],
    },
  });
  const { getRootProps, getInputProps, isDragActive } = dropzone;
  const promptText = isDragActive
    ? 'Drop the files here...'
    : 'Drag and drop files here, or click to select files';
  const backgroundColor = isDragActive ? 'action.hover' : 'background.paper';
  return (
    <Box
      {...getRootProps()}
      sx={{
        border: '2px dashed #ccc',
        borderRadius: 2,
        p: 3,
        textAlign: 'center',
        cursor: 'pointer',
        bgcolor: backgroundColor,
        '&:hover': {
          bgcolor: 'action.hover',
        },
      }}
    >
      <input {...getInputProps()} />
      {isUploading ? <CircularProgress /> : <Typography>{promptText}</Typography>}
    </Box>
  );
 };
 export default FileUpload; 
--- a/frontend/src/index.tsx
+++ b/frontend/src/index.tsx
@ -0,0 +1,29 @@
 import React from 'react';
 import ReactDOM from 'react-dom/client';
 import { QueryClient, QueryClientProvider } from 'react-query';
 import { ThemeProvider, createTheme } from '@mui/material';
 import CssBaseline from '@mui/material/CssBaseline';
 import App from './App';
 // Single react-query client handed to the whole component tree.
 const queryClient = new QueryClient();
 // Material UI theme: only the palette mode is set, to light.
 const theme = createTheme({ palette: { mode: 'light' } });
 // Mount point looked up by id; the cast asserts to TypeScript that it is an HTMLElement.
 const rootElement = document.getElementById('root') as HTMLElement;
 const root = ReactDOM.createRoot(rootElement);
 // Provider order: react-query outermost, then the MUI theme, then the CSS reset and the app.
 root.render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <ThemeProvider theme={theme}>
        <CssBaseline />
        <App />
      </ThemeProvider>
    </QueryClientProvider>
  </React.StrictMode>
 );
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@ -0,0 +1,34 @@
 import axios from 'axios';
 import { File, FileUploadResponse } from '../types/file';
 // Base URL that every request below is built on.
 const API_BASE_URL = 'http://localhost:8000/api/v1';
 /**
  * Thin axios wrapper around the backend file endpoints.
  * Every method resolves with the response body and rejects with the axios
  * error when the request fails; no error handling is done here.
  */
 export const api = {
  /**
   * Upload one file as multipart form data under the `file` field.
   * @param file Browser File object to send.
   * @returns The response body of POST /files/upload.
   */
  uploadFile: async (file: globalThis.File): Promise<FileUploadResponse> => {
    const formData = new FormData();
    formData.append('file', file);
    const response = await axios.post(`${API_BASE_URL}/files/upload`, formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    });
    return response.data;
  },
  /**
   * Fetch the list of files.
   * @returns The response body of GET /files/files.
   */
  listFiles: async (): Promise<File[]> => {
    const response = await axios.get(`${API_BASE_URL}/files/files`);
    return response.data;
  },
  /**
   * Fetch a single file record.
   * @param fileId Identifier of the file; percent-encoded before being placed in the URL path.
   * @returns The response body of GET /files/files/{fileId}.
   */
  getFile: async (fileId: string): Promise<File> => {
    // Encode the id so characters such as "/", "?" or "#" cannot alter the request path.
    const response = await axios.get(`${API_BASE_URL}/files/files/${encodeURIComponent(fileId)}`);
    return response.data;
  },
  /**
   * Download a file's content.
   * @param fileId Identifier of the file; percent-encoded before being placed in the URL path.
   * @returns The response body as a Blob.
   */
  downloadFile: async (fileId: string): Promise<Blob> => {
    const response = await axios.get(`${API_BASE_URL}/files/files/${encodeURIComponent(fileId)}/download`, {
      // Ask axios for the raw binary body instead of parsed JSON/text.
      responseType: 'blob',
    });
    return response.data;
  },
 };
--- a/frontend/src/types/file.ts
+++ b/frontend/src/types/file.ts
@ -0,0 +1,23 @@
 /**
  * Status values a file record can carry; each member is a lowercase string.
  * NOTE(review): presumably these mirror the status strings returned by the backend — confirm.
  */
 export enum FileStatus {
  NOT_STARTED = "not_started",
  PROCESSING = "processing",
  SUCCESS = "success",
  FAILED = "failed"
 }
 /**
  * A file record.
  * This name shadows the DOM `File` type inside any module that imports it.
  */
 export interface File {
  id: string;
  filename: string;
  status: FileStatus;
  // Optional; presumably only populated when status is FAILED — confirm against the backend.
  error_message?: string;
  // Timestamps are typed as strings; the exact format is not visible here.
  created_at: string;
  updated_at: string;
 }
 /**
  * Upload response type: the same fields as `File` except that it has no `error_message`.
  */
 export interface FileUploadResponse {
  id: string;
  filename: string;
  status: FileStatus;
  // Timestamps are typed as strings; the exact format is not visible here.
  created_at: string;
  updated_at: string;
 } 
--- a/frontend/tsconfig.json
+++ b/frontend/tsconfig.json
@ -0,0 +1,26 @@
 {
  "compilerOptions": {
    "target": "es5",
    "lib": [
      "dom",
      "dom.iterable",
      "esnext"
    ],
    "allowJs": true,
    "skipLibCheck": true,
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "strict": true,
    "forceConsistentCasingInFileNames": true,
    "noFallthroughCasesInSwitch": true,
    "module": "esnext",
    "moduleResolution": "node",
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx"
  },
  "include": [
    "src"
  ]
 } 
--- a/sample_doc/20220707_na_decision-2.docx
+++ b/sample_doc/20220707_na_decision-2.docx
--- a/sample_doc/20220707_na_decision-2.md
+++ b/sample_doc/20220707_na_decision-2.md
--- a/sample_doc/20220707_na_decision-2.pdf
+++ b/sample_doc/20220707_na_decision-2.pdf
--- a/src/config/settings.py
+++ b/src/config/settings.py
@ -1,31 +0,0 @@
 # settings.py
 from pydantic_settings import BaseSettings
 from typing import Optional
 class Settings(BaseSettings):
    """Application configuration built on pydantic-settings ``BaseSettings``.

    Each field has an in-code default. The configuration below points
    pydantic-settings at a UTF-8 ``.env`` file and allows extra keys.
    """

    # Storage paths
    OBJECT_STORAGE_PATH: str = ""
    TARGET_DIRECTORY_PATH: str = ""
    # Ollama API settings
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"
    # File monitoring settings
    # NOTE(review): presumably a polling interval in seconds - confirm against its consumers.
    MONITOR_INTERVAL: int = 5
    # Logging settings
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    # pydantic v2 configuration. This replaces the nested ``class Config``,
    # which is deprecated in v2 and emits a deprecation warning. A plain
    # dict is used so no extra import is needed.
    model_config = {
        "env_file": ".env",
        "env_file_encoding": "utf-8",
        "extra": "allow",
    }
 # Create settings instance
 settings = Settings()
--- a/src/document_handlers/processors/init.py
+++ b/src/document_handlers/processors/init.py
@ -1,6 +0,0 @@
 from document_handlers.processors.txt_processor import TxtDocumentProcessor
 from document_handlers.processors.docx_processor import DocxDocumentProcessor
 from document_handlers.processors.pdf_processor import PdfDocumentProcessor
 from document_handlers.processors.md_processor import MarkdownDocumentProcessor
 # Names exported by ``from ... import *``: the four document processor classes imported above.
 __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
--- a/src/main.py
+++ b/src/main.py
@ -1,22 +0,0 @@
 from config.logging_config import setup_logging
 def main():
    """Configure logging, then start the file monitor on the configured directories."""
    # Logging is configured before the imports below, presumably so that
    # loggers created while those modules load pick up the configuration.
    setup_logging()
    from services.file_monitor import FileMonitor
    from config.settings import settings
    import logging
    log = logging.getLogger(__name__)
    source_dir = settings.OBJECT_STORAGE_PATH
    target_dir = settings.TARGET_DIRECTORY_PATH
    log.info("Starting the application")
    log.info(f"Monitoring directory: {source_dir}")
    log.info(f"Target directory: {target_dir}")
    # Build the monitor and start its polling loop.
    monitor = FileMonitor(source_dir, target_dir)
    monitor.start_monitoring()
 if __name__ == "__main__":
    main()
--- a/src/services/file_monitor.py
+++ b/src/services/file_monitor.py
@ -1,54 +0,0 @@
 import logging
 import os
 from services.document_service import DocumentService
 from services.ollama_client import OllamaClient
 from config.settings import settings
 logger = logging.getLogger(__name__)
 class FileMonitor:
    """Poll an input directory and hand every newly appearing file to DocumentService.

    Entries already present when monitoring starts are treated as seen and are
    not processed. Each output is written to the output directory under the
    same file name as its input.
    """

    def __init__(self, input_directory: str, output_directory: str):
        """Store the directories and build the document service.

        Args:
            input_directory: Directory that is polled for new files.
            output_directory: Directory that receives the processed files.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory
        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file and write the result under the same name in the output directory.

        Any exception is logged with its traceback and swallowed, so that one
        bad file cannot stop the monitoring loop.

        Args:
            file_path: Path of the file to process.
        """
        try:
            # Get the filename without directory path
            filename = os.path.basename(file_path)
            # Create output path
            output_path = os.path.join(self.output_directory, filename)
            logger.info(f"Processing file: {filename}")
            # Process the document using document service
            self.document_service.process_document(file_path, output_path)
            logger.info(f"File processed successfully: {filename}")
        except Exception as e:
            # logger.exception logs at ERROR level and appends the traceback,
            # which the previous logger.error(str(e)) call discarded.
            logger.exception(f"Error processing file {file_path}: {str(e)}")

    def start_monitoring(self, poll_interval: float = 1.0) -> None:
        """Poll the input directory forever, processing each new regular file.

        This method never returns.

        Args:
            poll_interval: Seconds to sleep between directory scans. Defaults
                to 1.0, the interval that was previously hard-coded.
        """
        import time
        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)
        already_seen = set(os.listdir(self.input_directory))
        while True:
            time.sleep(poll_interval)
            current_files = set(os.listdir(self.input_directory))
            # Sorted so files found in the same scan are handled in a stable order.
            for new_file in sorted(current_files - already_seen):
                file_path = os.path.join(self.input_directory, new_file)
                if not os.path.isfile(file_path):
                    # os.listdir also returns sub-directories and other
                    # non-file entries; they are not documents, so skip them.
                    continue
                logger.info(f"New file found: {new_file}")
                self.process_new_file(file_path)
            already_seen = current_files
Author	SHA1	Message	Date
oliviamn	345fd05a2b	fix: 解决md不允许上传的问题	2025-05-26 00:06:37 +08:00
oliviamn	b3cf9f98a7	refine	2025-05-25 16:45:48 +08:00
oliviamn	24c5bbd5d7	refine: 删除文档数据文件夹，用sample_doc取代	2025-05-25 16:43:32 +08:00
oliviamn	13ef24a3da	feat：增加前端	2025-05-25 00:37:20 +08:00
oliviamn	900a614b09	refine: 解决了导入路径的问题	2025-05-25 00:04:19 +08:00
oliviamn	3e9c44e8c4	refine: 将原src的内容复制到backend/app/core	2025-05-24 23:28:33 +08:00
oliviamn	e0695e7f0e	refine: src rename to core	2025-05-24 22:13:20 +08:00
oliviamn	76b0351f8f	feat: 增加backend	2025-05-24 22:06:28 +08:00
		`@ -1,2 +0,0 @@`
			`rm ./doc_src/*.md`
			`cp ./doc/*.md ./doc_src/`