Compare commits

..

8 Commits

Author SHA1 Message Date
oliviamn 345fd05a2b fix: 解决md不允许上传的问题 2025-05-26 00:06:37 +08:00
oliviamn b3cf9f98a7 refine 2025-05-25 16:45:48 +08:00
oliviamn 24c5bbd5d7 refine: 删除文档数据文件夹,用sample_doc取代 2025-05-25 16:43:32 +08:00
oliviamn 13ef24a3da feat:增加前端 2025-05-25 00:37:20 +08:00
oliviamn 900a614b09 refine: 解决了导入路径的问题 2025-05-25 00:04:19 +08:00
oliviamn 3e9c44e8c4 refine: 将原src的内容复制到backend/app/core 2025-05-24 23:28:33 +08:00
oliviamn e0695e7f0e refine: src rename to core 2025-05-24 22:13:20 +08:00
oliviamn 76b0351f8f feat: 增加backend 2025-05-24 22:06:28 +08:00
48 changed files with 18025 additions and 142 deletions

3
.gitignore vendored
View File

@ -71,3 +71,6 @@ __pycache__
data/doc_dest data/doc_dest
data/doc_src data/doc_src
data/doc_intermediate data/doc_intermediate
node_modules
backend/storage/

27
backend/Dockerfile Normal file
View File

@ -0,0 +1,27 @@
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
build-essential \
libreoffice \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -U magic-pdf[full]
# Copy the rest of the application
COPY . .
# Create storage directories
RUN mkdir -p storage/uploads storage/processed
# Expose the port the app runs on
EXPOSE 8000
# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

103
backend/README.md Normal file
View File

@ -0,0 +1,103 @@
# Legal Document Masker API
This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
## Prerequisites
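- Python 3.9+ (the schemas use built-in generic annotations such as `list[...]`; the Docker image runs Python 3.11)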
- Redis (for Celery)
## File Storage
Files are stored in the following structure:
```
backend/
├── storage/
│ ├── uploads/ # Original uploaded files
│ └── processed/ # Masked/processed files
```
## Setup
### Option 1: Local Development
1. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Set up environment variables:
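Create a `.env` file in the backend directory with the following variables. When running Redis locally (outside Docker), also set `CELERY_BROKER_URL` and `CELERY_RESULT_BACKEND` to `redis://localhost:6379/0`, because the defaults point at the Docker Compose hostname `redis`: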
```env
SECRET_KEY=your-secret-key-here
```
The database (SQLite) will be automatically created when you first run the application.
4. Start Redis (required for Celery):
```bash
redis-server
```
5. Start Celery worker:
```bash
celery -A app.services.file_service worker --loglevel=info
```
6. Start the FastAPI server:
```bash
uvicorn app.main:app --reload
```
### Option 2: Docker Deployment
1. Build and start the services:
```bash
docker-compose up --build
```
This will start:
- FastAPI server on port 8000
- Celery worker for background processing
- Redis for task queue
## API Documentation
Once the server is running, you can access:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`
## API Endpoints
- `POST /api/v1/files/upload` - Upload a new file
- `GET /api/v1/files/files` - List all files
- `GET /api/v1/files/files/{file_id}` - Get file details
- `GET /api/v1/files/files/{file_id}/download` - Download processed file
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
## Development
### Running Tests
```bash
pytest
```
### Code Style
The project uses Black for code formatting:
```bash
black .
```
### Docker Commands
- Start services: `docker-compose up`
- Start in background: `docker-compose up -d`
- Stop services: `docker-compose down`
- View logs: `docker-compose logs -f`
- Rebuild: `docker-compose up --build`

View File

@ -0,0 +1,111 @@
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, WebSocket, Response
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import List
import os
from ...core.config import settings
from ...core.database import get_db
from ...models.file import File as FileModel, FileStatus
from ...services.file_service import process_file
from ...schemas.file import FileResponse as FileResponseSchema, FileList
import asyncio
from fastapi import WebSocketDisconnect
router = APIRouter()
@router.post("/upload", response_model=FileResponseSchema)
async def upload_file(
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Store an uploaded document and queue it for masking.

    The client-supplied filename is reduced to its final path component
    before it is used, the extension is compared against
    ``settings.ALLOWED_EXTENSIONS`` as a real file extension, and the
    upload is streamed to disk in chunks while ``settings.MAX_FILE_SIZE``
    is enforced.

    Raises:
        HTTPException: 400 if no filename was sent or the extension is not
            allowed; 413 if the upload exceeds ``settings.MAX_FILE_SIZE``.
    """
    import uuid  # local import: only needed to build a unique on-disk name

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Never trust the client path: keep only the last component so that
    # names such as "../../etc/passwd" cannot escape the upload folder.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    extension = os.path.splitext(safe_name)[1].lower().lstrip(".")
    allowed = {ext.lower().lstrip(".") for ext in settings.ALLOWED_EXTENSIONS}
    if extension not in allowed:
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
        )

    # Prefix with a random id so two uploads with the same name do not
    # overwrite each other on disk.
    stored_name = f"{uuid.uuid4().hex}_{safe_name}"
    file_path = settings.UPLOAD_FOLDER / stored_name

    # Stream to disk in chunks instead of loading the whole upload in memory.
    chunk_size = 1024 * 1024
    written = 0
    too_large = False
    with open(file_path, "wb") as buffer:
        while True:
            chunk = await file.read(chunk_size)
            if not chunk:
                break
            written += len(chunk)
            if written > settings.MAX_FILE_SIZE:
                too_large = True
                break
            buffer.write(chunk)

    if too_large:
        # Do not leave a truncated file behind.
        os.remove(file_path)
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {settings.MAX_FILE_SIZE} bytes"
        )

    # Create database entry
    db_file = FileModel(
        filename=safe_name,
        original_path=str(file_path),
        status=FileStatus.NOT_STARTED
    )
    db.add(db_file)
    db.commit()
    db.refresh(db_file)

    # Start processing in the background worker
    process_file.delay(str(db_file.id))
    return db_file
@router.get("/files", response_model=List[FileResponseSchema])
def list_files(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    """Return one page of file records.

    Args:
        skip: Number of rows to skip from the start of the result set.
        limit: Maximum number of rows to return.
        db: Database session injected by FastAPI.
    """
    page_query = db.query(FileModel).offset(skip).limit(limit)
    return page_query.all()
@router.get("/files/{file_id}", response_model=FileResponseSchema)
def get_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    """Return the record for ``file_id``.

    Raises:
        HTTPException: 404 if no record has this id.
    """
    record = db.query(FileModel).filter(FileModel.id == file_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="File not found")
    return record
@router.get("/files/{file_id}/download")
async def download_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    """Send the processed version of a file to the client.

    Raises:
        HTTPException: 404 if the record or the processed file is missing;
            400 if processing has not finished successfully.
    """
    file = db.query(FileModel).filter(FileModel.id == file_id).first()
    if not file:
        raise HTTPException(status_code=404, detail="File not found")
    if file.status != FileStatus.SUCCESS:
        raise HTTPException(status_code=400, detail="File is not ready for download")
    # processed_path is a nullable column; os.path.exists(None) raises
    # TypeError, so check for an unset path before touching the filesystem.
    if not file.processed_path or not os.path.exists(file.processed_path):
        raise HTTPException(status_code=404, detail="Processed file not found")
    return FileResponse(
        path=file.processed_path,
        filename=file.filename,
        media_type="application/octet-stream"
    )
@router.websocket("/ws/status/{file_id}")
async def websocket_endpoint(websocket: WebSocket, file_id: str, db: Session = Depends(get_db)):
    """Push the processing status of ``file_id`` once per second.

    The loop ends when the file is missing, when it reaches a terminal
    status (success or failed), or when the client disconnects.
    """
    await websocket.accept()
    try:
        while True:
            # The same Session is reused for every poll. Without expiring it,
            # the identity map keeps the attributes loaded by the first query
            # and the status written by the worker would never be seen.
            db.expire_all()
            file = db.query(FileModel).filter(FileModel.id == file_id).first()
            if not file:
                await websocket.send_json({"error": "File not found"})
                break
            await websocket.send_json({
                "status": file.status,
                "error": file.error_message
            })
            if file.status in [FileStatus.SUCCESS, FileStatus.FAILED]:
                break
            await asyncio.sleep(1)
        # Close explicitly so the client sees a normal closure once the
        # final status has been delivered.
        await websocket.close()
    except WebSocketDisconnect:
        # Client went away; nothing left to send.
        pass

View File

@ -0,0 +1,54 @@
from pydantic_settings import BaseSettings
from typing import Optional
import os
from pathlib import Path
class Settings(BaseSettings):
    """Application configuration, overridable through the environment or ``.env``."""

    # --- API ---
    API_V1_STR: str = "/api/v1"
    PROJECT_NAME: str = "Legal Document Masker API"

    # --- Security ---
    SECRET_KEY: str = "your-secret-key-here"  # Change in production
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

    # --- Database ---
    # Three levels above this file.
    BASE_DIR: Path = Path(__file__).parent.parent.parent
    DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"

    # --- File storage ---
    UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
    PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}

    # --- Celery ---
    CELERY_BROKER_URL: str = "redis://redis:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"

    # --- Ollama API ---
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

    # --- Logging ---
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    class Config:
        case_sensitive = True
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"

    def __init__(self, **kwargs):
        """Load the settings, then make sure every storage directory exists."""
        super().__init__(**kwargs)
        required_dirs = (
            self.UPLOAD_FOLDER,
            self.PROCESSED_FOLDER,
            self.BASE_DIR / "storage",  # parent directory of the SQLite file
        )
        for directory in required_dirs:
            directory.mkdir(parents=True, exist_ok=True)


settings = Settings()

View File

@ -1,5 +1,6 @@
import logging.config import logging.config
from config.settings import settings # from config.settings import settings
from .settings import settings
LOGGING_CONFIG = { LOGGING_CONFIG = {
"version": 1, "version": 1,

View File

View File

@ -0,0 +1,21 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from .config import settings
# Create SQLite engine with check_same_thread=False for FastAPI
# (lets the connection be used from threads other than the one that created it).
engine = create_engine(
    settings.DATABASE_URL,
    connect_args={"check_same_thread": False}
)
# Session factory bound to the engine; commits and flushes are explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class that the ORM models inherit from.
Base = declarative_base()
# Dependency
def get_db():
    """Yield a database session and close it once the consumer is finished."""
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs on normal completion and on error alike.
        session.close()

View File

@ -1,7 +1,7 @@
import os import os
from typing import Optional from typing import Optional
from document_handlers.document_processor import DocumentProcessor from .document_processor import DocumentProcessor
from document_handlers.processors import ( from .processors import (
TxtDocumentProcessor, TxtDocumentProcessor,
DocxDocumentProcessor, DocxDocumentProcessor,
PdfDocumentProcessor, PdfDocumentProcessor,

View File

@ -1,11 +1,13 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Dict from typing import Any, Dict
from prompts.masking_prompts import get_masking_mapping_prompt from ..prompts.masking_prompts import get_masking_mapping_prompt
import logging import logging
import json import json
from services.ollama_client import OllamaClient from ..services.ollama_client import OllamaClient
from config.settings import settings from ...core.config import settings
from utils.json_extractor import LLMJsonExtractor from ..utils.json_extractor import LLMJsonExtractor
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -0,0 +1,6 @@
from .txt_processor import TxtDocumentProcessor
from .docx_processor import DocxDocumentProcessor
from .pdf_processor import PdfDocumentProcessor
from .md_processor import MarkdownDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']

View File

@ -1,13 +1,13 @@
import os import os
import docx import docx
from document_handlers.document_processor import DocumentProcessor from ...document_handlers.document_processor import DocumentProcessor
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_office from magic_pdf.data.read_api import read_local_office
import logging import logging
from services.ollama_client import OllamaClient from ...services.ollama_client import OllamaClient
from config.settings import settings from ...config import settings
from prompts.masking_prompts import get_masking_mapping_prompt from ...prompts.masking_prompts import get_masking_mapping_prompt
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -1,8 +1,8 @@
import os import os
from document_handlers.document_processor import DocumentProcessor from ...document_handlers.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient from ...services.ollama_client import OllamaClient
import logging import logging
from config.settings import settings from ...config import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -1,14 +1,14 @@
import os import os
import PyPDF2 import PyPDF2
from document_handlers.document_processor import DocumentProcessor from ...document_handlers.document_processor import DocumentProcessor
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
import logging import logging
from services.ollama_client import OllamaClient from ...services.ollama_client import OllamaClient
from config.settings import settings from ...config import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -1,8 +1,8 @@
from document_handlers.document_processor import DocumentProcessor from ...document_handlers.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient from ...services.ollama_client import OllamaClient
import logging import logging
from prompts.masking_prompts import get_masking_prompt from ...prompts.masking_prompts import get_masking_prompt
from config.settings import settings from ...config import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor): class TxtDocumentProcessor(DocumentProcessor):

View File

@ -1,12 +1,12 @@
import logging import logging
from document_handlers.document_factory import DocumentProcessorFactory from ..document_handlers.document_factory import DocumentProcessorFactory
from services.ollama_client import OllamaClient from ..services.ollama_client import OllamaClient
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class DocumentService: class DocumentService:
def __init__(self, ollama_client: OllamaClient): def __init__(self):
self.ollama_client = ollama_client pass
def process_document(self, input_path: str, output_path: str) -> bool: def process_document(self, input_path: str, output_path: str) -> bool:
try: try:

33
backend/app/main.py Normal file
View File

@ -0,0 +1,33 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .core.config import settings
from .api.endpoints import files
from .core.database import engine, Base
# Create database tables (for every model registered on Base) at import time
Base.metadata.create_all(bind=engine)
# The OpenAPI schema is served under the versioned API prefix.
app = FastAPI(
    title=settings.PROJECT_NAME,
    openapi_url=f"{settings.API_V1_STR}/openapi.json"
)
# Set up CORS
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended before deploying.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Include routers
# The router's own paths are appended to this prefix, so its "/files" route
# is served at "<API_V1_STR>/files/files".
app.include_router(
    files.router,
    prefix=f"{settings.API_V1_STR}/files",
    tags=["files"]
)
@app.get("/")
async def root():
    """Return a static welcome payload for the API root."""
    greeting = {"message": "Welcome to Legal Document Masker API"}
    return greeting

View File

@ -0,0 +1,22 @@
from sqlalchemy import Column, String, DateTime, Text
from datetime import datetime
import uuid
from ..core.database import Base
class FileStatus(str):
    """Namespace of the status strings stored in ``File.status``.

    The attributes are plain ``str`` constants, not enum members.
    """

    NOT_STARTED = "not_started"  # default for a new record
    PROCESSING = "processing"    # set by the worker when it starts
    SUCCESS = "success"          # terminal: processing finished
    FAILED = "failed"            # terminal: processing raised an error
class File(Base):
    """ORM model for one uploaded document and its processing state."""

    __tablename__ = "files"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # Name of the uploaded file.
    filename = Column(String(255), nullable=False)
    # Location of the stored upload.
    original_path = Column(String(255), nullable=False)
    # Location of the processed output; nullable.
    processed_path = Column(String(255))
    # One of the FileStatus string constants.
    status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
    # Error text for a failed run; nullable.
    error_message = Column(Text)
    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)

View File

@ -0,0 +1,21 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional
from uuid import UUID
class FileBase(BaseModel):
    """Fields shared by every file schema."""

    filename: str
    status: str
    # Optional; defaults to None.
    error_message: Optional[str] = None
class FileResponse(FileBase):
    """File record as returned by the API, with id and timestamps added."""

    id: UUID
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow building the schema from ORM objects via attribute access.
        from_attributes = True
class FileList(BaseModel):
    """A list of file records together with a total count."""

    files: list[FileResponse]
    total: int

View File

@ -0,0 +1,54 @@
from celery import Celery
from ..core.config import settings
from ..models.file import File, FileStatus
from sqlalchemy.orm import Session
from ..core.database import SessionLocal
import sys
import os
from ..core.services.document_service import DocumentService
from pathlib import Path
# Celery application for background document processing; the broker and the
# result backend URLs both come from the application settings.
celery = Celery(
    'file_service',
    broker=settings.CELERY_BROKER_URL,
    backend=settings.CELERY_RESULT_BACKEND
)
@celery.task
def process_file(file_id: str):
    """Mask the document behind ``file_id`` and record the outcome.

    The record moves NOT_STARTED -> PROCESSING -> SUCCESS/FAILED. On failure
    the error text is stored on the record and the exception is re-raised so
    that Celery also marks the task as failed.

    Args:
        file_id: Primary key of the ``File`` row to process. A missing row
            is ignored silently.
    """
    db = SessionLocal()
    try:
        file = db.query(File).filter(File.id == file_id).first()
        if not file:
            return

        # Update status to processing
        file.status = FileStatus.PROCESSING
        db.commit()

        try:
            process_service = DocumentService()

            # Determine output path
            input_path = Path(file.original_path)
            output_filename = f"processed_{input_path.name}"
            output_path = str(settings.PROCESSED_FOLDER / output_filename)

            # process_document is annotated as returning bool; an explicit
            # False must not be recorded as a successful run.
            succeeded = process_service.process_document(file.original_path, output_path)
            if succeeded is False:
                raise RuntimeError(f"Document processing failed for {input_path.name}")

            # Update file record with processed path
            file.processed_path = output_path
            file.status = FileStatus.SUCCESS
            db.commit()
        except Exception as e:
            # Discard any half-applied changes (e.g. a failed commit) so the
            # failure itself can be persisted.
            db.rollback()
            file.status = FileStatus.FAILED
            file.error_message = str(e)
            db.commit()
            raise
    finally:
        db.close()

View File

@ -0,0 +1,37 @@
version: '3.8'

services:
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # The SQLite database lives inside storage/ (see DATABASE_URL in the
      # app settings), so this single mount persists uploads, processed
      # files and the database. A separate bind mount for
      # ./legal_doc_masker.db pointed at a path the app never uses.
      - ./storage:/app/storage
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis

  celery_worker:
    build: .
    command: celery -A app.services.file_service worker --loglevel=info
    volumes:
      # Shared with the api service so the worker sees uploads and the database.
      - ./storage:/app/storage
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
      - api

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"

6
backend/package-lock.json generated Normal file
View File

@ -0,0 +1,6 @@
{
"name": "backend",
"lockfileVersion": 3,
"requires": true,
"packages": {}
}

31
backend/requirements.txt Normal file
View File

@ -0,0 +1,31 @@
# FastAPI and server
fastapi>=0.104.0
uvicorn>=0.24.0
python-multipart>=0.0.6
websockets>=12.0
# Database
sqlalchemy>=2.0.0
alembic>=1.12.0
# Background tasks
celery>=5.3.0
redis>=5.0.0
# Security
python-jose[cryptography]>=3.3.0
passlib[bcrypt]>=1.7.4
python-dotenv>=1.0.0
# Testing
pytest>=7.4.0
httpx>=0.25.0
# Existing project dependencies
pydantic-settings>=2.0.0
watchdog==2.1.6
requests==2.28.1
python-docx>=0.8.11
PyPDF2>=3.0.0
pandas>=2.0.0
magic-pdf[full]

View File

@ -1,2 +0,0 @@
rm ./doc_src/*.md
cp ./doc/*.md ./doc_src/

55
frontend/README.md Normal file
View File

@ -0,0 +1,55 @@
# Legal Document Masker Frontend
This is the frontend application for the Legal Document Masker service. It provides a user interface for uploading legal documents, monitoring their processing status, and downloading the masked versions.
## Features
- Drag and drop file upload
- Real-time status updates
- File list with processing status
- Multi-file selection and download
- Modern Material-UI interface
## Prerequisites
- Node.js (v14 or higher)
- npm (v6 or higher)
## Installation
1. Install dependencies:
```bash
npm install
```
2. Start the development server:
```bash
npm start
```
The application will be available at http://localhost:3000
## Development
The frontend is built with:
- React 18
- TypeScript
- Material-UI
- React Query for data fetching
- React Dropzone for file uploads
## Building for Production
To create a production build:
```bash
npm run build
```
The build artifacts will be stored in the `build/` directory.
## Environment Variables
The following environment variables can be configured:
- `REACT_APP_API_URL`: The URL of the backend API (default: http://localhost:8000/api/v1)

16946
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

50
frontend/package.json Normal file
View File

@ -0,0 +1,50 @@
{
"name": "legal-doc-masker-frontend",
"version": "0.1.0",
"private": true,
"dependencies": {
"@emotion/react": "^11.11.3",
"@emotion/styled": "^11.11.0",
"@mui/icons-material": "^5.15.10",
"@mui/material": "^5.15.10",
"@testing-library/jest-dom": "^5.17.0",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"@types/jest": "^27.5.2",
"@types/node": "^16.18.80",
"@types/react": "^18.2.55",
"@types/react-dom": "^18.2.19",
"axios": "^1.6.7",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-dropzone": "^14.2.3",
"react-query": "^3.39.3",
"react-scripts": "5.0.1",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
},
"scripts": {
"start": "react-scripts start",
"build": "react-scripts build",
"test": "react-scripts test",
"eject": "react-scripts eject"
},
"eslintConfig": {
"extends": [
"react-app",
"react-app/jest"
]
},
"browserslist": {
"production": [
">0.2%",
"not dead",
"not op_mini all"
],
"development": [
"last 1 chrome version",
"last 1 firefox version",
"last 1 safari version"
]
}
}

View File

@ -0,0 +1,20 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="theme-color" content="#000000" />
<meta
name="description"
content="Legal Document Masker - Upload and process legal documents"
/>
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
<link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
<title>Legal Document Masker</title>
</head>
<body>
<noscript>You need to enable JavaScript to run this app.</noscript>
<div id="root"></div>
</body>
</html>

View File

@ -0,0 +1,15 @@
{
"short_name": "Legal Doc Masker",
"name": "Legal Document Masker",
"icons": [
{
"src": "favicon.ico",
"sizes": "64x64 32x32 24x24 16x16",
"type": "image/x-icon"
}
],
"start_url": ".",
"display": "standalone",
"theme_color": "#000000",
"background_color": "#ffffff"
}

58
frontend/src/App.tsx Normal file
View File

@ -0,0 +1,58 @@
import React, { useEffect, useState } from 'react';
import { Container, Typography, Box } from '@mui/material';
import { useQuery, useQueryClient } from 'react-query';
import FileUpload from './components/FileUpload';
import FileList from './components/FileList';
import { File } from './types/file';
import { api } from './services/api';
/**
 * Root component: polls the backend for the file list and renders the
 * upload area above the file table.
 */
function App() {
  const queryClient = useQueryClient();
  const [files, setFiles] = useState<File[]>([]);

  // Poll every 5 seconds
  const filesQuery = useQuery<File[]>('files', api.listFiles, {
    refetchInterval: 5000,
  });
  const { data } = filesQuery;

  // Mirror the latest successful response into local state.
  useEffect(() => {
    if (!data) {
      return;
    }
    setFiles(data);
  }, [data]);

  // Force an immediate refetch instead of waiting for the next poll.
  const refreshFiles = () => {
    queryClient.invalidateQueries('files');
  };

  if (filesQuery.isLoading) {
    return (
      <Container>
        <Typography>Loading...</Typography>
      </Container>
    );
  }

  if (filesQuery.error) {
    return (
      <Container>
        <Typography color="error">Error loading files</Typography>
      </Container>
    );
  }

  return (
    <Container maxWidth="lg">
      <Box sx={{ my: 4 }}>
        <Typography variant="h4" component="h1" gutterBottom>
          Legal Document Masker
        </Typography>
        <Box sx={{ mb: 4 }}>
          <FileUpload onUploadComplete={refreshFiles} />
        </Box>
        <FileList files={files} onFileStatusChange={refreshFiles} />
      </Box>
    </Container>
  );
}
export default App;

View File

@ -0,0 +1,144 @@
import React, { useState } from 'react';
import {
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
IconButton,
Checkbox,
Button,
Chip,
} from '@mui/material';
import { Download as DownloadIcon } from '@mui/icons-material';
import { File, FileStatus } from '../types/file';
import { api } from '../services/api';
interface FileListProps {
files: File[];
onFileStatusChange: () => void;
}
/**
 * Table of files with per-row and bulk download.
 *
 * Only files whose status is SUCCESS can be downloaded; the bulk action
 * skips selected files that are not ready instead of requesting them.
 */
const FileList: React.FC<FileListProps> = ({ files, onFileStatusChange }) => {
  const [selectedFiles, setSelectedFiles] = useState<string[]>([]);

  // Toggle one file id in the selection.
  const handleSelectFile = (fileId: string) => {
    setSelectedFiles((prev) =>
      prev.includes(fileId)
        ? prev.filter((id) => id !== fileId)
        : [...prev, fileId]
    );
  };

  // Select every file, or clear the selection when all are already selected.
  const handleSelectAll = () => {
    setSelectedFiles((prev) =>
      prev.length === files.length ? [] : files.map((file) => file.id)
    );
  };

  // Fetch the processed file as a blob and hand it to the browser through a
  // temporary <a download> element.
  const handleDownload = async (fileId: string) => {
    try {
      const blob = await api.downloadFile(fileId);
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = files.find((f) => f.id === fileId)?.filename || 'downloaded-file';
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } catch (error) {
      console.error('Error downloading file:', error);
    }
  };

  // Selected files that are actually ready; the download endpoint is only
  // called for files whose status is SUCCESS.
  const downloadableIds = files
    .filter(
      (file) =>
        file.status === FileStatus.SUCCESS && selectedFiles.includes(file.id)
    )
    .map((file) => file.id);

  const handleDownloadSelected = async () => {
    for (const fileId of downloadableIds) {
      await handleDownload(fileId);
    }
  };

  const getStatusColor = (status: FileStatus) => {
    switch (status) {
      case FileStatus.SUCCESS:
        return 'success';
      case FileStatus.FAILED:
        return 'error';
      case FileStatus.PROCESSING:
        return 'warning';
      default:
        return 'default';
    }
  };

  // An empty list must not render as "all selected" (0 === 0).
  const allSelected = files.length > 0 && selectedFiles.length === files.length;
  const someSelected = selectedFiles.length > 0 && selectedFiles.length < files.length;

  return (
    <div>
      <div style={{ marginBottom: '1rem' }}>
        <Button
          variant="contained"
          color="primary"
          onClick={handleDownloadSelected}
          disabled={downloadableIds.length === 0}
        >
          Download Selected
        </Button>
      </div>
      <TableContainer component={Paper}>
        <Table>
          <TableHead>
            <TableRow>
              <TableCell padding="checkbox">
                <Checkbox
                  checked={allSelected}
                  indeterminate={someSelected}
                  onChange={handleSelectAll}
                />
              </TableCell>
              <TableCell>Filename</TableCell>
              <TableCell>Status</TableCell>
              <TableCell>Created At</TableCell>
              <TableCell>Actions</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {files.map((file) => (
              <TableRow key={file.id}>
                <TableCell padding="checkbox">
                  <Checkbox
                    checked={selectedFiles.includes(file.id)}
                    onChange={() => handleSelectFile(file.id)}
                  />
                </TableCell>
                <TableCell>{file.filename}</TableCell>
                <TableCell>
                  <Chip
                    label={file.status}
                    color={getStatusColor(file.status) as any}
                    size="small"
                  />
                </TableCell>
                <TableCell>
                  {new Date(file.created_at).toLocaleString()}
                </TableCell>
                <TableCell>
                  {file.status === FileStatus.SUCCESS && (
                    <IconButton
                      onClick={() => handleDownload(file.id)}
                      size="small"
                    >
                      <DownloadIcon />
                    </IconButton>
                  )}
                </TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    </div>
  );
};
export default FileList;

View File

@ -0,0 +1,66 @@
import React, { useCallback } from 'react';
import { useDropzone } from 'react-dropzone';
import { Box, Typography, CircularProgress } from '@mui/material';
import { api } from '../services/api';
interface FileUploadProps {
onUploadComplete: () => void;
}
/**
 * Drag-and-drop upload area. Files are uploaded one after another; the
 * parent is notified afterwards so it can refresh the file list.
 */
const FileUpload: React.FC<FileUploadProps> = ({ onUploadComplete }) => {
  const [isUploading, setIsUploading] = React.useState(false);

  const onDrop = useCallback(async (acceptedFiles: File[]) => {
    // A drop made only of rejected files arrives with nothing to upload.
    if (acceptedFiles.length === 0) {
      return;
    }
    setIsUploading(true);
    try {
      for (const file of acceptedFiles) {
        await api.uploadFile(file);
      }
    } catch (error) {
      console.error('Error uploading files:', error);
    } finally {
      setIsUploading(false);
      // Refresh even after a failure: files uploaded before the error
      // should still show up in the list.
      onUploadComplete();
    }
  }, [onUploadComplete]);

  const { getRootProps, getInputProps, isDragActive } = useDropzone({
    onDrop,
    // Ignore new drops while a batch is still uploading.
    disabled: isUploading,
    accept: {
      'application/pdf': ['.pdf'],
      'application/msword': ['.doc'],
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
      'text/markdown': ['.md'],
    },
  });

  return (
    <Box
      {...getRootProps()}
      sx={{
        border: '2px dashed #ccc',
        borderRadius: 2,
        p: 3,
        textAlign: 'center',
        cursor: 'pointer',
        bgcolor: isDragActive ? 'action.hover' : 'background.paper',
        '&:hover': {
          bgcolor: 'action.hover',
        },
      }}
    >
      <input {...getInputProps()} />
      {isUploading ? (
        <CircularProgress />
      ) : (
        <Typography>
          {isDragActive
            ? 'Drop the files here...'
            : 'Drag and drop files here, or click to select files'}
        </Typography>
      )}
    </Box>
  );
};
export default FileUpload;

29
frontend/src/index.tsx Normal file
View File

@ -0,0 +1,29 @@
import React from 'react';
import ReactDOM from 'react-dom/client';
import { QueryClient, QueryClientProvider } from 'react-query';
import { ThemeProvider, createTheme } from '@mui/material';
import CssBaseline from '@mui/material/CssBaseline';
import App from './App';
// Single react-query client shared by the whole application.
const queryClient = new QueryClient();
// Material-UI theme: light palette, everything else left at the defaults.
const theme = createTheme({
  palette: {
    mode: 'light',
  },
});
// Mount point: the element with id "root" in the HTML page.
const root = ReactDOM.createRoot(
  document.getElementById('root') as HTMLElement
);
// Provider order: react-query outermost, then the theme, then the app.
root.render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <ThemeProvider theme={theme}>
        <CssBaseline />
        <App />
      </ThemeProvider>
    </QueryClientProvider>
  </React.StrictMode>
);

View File

@ -0,0 +1,34 @@
import axios from 'axios';
import { File, FileUploadResponse } from '../types/file';
// Base URL of the backend API. It can be overridden at build time with the
// REACT_APP_API_URL environment variable; otherwise it falls back to the
// local development server.
const API_BASE_URL =
  process.env.REACT_APP_API_URL || 'http://localhost:8000/api/v1';

/** Thin axios wrapper around the file endpoints of the backend. */
export const api = {
  /** Upload one file as multipart form data under the field name "file". */
  uploadFile: async (file: globalThis.File): Promise<FileUploadResponse> => {
    const formData = new FormData();
    formData.append('file', file);
    const response = await axios.post(`${API_BASE_URL}/files/upload`, formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    });
    return response.data;
  },

  /** Fetch the list of file records. */
  listFiles: async (): Promise<File[]> => {
    const response = await axios.get(`${API_BASE_URL}/files/files`);
    return response.data;
  },

  /** Fetch a single file record by id. */
  getFile: async (fileId: string): Promise<File> => {
    const response = await axios.get(`${API_BASE_URL}/files/files/${fileId}`);
    return response.data;
  },

  /** Download the processed file as a Blob. */
  downloadFile: async (fileId: string): Promise<Blob> => {
    const response = await axios.get(`${API_BASE_URL}/files/files/${fileId}/download`, {
      responseType: 'blob',
    });
    return response.data;
  },
};

View File

@ -0,0 +1,23 @@
/** Processing states of a file, as string values. */
export enum FileStatus {
  NOT_STARTED = "not_started",
  PROCESSING = "processing",
  SUCCESS = "success",
  FAILED = "failed"
}

/** A file record as used by the file list. */
export interface File {
  id: string;
  filename: string;
  status: FileStatus;
  // Optional error text for the file.
  error_message?: string;
  // Timestamps as strings.
  created_at: string;
  updated_at: string;
}

/** Result type of the upload call; same fields as File without error_message. */
export interface FileUploadResponse {
  id: string;
  filename: string;
  status: FileStatus;
  created_at: string;
  updated_at: string;
}

26
frontend/tsconfig.json Normal file
View File

@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "es5",
"lib": [
"dom",
"dom.iterable",
"esnext"
],
"allowJs": true,
"skipLibCheck": true,
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noFallthroughCasesInSwitch": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx"
},
"include": [
"src"
]
}

View File

@ -1,31 +0,0 @@
# settings.py
from pydantic_settings import BaseSettings
from typing import Optional
class Settings(BaseSettings):
    """Configuration for the directory-monitoring variant of the masker."""

    # --- Storage paths ---
    OBJECT_STORAGE_PATH: str = ""
    TARGET_DIRECTORY_PATH: str = ""

    # --- Ollama API ---
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

    # --- File monitoring ---
    MONITOR_INTERVAL: int = 5

    # --- Logging ---
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"


# Module-level instance shared by importers.
settings = Settings()

View File

@ -1,6 +0,0 @@
from document_handlers.processors.txt_processor import TxtDocumentProcessor
from document_handlers.processors.docx_processor import DocxDocumentProcessor
from document_handlers.processors.pdf_processor import PdfDocumentProcessor
from document_handlers.processors.md_processor import MarkdownDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']

View File

@ -1,22 +0,0 @@
from config.logging_config import setup_logging
def main():
    """Configure logging, then watch the input directory for new files."""
    # Logging is configured before the remaining imports so that loggers
    # created while importing them use this configuration.
    setup_logging()

    import logging
    from config.settings import settings
    from services.file_monitor import FileMonitor

    logger = logging.getLogger(__name__)
    source_dir = settings.OBJECT_STORAGE_PATH
    target_dir = settings.TARGET_DIRECTORY_PATH

    logger.info("Starting the application")
    logger.info(f"Monitoring directory: {source_dir}")
    logger.info(f"Target directory: {target_dir}")

    # Build the monitor and enter its polling loop.
    monitor = FileMonitor(source_dir, target_dir)
    monitor.start_monitoring()
if __name__ == "__main__":
main()

View File

@ -1,54 +0,0 @@
import logging
import os
from services.document_service import DocumentService
from services.ollama_client import OllamaClient
from config.settings import settings
logger = logging.getLogger(__name__)
class FileMonitor:
    """Poll a directory and run every newly appearing file through the masker."""

    def __init__(self, input_directory: str, output_directory: str):
        """Remember both directories and build the document service.

        Args:
            input_directory: Directory that is polled for new files.
            output_directory: Directory that receives the processed files.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory

        # The Ollama client is configured from settings and injected into
        # the document service.
        client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        self.document_service = DocumentService(ollama_client=client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file; errors are logged, never raised."""
        try:
            # The output keeps the input's base name.
            filename = os.path.basename(file_path)
            destination = os.path.join(self.output_directory, filename)

            logger.info(f"Processing file: {filename}")
            self.document_service.process_document(file_path, destination)
            logger.info(f"File processed successfully: {filename}")
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")

    def start_monitoring(self):
        """Poll the input directory once per second, forever."""
        import time

        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)

        # Files present at start-up are treated as already handled.
        known = set(os.listdir(self.input_directory))
        while True:
            time.sleep(1)  # Check every second
            snapshot = set(os.listdir(self.input_directory))
            for name in snapshot - known:
                logger.info(f"New file found: {name}")
                self.process_new_file(os.path.join(self.input_directory, name))
            known = snapshot