Compare commits
No commits in common. "345fd05a2b7881c108b6dc774686bbc89b4412ce" and "47e78c35bb11018180ba2a73d3a4d6eb78cdfa10" have entirely different histories.
345fd05a2b
...
47e78c35bb
|
|
@ -70,7 +70,4 @@ app.log
|
||||||
__pycache__
|
__pycache__
|
||||||
data/doc_dest
|
data/doc_dest
|
||||||
data/doc_src
|
data/doc_src
|
||||||
data/doc_intermediate
|
data/doc_intermediate
|
||||||
|
|
||||||
node_modules
|
|
||||||
backend/storage/
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libreoffice \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Upgrade magic-pdf separately; --no-cache-dir keeps pip's download cache out of
# the image layer, and the quotes stop the shell from treating [full] as a glob.
RUN pip install --no-cache-dir -U "magic-pdf[full]"

# Copy the rest of the application
COPY . .

# Create storage directories
RUN mkdir -p storage/uploads storage/processed

# Expose the port the app runs on
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
||||||
|
|
@ -1,103 +0,0 @@
|
||||||
# Legal Document Masker API
|
|
||||||
|
|
||||||
This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
- Python 3.9+
|
|
||||||
- Redis (for Celery)
|
|
||||||
|
|
||||||
## File Storage
|
|
||||||
|
|
||||||
Files are stored in the following structure:
|
|
||||||
```
|
|
||||||
backend/
|
|
||||||
├── storage/
|
|
||||||
│ ├── uploads/ # Original uploaded files
|
|
||||||
│ └── processed/ # Masked/processed files
|
|
||||||
```
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
### Option 1: Local Development
|
|
||||||
|
|
||||||
1. Create a virtual environment:
|
|
||||||
```bash
|
|
||||||
python -m venv venv
|
|
||||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Install dependencies:
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Set up environment variables:
|
|
||||||
Create a `.env` file in the backend directory with the following variables:
|
|
||||||
```env
|
|
||||||
SECRET_KEY=your-secret-key-here
|
|
||||||
```
|
|
||||||
|
|
||||||
The database (SQLite) will be automatically created when you first run the application.
|
|
||||||
|
|
||||||
4. Start Redis (required for Celery):
|
|
||||||
```bash
|
|
||||||
redis-server
|
|
||||||
```
|
|
||||||
|
|
||||||
5. Start Celery worker:
|
|
||||||
```bash
|
|
||||||
celery -A app.services.file_service worker --loglevel=info
|
|
||||||
```
|
|
||||||
|
|
||||||
6. Start the FastAPI server:
|
|
||||||
```bash
|
|
||||||
uvicorn app.main:app --reload
|
|
||||||
```
|
|
||||||
|
|
||||||
### Option 2: Docker Deployment
|
|
||||||
|
|
||||||
1. Build and start the services:
|
|
||||||
```bash
|
|
||||||
docker-compose up --build
|
|
||||||
```
|
|
||||||
|
|
||||||
This will start:
|
|
||||||
- FastAPI server on port 8000
|
|
||||||
- Celery worker for background processing
|
|
||||||
- Redis for task queue
|
|
||||||
|
|
||||||
## API Documentation
|
|
||||||
|
|
||||||
Once the server is running, you can access:
|
|
||||||
- Swagger UI: `http://localhost:8000/docs`
|
|
||||||
- ReDoc: `http://localhost:8000/redoc`
|
|
||||||
|
|
||||||
## API Endpoints
|
|
||||||
|
|
||||||
- `POST /api/v1/files/upload` - Upload a new file
|
|
||||||
- `GET /api/v1/files/files` - List all files
|
|
||||||
- `GET /api/v1/files/files/{file_id}` - Get file details
|
|
||||||
- `GET /api/v1/files/files/{file_id}/download` - Download processed file
|
|
||||||
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
### Running Tests
|
|
||||||
```bash
|
|
||||||
pytest
|
|
||||||
```
|
|
||||||
|
|
||||||
### Code Style
|
|
||||||
The project uses Black for code formatting:
|
|
||||||
```bash
|
|
||||||
black .
|
|
||||||
```
|
|
||||||
|
|
||||||
### Docker Commands
|
|
||||||
|
|
||||||
- Start services: `docker-compose up`
|
|
||||||
- Start in background: `docker-compose up -d`
|
|
||||||
- Stop services: `docker-compose down`
|
|
||||||
- View logs: `docker-compose logs -f`
|
|
||||||
- Rebuild: `docker-compose up --build`
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, WebSocket, Response
|
|
||||||
from fastapi.responses import FileResponse
|
|
||||||
from sqlalchemy.orm import Session
|
|
||||||
from typing import List
|
|
||||||
import os
|
|
||||||
from ...core.config import settings
|
|
||||||
from ...core.database import get_db
|
|
||||||
from ...models.file import File as FileModel, FileStatus
|
|
||||||
from ...services.file_service import process_file
|
|
||||||
from ...schemas.file import FileResponse as FileResponseSchema, FileList
|
|
||||||
import asyncio
|
|
||||||
from fastapi import WebSocketDisconnect
|
|
||||||
|
|
||||||
router = APIRouter()
|
|
||||||
|
|
||||||
@router.post("/upload", response_model=FileResponseSchema)
async def upload_file(
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Store an uploaded document, record it in the database and queue processing.

    Args:
        file: The multipart upload sent by the client.
        db: Database session provided by the ``get_db`` dependency.

    Returns:
        The newly created file record (serialized via ``FileResponseSchema``).

    Raises:
        HTTPException: 400 if no usable filename is supplied or the extension
            is not in ``settings.ALLOWED_EXTENSIONS``; 413 if the payload is
            larger than ``settings.MAX_FILE_SIZE``.
    """
    import uuid  # local import: only needed to build a unique on-disk name

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The filename comes from the client and is untrusted: keep only the last
    # path component so values like "../../etc/passwd" cannot escape the
    # upload folder. Backslashes are normalised so Windows-style paths are
    # stripped as well.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    if not safe_name or safe_name in {".", ".."}:
        raise HTTPException(status_code=400, detail="Invalid filename")

    # Compare the real extension (text after the last dot) instead of a bare
    # suffix match, which would accept names such as "notapdf".
    allowed = {ext.lower().lstrip(".") for ext in settings.ALLOWED_EXTENSIONS}
    extension = os.path.splitext(safe_name)[1].lower().lstrip(".")
    if extension not in allowed:
        raise HTTPException(
            status_code=400,
            detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
        )

    # Read before touching the disk so a rejected upload leaves no file behind.
    content = await file.read()
    if len(content) > settings.MAX_FILE_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {settings.MAX_FILE_SIZE} bytes"
        )

    # Save file under a unique name so two uploads with the same filename do
    # not overwrite each other; the record keeps the original name for display.
    file_path = settings.UPLOAD_FOLDER / f"{uuid.uuid4().hex}_{safe_name}"
    with open(file_path, "wb") as buffer:
        buffer.write(content)

    # Create database entry
    db_file = FileModel(
        filename=safe_name,
        original_path=str(file_path),
        status=FileStatus.NOT_STARTED
    )
    db.add(db_file)
    db.commit()
    db.refresh(db_file)

    # Start processing
    process_file.delay(str(db_file.id))

    return db_file
|
|
||||||
|
|
||||||
@router.get("/files", response_model=List[FileResponseSchema])
def list_files(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db)
):
    """Return one page of file records.

    Args:
        skip: Number of records to skip from the start of the result set.
        limit: Maximum number of records to return.
        db: Database session provided by the ``get_db`` dependency.
    """
    page_query = db.query(FileModel).offset(skip).limit(limit)
    return page_query.all()
|
|
||||||
|
|
||||||
@router.get("/files/{file_id}", response_model=FileResponseSchema)
def get_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    """Look up a single file record by id.

    Raises:
        HTTPException: 404 when no record has the given id.
    """
    record = db.query(FileModel).filter(FileModel.id == file_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="File not found")
    return record
|
|
||||||
|
|
||||||
@router.get("/files/{file_id}/download")
async def download_file(
    file_id: str,
    db: Session = Depends(get_db)
):
    """Send the processed version of a file to the client.

    Args:
        file_id: Identifier of the file record.
        db: Database session provided by the ``get_db`` dependency.

    Returns:
        A ``FileResponse`` streaming the processed file under its original
        filename.

    Raises:
        HTTPException: 404 if the record or the processed file is missing;
            400 if processing has not finished successfully.
    """
    file = db.query(FileModel).filter(FileModel.id == file_id).first()
    if not file:
        raise HTTPException(status_code=404, detail="File not found")

    if file.status != FileStatus.SUCCESS:
        raise HTTPException(status_code=400, detail="File is not ready for download")

    # processed_path is a nullable column; os.path.exists(None) raises
    # TypeError, so treat an unset path the same as a missing file.
    processed_path = file.processed_path
    if not processed_path or not os.path.exists(processed_path):
        raise HTTPException(status_code=404, detail="Processed file not found")

    return FileResponse(
        path=processed_path,
        filename=file.filename,
        media_type="application/octet-stream"
    )
|
|
||||||
|
|
||||||
@router.websocket("/ws/status/{file_id}")
async def websocket_endpoint(websocket: WebSocket, file_id: str, db: Session = Depends(get_db)):
    """Push the processing status of a file to the client once per second.

    The loop ends when the file reaches a terminal status (success or failed),
    when the file does not exist, or when the client disconnects.

    Args:
        websocket: The client connection.
        file_id: Identifier of the file record to watch.
        db: Database session provided by the ``get_db`` dependency.
    """
    await websocket.accept()
    try:
        while True:
            # The same session is reused for every poll. Without expiring it,
            # the identity map keeps the attribute values loaded on the first
            # query, so a status change written by the worker would never be
            # observed here.
            db.expire_all()
            file = db.query(FileModel).filter(FileModel.id == file_id).first()
            if not file:
                await websocket.send_json({"error": "File not found"})
                break

            await websocket.send_json({
                "status": file.status,
                "error": file.error_message
            })

            if file.status in [FileStatus.SUCCESS, FileStatus.FAILED]:
                break

            await asyncio.sleep(1)

        # Nothing more will be sent: close explicitly so the client gets a
        # clean close frame.
        await websocket.close()
    except WebSocketDisconnect:
        pass
|
|
||||||
|
|
@ -1,54 +0,0 @@
|
||||||
from pydantic_settings import BaseSettings
|
|
||||||
from typing import Optional
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
    """Application configuration, overridable from the environment or `.env`."""

    # --- API ---
    API_V1_STR: str = "/api/v1"
    PROJECT_NAME: str = "Legal Document Masker API"

    # --- Security ---
    SECRET_KEY: str = "your-secret-key-here"  # Change in production
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

    # --- Database ---
    # BASE_DIR is three levels above this module's file.
    BASE_DIR: Path = Path(__file__).parent.parent.parent
    DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"

    # --- File storage ---
    UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
    PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
    MAX_FILE_SIZE: int = 50 * 1024 * 1024  # 50MB
    ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}

    # --- Celery ---
    CELERY_BROKER_URL: str = "redis://redis:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"

    # --- Ollama API ---
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

    # --- Logging ---
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    class Config:
        case_sensitive = True
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"

    def __init__(self, **kwargs):
        """Load the settings, then make sure the storage directories exist."""
        super().__init__(**kwargs)
        # Upload and processed folders first, then the storage root that also
        # holds the SQLite database file.
        required_dirs = (
            self.UPLOAD_FOLDER,
            self.PROCESSED_FOLDER,
            self.BASE_DIR / "storage",
        )
        for directory in required_dirs:
            directory.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
settings = Settings()
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
from sqlalchemy import create_engine
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
from .config import settings
|
|
||||||
|
|
||||||
# Create SQLite engine with check_same_thread=False for FastAPI
|
|
||||||
engine = create_engine(
|
|
||||||
settings.DATABASE_URL,
|
|
||||||
connect_args={"check_same_thread": False}
|
|
||||||
)
|
|
||||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
||||||
|
|
||||||
Base = declarative_base()
|
|
||||||
|
|
||||||
# Dependency
|
|
||||||
def get_db():
    """Yield a database session and close it once the caller is done with it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs on normal completion and on error alike.
        session.close()
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
from .txt_processor import TxtDocumentProcessor
|
|
||||||
from .docx_processor import DocxDocumentProcessor
|
|
||||||
from .pdf_processor import PdfDocumentProcessor
|
|
||||||
from .md_processor import MarkdownDocumentProcessor
|
|
||||||
|
|
||||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
from fastapi import FastAPI
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
|
||||||
from .core.config import settings
|
|
||||||
from .api.endpoints import files
|
|
||||||
from .core.database import engine, Base
|
|
||||||
|
|
||||||
# Create database tables
|
|
||||||
Base.metadata.create_all(bind=engine)
|
|
||||||
|
|
||||||
app = FastAPI(
|
|
||||||
title=settings.PROJECT_NAME,
|
|
||||||
openapi_url=f"{settings.API_V1_STR}/openapi.json"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Set up CORS
|
|
||||||
app.add_middleware(
|
|
||||||
CORSMiddleware,
|
|
||||||
allow_origins=["*"], # In production, replace with specific origins
|
|
||||||
allow_credentials=True,
|
|
||||||
allow_methods=["*"],
|
|
||||||
allow_headers=["*"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Include routers
|
|
||||||
app.include_router(
|
|
||||||
files.router,
|
|
||||||
prefix=f"{settings.API_V1_STR}/files",
|
|
||||||
tags=["files"]
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/")
async def root():
    """Return a static welcome payload for the API root."""
    greeting = {"message": "Welcome to Legal Document Masker API"}
    return greeting
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
from sqlalchemy import Column, String, DateTime, Text
|
|
||||||
from datetime import datetime
|
|
||||||
import uuid
|
|
||||||
from ..core.database import Base
|
|
||||||
|
|
||||||
class FileStatus(str):
    """Namespace of the status strings stored on a file record.

    The members are plain ``str`` constants, so they compare equal to the raw
    values read back from the database.
    """

    # Record created, processing not yet begun.
    NOT_STARTED = "not_started"
    # Processing is under way.
    PROCESSING = "processing"
    # Terminal states.
    SUCCESS = "success"
    FAILED = "failed"
|
|
||||||
|
|
||||||
class File(Base):
    """ORM model for one uploaded document, stored in the ``files`` table."""

    __tablename__ = "files"

    # Primary key: a UUID4 rendered as a 36-character string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))

    # Name and on-disk locations; processed_path stays NULL until set.
    filename = Column(String(255), nullable=False)
    original_path = Column(String(255), nullable=False)
    processed_path = Column(String(255))

    # One of the FileStatus values, plus an optional error description.
    status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
    error_message = Column(Text)

    # Naive UTC timestamps; updated_at is refreshed on every UPDATE.
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
from pydantic import BaseModel
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Optional
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
class FileBase(BaseModel):
    """Fields shared by the file schemas."""

    # Name of the file.
    filename: str
    # Processing status as a plain string.
    status: str
    # Optional error description; defaults to None.
    error_message: Optional[str] = None
|
|
||||||
|
|
||||||
class FileResponse(FileBase):
    """API representation of a file record, including id and timestamps."""

    id: UUID
    created_at: datetime
    updated_at: datetime

    class Config:
        # Allow building the schema from attribute-bearing objects.
        from_attributes = True
|
|
||||||
|
|
||||||
class FileList(BaseModel):
    """A collection of file responses together with a total count."""

    # The file entries.
    files: list[FileResponse]
    # Count reported alongside the entries.
    total: int
|
|
||||||
|
|
@ -1,54 +0,0 @@
|
||||||
from celery import Celery
|
|
||||||
from ..core.config import settings
|
|
||||||
from ..models.file import File, FileStatus
|
|
||||||
from sqlalchemy.orm import Session
|
|
||||||
from ..core.database import SessionLocal
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from ..core.services.document_service import DocumentService
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
celery = Celery(
|
|
||||||
'file_service',
|
|
||||||
broker=settings.CELERY_BROKER_URL,
|
|
||||||
backend=settings.CELERY_RESULT_BACKEND
|
|
||||||
)
|
|
||||||
|
|
||||||
@celery.task
def process_file(file_id: str):
    """Celery task: run document processing for one file and record the result.

    The record's status is set to PROCESSING before work starts, then to
    SUCCESS (with ``processed_path``) or FAILED (with ``error_message``).

    Args:
        file_id: Identifier of the file record to process. If no such record
            exists the task returns without doing anything.

    Raises:
        Exception: Any error raised during processing is re-raised after the
            FAILED status has been stored, so Celery marks the task as failed.
    """
    db = SessionLocal()
    try:
        record = db.query(File).filter(File.id == file_id).first()
        if not record:
            return

        # Update status to processing
        record.status = FileStatus.PROCESSING
        db.commit()

        try:
            # Process the file using the existing masking system
            process_service = DocumentService()

            # Determine output path
            input_path = Path(record.original_path)
            output_filename = f"processed_{input_path.name}"
            output_path = str(settings.PROCESSED_FOLDER / output_filename)

            # Process document with both input and output paths
            process_service.process_document(record.original_path, output_path)

            # Update file record with processed path
            record.processed_path = output_path
            record.status = FileStatus.SUCCESS
            db.commit()

        except Exception as e:
            # If the failure came from the session itself (e.g. the commit
            # above), the session must be rolled back before it can be used
            # again; otherwise writing the FAILED status would raise and mask
            # the original error.
            db.rollback()
            record.status = FileStatus.FAILED
            record.error_message = str(e)
            db.commit()
            raise

    finally:
        db.close()
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
version: '3.8'

services:
  api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      # The SQLite database is created under storage/, so this single mount
      # persists both the uploaded/processed files and the database. A separate
      # bind mount for ./legal_doc_masker.db is not needed and would make
      # Docker create a directory of that name when the host file is missing.
      - ./storage:/app/storage
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis

  celery_worker:
    build: .
    command: celery -A app.services.file_service worker --loglevel=info
    volumes:
      # Same storage directory as the api service, so the worker sees the
      # uploads and the database.
      - ./storage:/app/storage
    env_file:
      - .env
    environment:
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
    depends_on:
      - redis
      - api

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"name": "backend",
|
|
||||||
"lockfileVersion": 3,
|
|
||||||
"requires": true,
|
|
||||||
"packages": {}
|
|
||||||
}
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
# FastAPI and server
|
|
||||||
fastapi>=0.104.0
|
|
||||||
uvicorn>=0.24.0
|
|
||||||
python-multipart>=0.0.6
|
|
||||||
websockets>=12.0
|
|
||||||
|
|
||||||
# Database
|
|
||||||
sqlalchemy>=2.0.0
|
|
||||||
alembic>=1.12.0
|
|
||||||
|
|
||||||
# Background tasks
|
|
||||||
celery>=5.3.0
|
|
||||||
redis>=5.0.0
|
|
||||||
|
|
||||||
# Security
|
|
||||||
python-jose[cryptography]>=3.3.0
|
|
||||||
passlib[bcrypt]>=1.7.4
|
|
||||||
python-dotenv>=1.0.0
|
|
||||||
|
|
||||||
# Testing
|
|
||||||
pytest>=7.4.0
|
|
||||||
httpx>=0.25.0
|
|
||||||
|
|
||||||
# Existing project dependencies
|
|
||||||
pydantic-settings>=2.0.0
|
|
||||||
watchdog==2.1.6
|
|
||||||
requests==2.28.1
|
|
||||||
python-docx>=0.8.11
|
|
||||||
PyPDF2>=3.0.0
|
|
||||||
pandas>=2.0.0
|
|
||||||
magic-pdf[full]
|
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Replace the markdown files in doc_src with fresh copies from doc.
# -f keeps rm from failing when doc_src holds no .md files yet.
rm -f ./doc_src/*.md
cp ./doc/*.md ./doc_src/
|
||||||
|
|
@ -1,55 +0,0 @@
|
||||||
# Legal Document Masker Frontend
|
|
||||||
|
|
||||||
This is the frontend application for the Legal Document Masker service. It provides a user interface for uploading legal documents, monitoring their processing status, and downloading the masked versions.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- Drag and drop file upload
|
|
||||||
- Real-time status updates
|
|
||||||
- File list with processing status
|
|
||||||
- Multi-file selection and download
|
|
||||||
- Modern Material-UI interface
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
- Node.js (v14 or higher)
|
|
||||||
- npm (v6 or higher)
|
|
||||||
|
|
||||||
## Installation
|
|
||||||
|
|
||||||
1. Install dependencies:
|
|
||||||
```bash
|
|
||||||
npm install
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Start the development server:
|
|
||||||
```bash
|
|
||||||
npm start
|
|
||||||
```
|
|
||||||
|
|
||||||
The application will be available at http://localhost:3000
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
The frontend is built with:
|
|
||||||
- React 18
|
|
||||||
- TypeScript
|
|
||||||
- Material-UI
|
|
||||||
- React Query for data fetching
|
|
||||||
- React Dropzone for file uploads
|
|
||||||
|
|
||||||
## Building for Production
|
|
||||||
|
|
||||||
To create a production build:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npm run build
|
|
||||||
```
|
|
||||||
|
|
||||||
The build artifacts will be stored in the `build/` directory.
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
The following environment variables can be configured:
|
|
||||||
|
|
||||||
- `REACT_APP_API_URL`: The URL of the backend API (default: http://localhost:8000/api/v1)
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,50 +0,0 @@
|
||||||
{
|
|
||||||
"name": "legal-doc-masker-frontend",
|
|
||||||
"version": "0.1.0",
|
|
||||||
"private": true,
|
|
||||||
"dependencies": {
|
|
||||||
"@emotion/react": "^11.11.3",
|
|
||||||
"@emotion/styled": "^11.11.0",
|
|
||||||
"@mui/icons-material": "^5.15.10",
|
|
||||||
"@mui/material": "^5.15.10",
|
|
||||||
"@testing-library/jest-dom": "^5.17.0",
|
|
||||||
"@testing-library/react": "^13.4.0",
|
|
||||||
"@testing-library/user-event": "^13.5.0",
|
|
||||||
"@types/jest": "^27.5.2",
|
|
||||||
"@types/node": "^16.18.80",
|
|
||||||
"@types/react": "^18.2.55",
|
|
||||||
"@types/react-dom": "^18.2.19",
|
|
||||||
"axios": "^1.6.7",
|
|
||||||
"react": "^18.2.0",
|
|
||||||
"react-dom": "^18.2.0",
|
|
||||||
"react-dropzone": "^14.2.3",
|
|
||||||
"react-query": "^3.39.3",
|
|
||||||
"react-scripts": "5.0.1",
|
|
||||||
"typescript": "^4.9.5",
|
|
||||||
"web-vitals": "^2.1.4"
|
|
||||||
},
|
|
||||||
"scripts": {
|
|
||||||
"start": "react-scripts start",
|
|
||||||
"build": "react-scripts build",
|
|
||||||
"test": "react-scripts test",
|
|
||||||
"eject": "react-scripts eject"
|
|
||||||
},
|
|
||||||
"eslintConfig": {
|
|
||||||
"extends": [
|
|
||||||
"react-app",
|
|
||||||
"react-app/jest"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"browserslist": {
|
|
||||||
"production": [
|
|
||||||
">0.2%",
|
|
||||||
"not dead",
|
|
||||||
"not op_mini all"
|
|
||||||
],
|
|
||||||
"development": [
|
|
||||||
"last 1 chrome version",
|
|
||||||
"last 1 firefox version",
|
|
||||||
"last 1 safari version"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,20 +0,0 @@
|
||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8" />
|
|
||||||
<link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
||||||
<meta name="theme-color" content="#000000" />
|
|
||||||
<meta
|
|
||||||
name="description"
|
|
||||||
content="Legal Document Masker - Upload and process legal documents"
|
|
||||||
/>
|
|
||||||
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
|
|
||||||
<link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
|
|
||||||
<title>Legal Document Masker</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<noscript>You need to enable JavaScript to run this app.</noscript>
|
|
||||||
<div id="root"></div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
{
|
|
||||||
"short_name": "Legal Doc Masker",
|
|
||||||
"name": "Legal Document Masker",
|
|
||||||
"icons": [
|
|
||||||
{
|
|
||||||
"src": "favicon.ico",
|
|
||||||
"sizes": "64x64 32x32 24x24 16x16",
|
|
||||||
"type": "image/x-icon"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"start_url": ".",
|
|
||||||
"display": "standalone",
|
|
||||||
"theme_color": "#000000",
|
|
||||||
"background_color": "#ffffff"
|
|
||||||
}
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
||||||
import React, { useEffect, useState } from 'react';
|
|
||||||
import { Container, Typography, Box } from '@mui/material';
|
|
||||||
import { useQuery, useQueryClient } from 'react-query';
|
|
||||||
import FileUpload from './components/FileUpload';
|
|
||||||
import FileList from './components/FileList';
|
|
||||||
import { File } from './types/file';
|
|
||||||
import { api } from './services/api';
|
|
||||||
|
|
||||||
// Top-level page: upload area plus the list of files, refreshed by polling.
function App() {
  const queryClient = useQueryClient();
  const [files, setFiles] = useState<File[]>([]);

  // Poll every 5 seconds
  const filesQuery = useQuery<File[]>('files', api.listFiles, {
    refetchInterval: 5000,
  });
  const { data, isLoading, error } = filesQuery;

  // Mirror the latest query result into local state.
  useEffect(() => {
    if (!data) {
      return;
    }
    setFiles(data);
  }, [data]);

  // Force a refetch of the file list.
  const handleUploadComplete = () => {
    queryClient.invalidateQueries('files');
  };

  if (isLoading) {
    return (
      <Container>
        <Typography>Loading...</Typography>
      </Container>
    );
  }

  if (error) {
    return (
      <Container>
        <Typography color="error">Error loading files</Typography>
      </Container>
    );
  }

  return (
    <Container maxWidth="lg">
      <Box sx={{ my: 4 }}>
        <Typography variant="h4" component="h1" gutterBottom>
          Legal Document Masker
        </Typography>
        <Box sx={{ mb: 4 }}>
          <FileUpload onUploadComplete={handleUploadComplete} />
        </Box>
        <FileList files={files} onFileStatusChange={handleUploadComplete} />
      </Box>
    </Container>
  );
}
|
|
||||||
|
|
||||||
export default App;
|
|
||||||
|
|
@ -1,144 +0,0 @@
|
||||||
import React, { useState } from 'react';
|
|
||||||
import {
|
|
||||||
Table,
|
|
||||||
TableBody,
|
|
||||||
TableCell,
|
|
||||||
TableContainer,
|
|
||||||
TableHead,
|
|
||||||
TableRow,
|
|
||||||
Paper,
|
|
||||||
IconButton,
|
|
||||||
Checkbox,
|
|
||||||
Button,
|
|
||||||
Chip,
|
|
||||||
} from '@mui/material';
|
|
||||||
import { Download as DownloadIcon } from '@mui/icons-material';
|
|
||||||
import { File, FileStatus } from '../types/file';
|
|
||||||
import { api } from '../services/api';
|
|
||||||
|
|
||||||
interface FileListProps {
|
|
||||||
files: File[];
|
|
||||||
onFileStatusChange: () => void;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Table of files with per-row and bulk download of processed documents.
const FileList: React.FC<FileListProps> = ({ files, onFileStatusChange }) => {
  const [selectedFiles, setSelectedFiles] = useState<string[]>([]);

  // With an empty list, 0 === 0 would otherwise render the header checkbox as
  // checked even though nothing can be selected.
  const allSelected = files.length > 0 && selectedFiles.length === files.length;
  const someSelected = selectedFiles.length > 0 && selectedFiles.length < files.length;

  // Toggle a single row in the selection.
  const handleSelectFile = (fileId: string) => {
    setSelectedFiles((prev) =>
      prev.includes(fileId)
        ? prev.filter((id) => id !== fileId)
        : [...prev, fileId]
    );
  };

  // Select every row, or clear the selection when everything is selected.
  const handleSelectAll = () => {
    setSelectedFiles((prev) =>
      prev.length === files.length ? [] : files.map((file) => file.id)
    );
  };

  // Fetch the processed file and trigger a browser download via a temporary link.
  const handleDownload = async (fileId: string) => {
    try {
      const blob = await api.downloadFile(fileId);
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = files.find((f) => f.id === fileId)?.filename || 'downloaded-file';
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } catch (error) {
      console.error('Error downloading file:', error);
    }
  };

  // Only successfully processed files can be downloaded; requesting the others
  // would just produce failed requests, so they are skipped.
  const handleDownloadSelected = async () => {
    const downloadable = files.filter(
      (file) => selectedFiles.includes(file.id) && file.status === FileStatus.SUCCESS
    );
    for (const file of downloadable) {
      await handleDownload(file.id);
    }
  };

  // Map a processing status to the Chip color used to display it.
  const getStatusColor = (
    status: FileStatus
  ): 'success' | 'error' | 'warning' | 'default' => {
    switch (status) {
      case FileStatus.SUCCESS:
        return 'success';
      case FileStatus.FAILED:
        return 'error';
      case FileStatus.PROCESSING:
        return 'warning';
      default:
        return 'default';
    }
  };

  return (
    <div>
      <div style={{ marginBottom: '1rem' }}>
        <Button
          variant="contained"
          color="primary"
          onClick={handleDownloadSelected}
          disabled={selectedFiles.length === 0}
        >
          Download Selected
        </Button>
      </div>
      <TableContainer component={Paper}>
        <Table>
          <TableHead>
            <TableRow>
              <TableCell padding="checkbox">
                <Checkbox
                  checked={allSelected}
                  indeterminate={someSelected}
                  onChange={handleSelectAll}
                />
              </TableCell>
              <TableCell>Filename</TableCell>
              <TableCell>Status</TableCell>
              <TableCell>Created At</TableCell>
              <TableCell>Actions</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {files.map((file) => (
              <TableRow key={file.id}>
                <TableCell padding="checkbox">
                  <Checkbox
                    checked={selectedFiles.includes(file.id)}
                    onChange={() => handleSelectFile(file.id)}
                  />
                </TableCell>
                <TableCell>{file.filename}</TableCell>
                <TableCell>
                  <Chip
                    label={file.status}
                    color={getStatusColor(file.status)}
                    size="small"
                  />
                </TableCell>
                <TableCell>
                  {new Date(file.created_at).toLocaleString()}
                </TableCell>
                <TableCell>
                  {file.status === FileStatus.SUCCESS && (
                    <IconButton
                      onClick={() => handleDownload(file.id)}
                      size="small"
                    >
                      <DownloadIcon />
                    </IconButton>
                  )}
                </TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    </div>
  );
};
|
|
||||||
|
|
||||||
export default FileList;
|
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
import React, { useCallback } from 'react';
|
|
||||||
import { useDropzone } from 'react-dropzone';
|
|
||||||
import { Box, Typography, CircularProgress } from '@mui/material';
|
|
||||||
import { api } from '../services/api';
|
|
||||||
|
|
||||||
interface FileUploadProps {
|
|
||||||
onUploadComplete: () => void;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Drag-and-drop upload area; uploads accepted files one at a time.
const FileUpload: React.FC<FileUploadProps> = ({ onUploadComplete }) => {
  const [isUploading, setIsUploading] = React.useState(false);

  const onDrop = useCallback(async (acceptedFiles: File[]) => {
    // A drop where every file was rejected by the `accept` filter has nothing
    // to upload: skip the spinner and the list refresh.
    if (acceptedFiles.length === 0) {
      return;
    }

    setIsUploading(true);
    try {
      for (const file of acceptedFiles) {
        await api.uploadFile(file);
      }
    } catch (error) {
      console.error('Error uploading files:', error);
    } finally {
      setIsUploading(false);
      // Notify the parent even after a failure: files uploaded before the
      // error are already on the server and should appear in the list.
      onUploadComplete();
    }
  }, [onUploadComplete]);

  const { getRootProps, getInputProps, isDragActive } = useDropzone({
    onDrop,
    accept: {
      'application/pdf': ['.pdf'],
      'application/msword': ['.doc'],
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
      'text/markdown': ['.md'],
    },
  });

  return (
    <Box
      {...getRootProps()}
      sx={{
        border: '2px dashed #ccc',
        borderRadius: 2,
        p: 3,
        textAlign: 'center',
        cursor: 'pointer',
        bgcolor: isDragActive ? 'action.hover' : 'background.paper',
        '&:hover': {
          bgcolor: 'action.hover',
        },
      }}
    >
      <input {...getInputProps()} />
      {isUploading ? (
        <CircularProgress />
      ) : (
        <Typography>
          {isDragActive
            ? 'Drop the files here...'
            : 'Drag and drop files here, or click to select files'}
        </Typography>
      )}
    </Box>
  );
};
|
|
||||||
|
|
||||||
export default FileUpload;
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
import React from 'react';
|
|
||||||
import ReactDOM from 'react-dom/client';
|
|
||||||
import { QueryClient, QueryClientProvider } from 'react-query';
|
|
||||||
import { ThemeProvider, createTheme } from '@mui/material';
|
|
||||||
import CssBaseline from '@mui/material/CssBaseline';
|
|
||||||
import App from './App';
|
|
||||||
|
|
||||||
// Single react-query client handed to the provider below.
const queryClient = new QueryClient();

// Material UI theme: library defaults with the light palette.
const theme = createTheme({
  palette: {
    mode: 'light',
  },
});

// `getElementById` returns null when the host page has no #root element.
// Fail with an explicit message instead of casting the null away.
const container = document.getElementById('root');
if (!container) {
  throw new Error('Unable to start the application: element with id "root" was not found.');
}

const root = ReactDOM.createRoot(container);

root.render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <ThemeProvider theme={theme}>
        <CssBaseline />
        <App />
      </ThemeProvider>
    </QueryClientProvider>
  </React.StrictMode>
);
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
import axios from 'axios';
|
|
||||||
import { File, FileUploadResponse } from '../types/file';
|
|
||||||
|
|
||||||
// Base URL that every request below is built from.
const API_BASE_URL = 'http://localhost:8000/api/v1';

/**
 * Build the URL of a single file resource.
 *
 * The id is percent-encoded so characters such as `/`, `?` or `#` cannot
 * change which path the request is sent to.
 */
const fileUrl = (fileId: string, suffix = ''): string =>
  `${API_BASE_URL}/files/files/${encodeURIComponent(fileId)}${suffix}`;

/** HTTP client for the file endpoints; each method resolves to the response body. */
export const api = {
  /** POST one browser file as multipart form data under the field name `file`. */
  uploadFile: async (file: globalThis.File): Promise<FileUploadResponse> => {
    const formData = new FormData();
    formData.append('file', file);
    const response = await axios.post<FileUploadResponse>(`${API_BASE_URL}/files/upload`, formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    });
    return response.data;
  },

  /** GET the list of file records. */
  listFiles: async (): Promise<File[]> => {
    const response = await axios.get<File[]>(`${API_BASE_URL}/files/files`);
    return response.data;
  },

  /** GET the record of one file by id. */
  getFile: async (fileId: string): Promise<File> => {
    const response = await axios.get<File>(fileUrl(fileId));
    return response.data;
  },

  /** GET the download endpoint of one file and return the body as a Blob. */
  downloadFile: async (fileId: string): Promise<Blob> => {
    const response = await axios.get<Blob>(fileUrl(fileId, '/download'), {
      responseType: 'blob',
    });
    return response.data;
  },
};
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
/**
 * Values carried by the `status` field of a file record.
 * The members are the lowercase wire strings, so they can be compared
 * directly with what the API returns.
 */
export enum FileStatus {
  NOT_STARTED = "not_started",
  PROCESSING = "processing",
  SUCCESS = "success",
  FAILED = "failed"
}
|
|
||||||
|
|
||||||
/**
 * A file record as returned by the list and detail endpoints.
 * NOTE: this name shadows the DOM `File` type inside modules that import it;
 * use `globalThis.File` there when the browser type is needed.
 */
export interface File {
  // Identifier used to select, fetch and download the file.
  id: string;
  // Name shown in the file list.
  filename: string;
  // Current status; the list only offers a download for FileStatus.SUCCESS.
  status: FileStatus;
  // Optional error text; presumably set when status is FAILED (confirm against the backend).
  error_message?: string;
  // Timestamp string; the file list parses it with `new Date(...)`.
  created_at: string;
  // Timestamp string; format presumably matches created_at (confirm against the backend).
  updated_at: string;
}
|
|
||||||
|
|
||||||
/**
 * Body returned by the upload endpoint.
 * Declares the same fields as `File` except the optional `error_message`.
 */
export interface FileUploadResponse {
  id: string;
  filename: string;
  status: FileStatus;
  created_at: string;
  updated_at: string;
}
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
{
|
|
||||||
"compilerOptions": {
|
|
||||||
"target": "es5",
|
|
||||||
"lib": [
|
|
||||||
"dom",
|
|
||||||
"dom.iterable",
|
|
||||||
"esnext"
|
|
||||||
],
|
|
||||||
"allowJs": true,
|
|
||||||
"skipLibCheck": true,
|
|
||||||
"esModuleInterop": true,
|
|
||||||
"allowSyntheticDefaultImports": true,
|
|
||||||
"strict": true,
|
|
||||||
"forceConsistentCasingInFileNames": true,
|
|
||||||
"noFallthroughCasesInSwitch": true,
|
|
||||||
"module": "esnext",
|
|
||||||
"moduleResolution": "node",
|
|
||||||
"resolveJsonModule": true,
|
|
||||||
"isolatedModules": true,
|
|
||||||
"noEmit": true,
|
|
||||||
"jsx": "react-jsx"
|
|
||||||
},
|
|
||||||
"include": [
|
|
||||||
"src"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
import logging.config
|
import logging.config
|
||||||
# from config.settings import settings
|
from config.settings import settings
|
||||||
from .settings import settings
|
|
||||||
|
|
||||||
LOGGING_CONFIG = {
|
LOGGING_CONFIG = {
|
||||||
"version": 1,
|
"version": 1,
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
# settings.py
|
||||||
|
|
||||||
|
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Application configuration.

    Every field has a default declared here. Because the class derives from
    ``BaseSettings`` and points at ``.env``, values can be overridden
    without touching the code.
    """

    # Storage paths
    # Directory watched for incoming documents (passed to FileMonitor as input).
    OBJECT_STORAGE_PATH: str = ""
    # Directory that receives the processed documents.
    TARGET_DIRECTORY_PATH: str = ""

    # Ollama API settings
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

    # File monitoring settings
    # NOTE(review): presumably seconds between directory scans; confirm with the monitor.
    MONITOR_INTERVAL: int = 5

    # Logging settings
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    # pydantic v2 configuration. This replaces the nested ``class Config``,
    # which still works but is deprecated and emits a warning.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="allow",
    )


# Create settings instance
settings = Settings()
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import os
|
import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from .document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
from .processors import (
|
from document_handlers.processors import (
|
||||||
TxtDocumentProcessor,
|
TxtDocumentProcessor,
|
||||||
DocxDocumentProcessor,
|
DocxDocumentProcessor,
|
||||||
PdfDocumentProcessor,
|
PdfDocumentProcessor,
|
||||||
|
|
@ -1,13 +1,11 @@
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
from ..services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
from ...core.config import settings
|
from config.settings import settings
|
||||||
from ..utils.json_extractor import LLMJsonExtractor
|
from utils.json_extractor import LLMJsonExtractor
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
from document_handlers.processors.txt_processor import TxtDocumentProcessor
|
||||||
|
from document_handlers.processors.docx_processor import DocxDocumentProcessor
|
||||||
|
from document_handlers.processors.pdf_processor import PdfDocumentProcessor
|
||||||
|
from document_handlers.processors.md_processor import MarkdownDocumentProcessor
|
||||||
|
|
||||||
|
# Names exported by ``from document_handlers.processors import *``.
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||||
|
|
@ -1,13 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import docx
|
import docx
|
||||||
from ...document_handlers.document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||||
from magic_pdf.data.read_api import read_local_office
|
from magic_pdf.data.read_api import read_local_office
|
||||||
import logging
|
import logging
|
||||||
from ...services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
from ...config import settings
|
from config.settings import settings
|
||||||
from ...prompts.masking_prompts import get_masking_mapping_prompt
|
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
import os
|
import os
|
||||||
from ...document_handlers.document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
from ...services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
import logging
|
import logging
|
||||||
from ...config import settings
|
from config.settings import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -1,14 +1,14 @@
|
||||||
import os
|
import os
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
from ...document_handlers.document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||||
from magic_pdf.data.dataset import PymuDocDataset
|
from magic_pdf.data.dataset import PymuDocDataset
|
||||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||||
from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
from prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
||||||
import logging
|
import logging
|
||||||
from ...services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
from ...config import settings
|
from config.settings import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
from ...document_handlers.document_processor import DocumentProcessor
|
from document_handlers.document_processor import DocumentProcessor
|
||||||
from ...services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
import logging
|
import logging
|
||||||
from ...prompts.masking_prompts import get_masking_prompt
|
from prompts.masking_prompts import get_masking_prompt
|
||||||
from ...config import settings
|
from config.settings import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
class TxtDocumentProcessor(DocumentProcessor):
|
class TxtDocumentProcessor(DocumentProcessor):
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
from config.logging_config import setup_logging
|
||||||
|
|
||||||
|
def main():
    """Configure logging, then watch the storage directory until interrupted.

    Reads the input and output directories from ``settings``, builds a
    ``FileMonitor`` for them and hands control to its monitoring loop.
    """
    # Logging is configured before the service modules are imported
    # (presumably so their module-level loggers pick up the configuration).
    setup_logging()

    from services.file_monitor import FileMonitor
    from config.settings import settings

    import logging
    log = logging.getLogger(__name__)

    log.info("Starting the application")
    source_dir = settings.OBJECT_STORAGE_PATH
    log.info(f"Monitoring directory: {source_dir}")
    target_dir = settings.TARGET_DIRECTORY_PATH
    log.info(f"Target directory: {target_dir}")

    # Build the monitor and enter its polling loop.
    monitor = FileMonitor(source_dir, target_dir)
    monitor.start_monitoring()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -1,12 +1,12 @@
|
||||||
import logging
|
import logging
|
||||||
from ..document_handlers.document_factory import DocumentProcessorFactory
|
from document_handlers.document_factory import DocumentProcessorFactory
|
||||||
from ..services.ollama_client import OllamaClient
|
from services.ollama_client import OllamaClient
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DocumentService:
|
class DocumentService:
|
||||||
def __init__(self):
|
def __init__(self, ollama_client: OllamaClient):
|
||||||
pass
|
self.ollama_client = ollama_client
|
||||||
|
|
||||||
def process_document(self, input_path: str, output_path: str) -> bool:
|
def process_document(self, input_path: str, output_path: str) -> bool:
|
||||||
try:
|
try:
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from services.document_service import DocumentService
|
||||||
|
from services.ollama_client import OllamaClient
|
||||||
|
from config.settings import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class FileMonitor:
    """Poll a directory and pass every newly appearing file to DocumentService.

    The monitor remembers the directory listing from the previous scan and
    treats any name that was not in it as new. Entries present when
    monitoring starts are never processed.
    """

    def __init__(self, input_directory: str, output_directory: str):
        """Store both directories and build the document service.

        Args:
            input_directory: Directory scanned for new files.
            output_directory: Directory the processed files are written to.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory

        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file and log the outcome; never raises.

        The output keeps the input's base name inside ``output_directory``.

        Args:
            file_path: Path of the file to process.
        """
        filename = os.path.basename(file_path)
        output_path = os.path.join(self.output_directory, filename)
        logger.info(f"Processing file: {filename}")

        try:
            result = self.document_service.process_document(file_path, output_path)
        except Exception as e:
            # A single bad document must not stop the monitoring loop;
            # logger.exception keeps the traceback for diagnosis.
            logger.exception(f"Error processing file {file_path}: {e}")
            return

        # process_document is annotated to return bool. Only an explicit
        # False is reported as a failure, so success is no longer logged
        # for documents the service rejected.
        if result is False:
            logger.error(f"Error processing file {file_path}: document service reported failure")
            return
        logger.info(f"File processed successfully: {filename}")

    def _poll_once(self, already_seen):
        """Process entries that appeared since ``already_seen``.

        Args:
            already_seen: Set of entry names from the previous scan.

        Returns:
            The set of entry names currently in the input directory, to be
            passed back in on the next call.
        """
        current_files = set(os.listdir(self.input_directory))

        # Sorted so files that arrive together are handled in a stable order.
        for new_file in sorted(current_files - already_seen):
            file_path = os.path.join(self.input_directory, new_file)
            if not os.path.isfile(file_path):
                # Sub-directories and other non-files are not documents.
                logger.debug(f"Skipping non-file entry: {new_file}")
                continue
            logger.info(f"New file found: {new_file}")
            # NOTE(review): the file is picked up as soon as its name shows
            # up; a file that is still being copied may be read incomplete.
            self.process_new_file(file_path)

        return current_files

    def start_monitoring(self, poll_interval=None):
        """Scan the input directory forever, processing new files.

        Args:
            poll_interval: Seconds to sleep between scans. Defaults to
                ``settings.MONITOR_INTERVAL``, which was previously ignored
                in favour of a hard-coded one second.
        """
        import time

        interval = settings.MONITOR_INTERVAL if poll_interval is None else poll_interval

        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)

        already_seen = set(os.listdir(self.input_directory))
        while True:
            time.sleep(interval)
            already_seen = self._poll_once(already_seen)
|
||||||
Loading…
Reference in New Issue