Compare commits
8 Commits
47e78c35bb
...
345fd05a2b
| Author | SHA1 | Date |
|---|---|---|
|
|
345fd05a2b | |
|
|
b3cf9f98a7 | |
|
|
24c5bbd5d7 | |
|
|
13ef24a3da | |
|
|
900a614b09 | |
|
|
3e9c44e8c4 | |
|
|
e0695e7f0e | |
|
|
76b0351f8f |
|
|
@ -71,3 +71,6 @@ __pycache__
|
|||
data/doc_dest
|
||||
data/doc_src
|
||||
data/doc_intermediate
|
||||
|
||||
node_modules
|
||||
backend/storage/
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
libreoffice \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Copy requirements first to leverage Docker cache
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install -U magic-pdf[full]
|
||||
|
||||
# Copy the rest of the application
|
||||
COPY . .
|
||||
|
||||
# Create storage directories
|
||||
RUN mkdir -p storage/uploads storage/processed
|
||||
|
||||
# Expose the port the app runs on
|
||||
EXPOSE 8000
|
||||
|
||||
# Command to run the application
|
||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
# Legal Document Masker API
|
||||
|
||||
This is the backend API for the Legal Document Masking system. It provides endpoints for file upload, processing status tracking, and file download.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.8+
|
||||
- Redis (for Celery)
|
||||
|
||||
## File Storage
|
||||
|
||||
Files are stored in the following structure:
|
||||
```
|
||||
backend/
|
||||
├── storage/
|
||||
│ ├── uploads/ # Original uploaded files
|
||||
│ └── processed/ # Masked/processed files
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
### Option 1: Local Development
|
||||
|
||||
1. Create a virtual environment:
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Set up environment variables:
|
||||
Create a `.env` file in the backend directory with the following variables:
|
||||
```env
|
||||
SECRET_KEY=your-secret-key-here
|
||||
```
|
||||
|
||||
The database (SQLite) will be automatically created when you first run the application.
|
||||
|
||||
4. Start Redis (required for Celery):
|
||||
```bash
|
||||
redis-server
|
||||
```
|
||||
|
||||
5. Start Celery worker:
|
||||
```bash
|
||||
celery -A app.services.file_service worker --loglevel=info
|
||||
```
|
||||
|
||||
6. Start the FastAPI server:
|
||||
```bash
|
||||
uvicorn app.main:app --reload
|
||||
```
|
||||
|
||||
### Option 2: Docker Deployment
|
||||
|
||||
1. Build and start the services:
|
||||
```bash
|
||||
docker-compose up --build
|
||||
```
|
||||
|
||||
This will start:
|
||||
- FastAPI server on port 8000
|
||||
- Celery worker for background processing
|
||||
- Redis for task queue
|
||||
|
||||
## API Documentation
|
||||
|
||||
Once the server is running, you can access:
|
||||
- Swagger UI: `http://localhost:8000/docs`
|
||||
- ReDoc: `http://localhost:8000/redoc`
|
||||
|
||||
## API Endpoints
|
||||
|
||||
- `POST /api/v1/files/upload` - Upload a new file
|
||||
- `GET /api/v1/files` - List all files
|
||||
- `GET /api/v1/files/{file_id}` - Get file details
|
||||
- `GET /api/v1/files/{file_id}/download` - Download processed file
|
||||
- `WS /api/v1/files/ws/status/{file_id}` - WebSocket for real-time status updates
|
||||
|
||||
## Development
|
||||
|
||||
### Running Tests
|
||||
```bash
|
||||
pytest
|
||||
```
|
||||
|
||||
### Code Style
|
||||
The project uses Black for code formatting:
|
||||
```bash
|
||||
black .
|
||||
```
|
||||
|
||||
### Docker Commands
|
||||
|
||||
- Start services: `docker-compose up`
|
||||
- Start in background: `docker-compose up -d`
|
||||
- Stop services: `docker-compose down`
|
||||
- View logs: `docker-compose logs -f`
|
||||
- Rebuild: `docker-compose up --build`
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, WebSocket, Response
|
||||
from fastapi.responses import FileResponse
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List
|
||||
import os
|
||||
from ...core.config import settings
|
||||
from ...core.database import get_db
|
||||
from ...models.file import File as FileModel, FileStatus
|
||||
from ...services.file_service import process_file
|
||||
from ...schemas.file import FileResponse as FileResponseSchema, FileList
|
||||
import asyncio
|
||||
from fastapi import WebSocketDisconnect
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.post("/upload", response_model=FileResponseSchema)
|
||||
async def upload_file(
|
||||
file: UploadFile = File(...),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="No file provided")
|
||||
|
||||
if not any(file.filename.lower().endswith(ext) for ext in settings.ALLOWED_EXTENSIONS):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
|
||||
)
|
||||
|
||||
# Save file
|
||||
file_path = settings.UPLOAD_FOLDER / file.filename
|
||||
with open(file_path, "wb") as buffer:
|
||||
content = await file.read()
|
||||
buffer.write(content)
|
||||
|
||||
# Create database entry
|
||||
db_file = FileModel(
|
||||
filename=file.filename,
|
||||
original_path=str(file_path),
|
||||
status=FileStatus.NOT_STARTED
|
||||
)
|
||||
db.add(db_file)
|
||||
db.commit()
|
||||
db.refresh(db_file)
|
||||
|
||||
# Start processing
|
||||
process_file.delay(str(db_file.id))
|
||||
|
||||
return db_file
|
||||
|
||||
@router.get("/files", response_model=List[FileResponseSchema])
|
||||
def list_files(
|
||||
skip: int = 0,
|
||||
limit: int = 100,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
files = db.query(FileModel).offset(skip).limit(limit).all()
|
||||
return files
|
||||
|
||||
@router.get("/files/{file_id}", response_model=FileResponseSchema)
|
||||
def get_file(
|
||||
file_id: str,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
file = db.query(FileModel).filter(FileModel.id == file_id).first()
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
return file
|
||||
|
||||
@router.get("/files/{file_id}/download")
|
||||
async def download_file(
|
||||
file_id: str,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
file = db.query(FileModel).filter(FileModel.id == file_id).first()
|
||||
if not file:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
if file.status != FileStatus.SUCCESS:
|
||||
raise HTTPException(status_code=400, detail="File is not ready for download")
|
||||
|
||||
if not os.path.exists(file.processed_path):
|
||||
raise HTTPException(status_code=404, detail="Processed file not found")
|
||||
|
||||
return FileResponse(
|
||||
path=file.processed_path,
|
||||
filename=file.filename,
|
||||
media_type="application/octet-stream"
|
||||
)
|
||||
|
||||
@router.websocket("/ws/status/{file_id}")
|
||||
async def websocket_endpoint(websocket: WebSocket, file_id: str, db: Session = Depends(get_db)):
|
||||
await websocket.accept()
|
||||
try:
|
||||
while True:
|
||||
file = db.query(FileModel).filter(FileModel.id == file_id).first()
|
||||
if not file:
|
||||
await websocket.send_json({"error": "File not found"})
|
||||
break
|
||||
|
||||
await websocket.send_json({
|
||||
"status": file.status,
|
||||
"error": file.error_message
|
||||
})
|
||||
|
||||
if file.status in [FileStatus.SUCCESS, FileStatus.FAILED]:
|
||||
break
|
||||
|
||||
await asyncio.sleep(1)
|
||||
except WebSocketDisconnect:
|
||||
pass
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
from pydantic_settings import BaseSettings
|
||||
from typing import Optional
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class Settings(BaseSettings):
|
||||
# API Settings
|
||||
API_V1_STR: str = "/api/v1"
|
||||
PROJECT_NAME: str = "Legal Document Masker API"
|
||||
|
||||
# Security
|
||||
SECRET_KEY: str = "your-secret-key-here" # Change in production
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8 # 8 days
|
||||
|
||||
# Database
|
||||
BASE_DIR: Path = Path(__file__).parent.parent.parent
|
||||
DATABASE_URL: str = f"sqlite:///{BASE_DIR}/storage/legal_doc_masker.db"
|
||||
|
||||
# File Storage
|
||||
UPLOAD_FOLDER: Path = BASE_DIR / "storage" / "uploads"
|
||||
PROCESSED_FOLDER: Path = BASE_DIR / "storage" / "processed"
|
||||
MAX_FILE_SIZE: int = 50 * 1024 * 1024 # 50MB
|
||||
ALLOWED_EXTENSIONS: set = {"pdf", "docx", "doc", "md"}
|
||||
|
||||
# Celery
|
||||
CELERY_BROKER_URL: str = "redis://redis:6379/0"
|
||||
CELERY_RESULT_BACKEND: str = "redis://redis:6379/0"
|
||||
|
||||
# Ollama API settings
|
||||
OLLAMA_API_URL: str = "https://api.ollama.com"
|
||||
OLLAMA_API_KEY: str = ""
|
||||
OLLAMA_MODEL: str = "llama2"
|
||||
|
||||
# Logging settings
|
||||
LOG_LEVEL: str = "INFO"
|
||||
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
|
||||
LOG_FILE: str = "app.log"
|
||||
|
||||
class Config:
|
||||
case_sensitive = True
|
||||
env_file = ".env"
|
||||
env_file_encoding = "utf-8"
|
||||
extra = "allow"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
# Create storage directories if they don't exist
|
||||
self.UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
self.PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
# Create storage directory for database
|
||||
(self.BASE_DIR / "storage").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
settings = Settings()
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
import logging.config
|
||||
from config.settings import settings
|
||||
# from config.settings import settings
|
||||
from .settings import settings
|
||||
|
||||
LOGGING_CONFIG = {
|
||||
"version": 1,
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from .config import settings
|
||||
|
||||
# Create SQLite engine with check_same_thread=False for FastAPI
|
||||
engine = create_engine(
|
||||
settings.DATABASE_URL,
|
||||
connect_args={"check_same_thread": False}
|
||||
)
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
# Dependency
|
||||
def get_db():
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
from typing import Optional
|
||||
from document_handlers.document_processor import DocumentProcessor
|
||||
from document_handlers.processors import (
|
||||
from .document_processor import DocumentProcessor
|
||||
from .processors import (
|
||||
TxtDocumentProcessor,
|
||||
DocxDocumentProcessor,
|
||||
PdfDocumentProcessor,
|
||||
|
|
@ -1,11 +1,13 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
||||
import logging
|
||||
import json
|
||||
from services.ollama_client import OllamaClient
|
||||
from config.settings import settings
|
||||
from utils.json_extractor import LLMJsonExtractor
|
||||
from ..services.ollama_client import OllamaClient
|
||||
from ...core.config import settings
|
||||
from ..utils.json_extractor import LLMJsonExtractor
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
from .txt_processor import TxtDocumentProcessor
|
||||
from .docx_processor import DocxDocumentProcessor
|
||||
from .pdf_processor import PdfDocumentProcessor
|
||||
from .md_processor import MarkdownDocumentProcessor
|
||||
|
||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
import os
|
||||
import docx
|
||||
from document_handlers.document_processor import DocumentProcessor
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.data.read_api import read_local_office
|
||||
import logging
|
||||
from services.ollama_client import OllamaClient
|
||||
from config.settings import settings
|
||||
from prompts.masking_prompts import get_masking_mapping_prompt
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
from ...prompts.masking_prompts import get_masking_mapping_prompt
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
import os
|
||||
from document_handlers.document_processor import DocumentProcessor
|
||||
from services.ollama_client import OllamaClient
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from config.settings import settings
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
import os
|
||||
import PyPDF2
|
||||
from document_handlers.document_processor import DocumentProcessor
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
||||
from magic_pdf.data.dataset import PymuDocDataset
|
||||
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
||||
from magic_pdf.config.enums import SupportedPdfParseMethod
|
||||
from prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
||||
from ...prompts.masking_prompts import get_masking_prompt, get_masking_mapping_prompt
|
||||
import logging
|
||||
from services.ollama_client import OllamaClient
|
||||
from config.settings import settings
|
||||
from ...services.ollama_client import OllamaClient
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
from document_handlers.document_processor import DocumentProcessor
|
||||
from services.ollama_client import OllamaClient
|
||||
from ...document_handlers.document_processor import DocumentProcessor
|
||||
from ...services.ollama_client import OllamaClient
|
||||
import logging
|
||||
from prompts.masking_prompts import get_masking_prompt
|
||||
from config.settings import settings
|
||||
from ...prompts.masking_prompts import get_masking_prompt
|
||||
from ...config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
class TxtDocumentProcessor(DocumentProcessor):
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
import logging
|
||||
from document_handlers.document_factory import DocumentProcessorFactory
|
||||
from services.ollama_client import OllamaClient
|
||||
from ..document_handlers.document_factory import DocumentProcessorFactory
|
||||
from ..services.ollama_client import OllamaClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentService:
|
||||
def __init__(self, ollama_client: OllamaClient):
|
||||
self.ollama_client = ollama_client
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def process_document(self, input_path: str, output_path: str) -> bool:
|
||||
try:
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from .core.config import settings
|
||||
from .api.endpoints import files
|
||||
from .core.database import engine, Base
|
||||
|
||||
# Create database tables
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
||||
app = FastAPI(
|
||||
title=settings.PROJECT_NAME,
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json"
|
||||
)
|
||||
|
||||
# Set up CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # In production, replace with specific origins
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Include routers
|
||||
app.include_router(
|
||||
files.router,
|
||||
prefix=f"{settings.API_V1_STR}/files",
|
||||
tags=["files"]
|
||||
)
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {"message": "Welcome to Legal Document Masker API"}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
from sqlalchemy import Column, String, DateTime, Text
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
from ..core.database import Base
|
||||
|
||||
class FileStatus(str):
|
||||
NOT_STARTED = "not_started"
|
||||
PROCESSING = "processing"
|
||||
SUCCESS = "success"
|
||||
FAILED = "failed"
|
||||
|
||||
class File(Base):
|
||||
__tablename__ = "files"
|
||||
|
||||
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
|
||||
filename = Column(String(255), nullable=False)
|
||||
original_path = Column(String(255), nullable=False)
|
||||
processed_path = Column(String(255))
|
||||
status = Column(String(20), nullable=False, default=FileStatus.NOT_STARTED)
|
||||
error_message = Column(Text)
|
||||
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from uuid import UUID
|
||||
|
||||
class FileBase(BaseModel):
|
||||
filename: str
|
||||
status: str
|
||||
error_message: Optional[str] = None
|
||||
|
||||
class FileResponse(FileBase):
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
class FileList(BaseModel):
|
||||
files: list[FileResponse]
|
||||
total: int
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
from celery import Celery
|
||||
from ..core.config import settings
|
||||
from ..models.file import File, FileStatus
|
||||
from sqlalchemy.orm import Session
|
||||
from ..core.database import SessionLocal
|
||||
import sys
|
||||
import os
|
||||
from ..core.services.document_service import DocumentService
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
celery = Celery(
|
||||
'file_service',
|
||||
broker=settings.CELERY_BROKER_URL,
|
||||
backend=settings.CELERY_RESULT_BACKEND
|
||||
)
|
||||
|
||||
@celery.task
|
||||
def process_file(file_id: str):
|
||||
db = SessionLocal()
|
||||
try:
|
||||
file = db.query(File).filter(File.id == file_id).first()
|
||||
if not file:
|
||||
return
|
||||
|
||||
# Update status to processing
|
||||
file.status = FileStatus.PROCESSING
|
||||
db.commit()
|
||||
|
||||
try:
|
||||
# Process the file using your existing masking system
|
||||
process_service = DocumentService()
|
||||
|
||||
# Determine output path
|
||||
input_path = Path(file.original_path)
|
||||
output_filename = f"processed_{input_path.name}"
|
||||
output_path = str(settings.PROCESSED_FOLDER / output_filename)
|
||||
|
||||
# Process document with both input and output paths
|
||||
process_service.process_document(file.original_path, output_path)
|
||||
|
||||
# Update file record with processed path
|
||||
file.processed_path = output_path
|
||||
file.status = FileStatus.SUCCESS
|
||||
db.commit()
|
||||
|
||||
except Exception as e:
|
||||
file.status = FileStatus.FAILED
|
||||
file.error_message = str(e)
|
||||
db.commit()
|
||||
raise
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
version: '3.8'
|
||||
|
||||
services:
|
||||
api:
|
||||
build: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
celery_worker:
|
||||
build: .
|
||||
command: celery -A app.services.file_service worker --loglevel=info
|
||||
volumes:
|
||||
- ./storage:/app/storage
|
||||
- ./legal_doc_masker.db:/app/legal_doc_masker.db
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
depends_on:
|
||||
- redis
|
||||
- api
|
||||
|
||||
redis:
|
||||
image: redis:alpine
|
||||
ports:
|
||||
- "6379:6379"
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"name": "backend",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {}
|
||||
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
# FastAPI and server
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
python-multipart>=0.0.6
|
||||
websockets>=12.0
|
||||
|
||||
# Database
|
||||
sqlalchemy>=2.0.0
|
||||
alembic>=1.12.0
|
||||
|
||||
# Background tasks
|
||||
celery>=5.3.0
|
||||
redis>=5.0.0
|
||||
|
||||
# Security
|
||||
python-jose[cryptography]>=3.3.0
|
||||
passlib[bcrypt]>=1.7.4
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
httpx>=0.25.0
|
||||
|
||||
# Existing project dependencies
|
||||
pydantic-settings>=2.0.0
|
||||
watchdog==2.1.6
|
||||
requests==2.28.1
|
||||
python-docx>=0.8.11
|
||||
PyPDF2>=3.0.0
|
||||
pandas>=2.0.0
|
||||
magic-pdf[full]
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
rm ./doc_src/*.md
|
||||
cp ./doc/*.md ./doc_src/
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
# Legal Document Masker Frontend
|
||||
|
||||
This is the frontend application for the Legal Document Masker service. It provides a user interface for uploading legal documents, monitoring their processing status, and downloading the masked versions.
|
||||
|
||||
## Features
|
||||
|
||||
- Drag and drop file upload
|
||||
- Real-time status updates
|
||||
- File list with processing status
|
||||
- Multi-file selection and download
|
||||
- Modern Material-UI interface
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (v14 or higher)
|
||||
- npm (v6 or higher)
|
||||
|
||||
## Installation
|
||||
|
||||
1. Install dependencies:
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
2. Start the development server:
|
||||
```bash
|
||||
npm start
|
||||
```
|
||||
|
||||
The application will be available at http://localhost:3000
|
||||
|
||||
## Development
|
||||
|
||||
The frontend is built with:
|
||||
- React 18
|
||||
- TypeScript
|
||||
- Material-UI
|
||||
- React Query for data fetching
|
||||
- React Dropzone for file uploads
|
||||
|
||||
## Building for Production
|
||||
|
||||
To create a production build:
|
||||
|
||||
```bash
|
||||
npm run build
|
||||
```
|
||||
|
||||
The build artifacts will be stored in the `build/` directory.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `REACT_APP_API_URL`: The URL of the backend API (default: http://localhost:8000/api/v1)
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,50 @@
|
|||
{
|
||||
"name": "legal-doc-masker-frontend",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@emotion/react": "^11.11.3",
|
||||
"@emotion/styled": "^11.11.0",
|
||||
"@mui/icons-material": "^5.15.10",
|
||||
"@mui/material": "^5.15.10",
|
||||
"@testing-library/jest-dom": "^5.17.0",
|
||||
"@testing-library/react": "^13.4.0",
|
||||
"@testing-library/user-event": "^13.5.0",
|
||||
"@types/jest": "^27.5.2",
|
||||
"@types/node": "^16.18.80",
|
||||
"@types/react": "^18.2.55",
|
||||
"@types/react-dom": "^18.2.19",
|
||||
"axios": "^1.6.7",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0",
|
||||
"react-dropzone": "^14.2.3",
|
||||
"react-query": "^3.39.3",
|
||||
"react-scripts": "5.0.1",
|
||||
"typescript": "^4.9.5",
|
||||
"web-vitals": "^2.1.4"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "react-scripts start",
|
||||
"build": "react-scripts build",
|
||||
"test": "react-scripts test",
|
||||
"eject": "react-scripts eject"
|
||||
},
|
||||
"eslintConfig": {
|
||||
"extends": [
|
||||
"react-app",
|
||||
"react-app/jest"
|
||||
]
|
||||
},
|
||||
"browserslist": {
|
||||
"production": [
|
||||
">0.2%",
|
||||
"not dead",
|
||||
"not op_mini all"
|
||||
],
|
||||
"development": [
|
||||
"last 1 chrome version",
|
||||
"last 1 firefox version",
|
||||
"last 1 safari version"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="theme-color" content="#000000" />
|
||||
<meta
|
||||
name="description"
|
||||
content="Legal Document Masker - Upload and process legal documents"
|
||||
/>
|
||||
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
|
||||
<link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
|
||||
<title>Legal Document Masker</title>
|
||||
</head>
|
||||
<body>
|
||||
<noscript>You need to enable JavaScript to run this app.</noscript>
|
||||
<div id="root"></div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"short_name": "Legal Doc Masker",
|
||||
"name": "Legal Document Masker",
|
||||
"icons": [
|
||||
{
|
||||
"src": "favicon.ico",
|
||||
"sizes": "64x64 32x32 24x24 16x16",
|
||||
"type": "image/x-icon"
|
||||
}
|
||||
],
|
||||
"start_url": ".",
|
||||
"display": "standalone",
|
||||
"theme_color": "#000000",
|
||||
"background_color": "#ffffff"
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
import React, { useEffect, useState } from 'react';
|
||||
import { Container, Typography, Box } from '@mui/material';
|
||||
import { useQuery, useQueryClient } from 'react-query';
|
||||
import FileUpload from './components/FileUpload';
|
||||
import FileList from './components/FileList';
|
||||
import { File } from './types/file';
|
||||
import { api } from './services/api';
|
||||
|
||||
function App() {
|
||||
const queryClient = useQueryClient();
|
||||
const [files, setFiles] = useState<File[]>([]);
|
||||
|
||||
const { data, isLoading, error } = useQuery<File[]>('files', api.listFiles, {
|
||||
refetchInterval: 5000, // Poll every 5 seconds
|
||||
});
|
||||
|
||||
useEffect(() => {
|
||||
if (data) {
|
||||
setFiles(data);
|
||||
}
|
||||
}, [data]);
|
||||
|
||||
const handleUploadComplete = () => {
|
||||
queryClient.invalidateQueries('files');
|
||||
};
|
||||
|
||||
if (isLoading) {
|
||||
return (
|
||||
<Container>
|
||||
<Typography>Loading...</Typography>
|
||||
</Container>
|
||||
);
|
||||
}
|
||||
|
||||
if (error) {
|
||||
return (
|
||||
<Container>
|
||||
<Typography color="error">Error loading files</Typography>
|
||||
</Container>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<Container maxWidth="lg">
|
||||
<Box sx={{ my: 4 }}>
|
||||
<Typography variant="h4" component="h1" gutterBottom>
|
||||
Legal Document Masker
|
||||
</Typography>
|
||||
<Box sx={{ mb: 4 }}>
|
||||
<FileUpload onUploadComplete={handleUploadComplete} />
|
||||
</Box>
|
||||
<FileList files={files} onFileStatusChange={handleUploadComplete} />
|
||||
</Box>
|
||||
</Container>
|
||||
);
|
||||
}
|
||||
|
||||
export default App;
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
import React, { useState } from 'react';
|
||||
import {
|
||||
Table,
|
||||
TableBody,
|
||||
TableCell,
|
||||
TableContainer,
|
||||
TableHead,
|
||||
TableRow,
|
||||
Paper,
|
||||
IconButton,
|
||||
Checkbox,
|
||||
Button,
|
||||
Chip,
|
||||
} from '@mui/material';
|
||||
import { Download as DownloadIcon } from '@mui/icons-material';
|
||||
import { File, FileStatus } from '../types/file';
|
||||
import { api } from '../services/api';
|
||||
|
||||
interface FileListProps {
|
||||
files: File[];
|
||||
onFileStatusChange: () => void;
|
||||
}
|
||||
|
||||
const FileList: React.FC<FileListProps> = ({ files, onFileStatusChange }) => {
|
||||
const [selectedFiles, setSelectedFiles] = useState<string[]>([]);
|
||||
|
||||
const handleSelectFile = (fileId: string) => {
|
||||
setSelectedFiles((prev) =>
|
||||
prev.includes(fileId)
|
||||
? prev.filter((id) => id !== fileId)
|
||||
: [...prev, fileId]
|
||||
);
|
||||
};
|
||||
|
||||
const handleSelectAll = () => {
|
||||
setSelectedFiles((prev) =>
|
||||
prev.length === files.length ? [] : files.map((file) => file.id)
|
||||
);
|
||||
};
|
||||
|
||||
const handleDownload = async (fileId: string) => {
|
||||
try {
|
||||
const blob = await api.downloadFile(fileId);
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = files.find((f) => f.id === fileId)?.filename || 'downloaded-file';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
} catch (error) {
|
||||
console.error('Error downloading file:', error);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDownloadSelected = async () => {
|
||||
for (const fileId of selectedFiles) {
|
||||
await handleDownload(fileId);
|
||||
}
|
||||
};
|
||||
|
||||
const getStatusColor = (status: FileStatus) => {
|
||||
switch (status) {
|
||||
case FileStatus.SUCCESS:
|
||||
return 'success';
|
||||
case FileStatus.FAILED:
|
||||
return 'error';
|
||||
case FileStatus.PROCESSING:
|
||||
return 'warning';
|
||||
default:
|
||||
return 'default';
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<div>
|
||||
<div style={{ marginBottom: '1rem' }}>
|
||||
<Button
|
||||
variant="contained"
|
||||
color="primary"
|
||||
onClick={handleDownloadSelected}
|
||||
disabled={selectedFiles.length === 0}
|
||||
>
|
||||
Download Selected
|
||||
</Button>
|
||||
</div>
|
||||
<TableContainer component={Paper}>
|
||||
<Table>
|
||||
<TableHead>
|
||||
<TableRow>
|
||||
<TableCell padding="checkbox">
|
||||
<Checkbox
|
||||
checked={selectedFiles.length === files.length}
|
||||
indeterminate={selectedFiles.length > 0 && selectedFiles.length < files.length}
|
||||
onChange={handleSelectAll}
|
||||
/>
|
||||
</TableCell>
|
||||
<TableCell>Filename</TableCell>
|
||||
<TableCell>Status</TableCell>
|
||||
<TableCell>Created At</TableCell>
|
||||
<TableCell>Actions</TableCell>
|
||||
</TableRow>
|
||||
</TableHead>
|
||||
<TableBody>
|
||||
{files.map((file) => (
|
||||
<TableRow key={file.id}>
|
||||
<TableCell padding="checkbox">
|
||||
<Checkbox
|
||||
checked={selectedFiles.includes(file.id)}
|
||||
onChange={() => handleSelectFile(file.id)}
|
||||
/>
|
||||
</TableCell>
|
||||
<TableCell>{file.filename}</TableCell>
|
||||
<TableCell>
|
||||
<Chip
|
||||
label={file.status}
|
||||
color={getStatusColor(file.status) as any}
|
||||
size="small"
|
||||
/>
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
{new Date(file.created_at).toLocaleString()}
|
||||
</TableCell>
|
||||
<TableCell>
|
||||
{file.status === FileStatus.SUCCESS && (
|
||||
<IconButton
|
||||
onClick={() => handleDownload(file.id)}
|
||||
size="small"
|
||||
>
|
||||
<DownloadIcon />
|
||||
</IconButton>
|
||||
)}
|
||||
</TableCell>
|
||||
</TableRow>
|
||||
))}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</TableContainer>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default FileList;
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
import React, { useCallback } from 'react';
|
||||
import { useDropzone } from 'react-dropzone';
|
||||
import { Box, Typography, CircularProgress } from '@mui/material';
|
||||
import { api } from '../services/api';
|
||||
|
||||
interface FileUploadProps {
|
||||
onUploadComplete: () => void;
|
||||
}
|
||||
|
||||
const FileUpload: React.FC<FileUploadProps> = ({ onUploadComplete }) => {
|
||||
const [isUploading, setIsUploading] = React.useState(false);
|
||||
|
||||
const onDrop = useCallback(async (acceptedFiles: File[]) => {
|
||||
setIsUploading(true);
|
||||
try {
|
||||
for (const file of acceptedFiles) {
|
||||
await api.uploadFile(file);
|
||||
}
|
||||
onUploadComplete();
|
||||
} catch (error) {
|
||||
console.error('Error uploading files:', error);
|
||||
} finally {
|
||||
setIsUploading(false);
|
||||
}
|
||||
}, [onUploadComplete]);
|
||||
|
||||
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
||||
onDrop,
|
||||
accept: {
|
||||
'application/pdf': ['.pdf'],
|
||||
'application/msword': ['.doc'],
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||
'text/markdown': ['.md'],
|
||||
},
|
||||
});
|
||||
|
||||
return (
|
||||
<Box
|
||||
{...getRootProps()}
|
||||
sx={{
|
||||
border: '2px dashed #ccc',
|
||||
borderRadius: 2,
|
||||
p: 3,
|
||||
textAlign: 'center',
|
||||
cursor: 'pointer',
|
||||
bgcolor: isDragActive ? 'action.hover' : 'background.paper',
|
||||
'&:hover': {
|
||||
bgcolor: 'action.hover',
|
||||
},
|
||||
}}
|
||||
>
|
||||
<input {...getInputProps()} />
|
||||
{isUploading ? (
|
||||
<CircularProgress />
|
||||
) : (
|
||||
<Typography>
|
||||
{isDragActive
|
||||
? 'Drop the files here...'
|
||||
: 'Drag and drop files here, or click to select files'}
|
||||
</Typography>
|
||||
)}
|
||||
</Box>
|
||||
);
|
||||
};
|
||||
|
||||
export default FileUpload;
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
import React from 'react';
import ReactDOM from 'react-dom/client';
import { QueryClient, QueryClientProvider } from 'react-query';
import { ThemeProvider, createTheme } from '@mui/material';
import CssBaseline from '@mui/material/CssBaseline';
import App from './App';

// Single react-query client shared by the whole component tree.
const queryClient = new QueryClient();

// Application-wide MUI theme (light mode).
const theme = createTheme({
  palette: {
    mode: 'light',
  },
});

const root = ReactDOM.createRoot(
  document.getElementById('root') as HTMLElement
);

// Mount the app under the react-query and MUI providers;
// CssBaseline normalizes browser default styles.
root.render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <ThemeProvider theme={theme}>
        <CssBaseline />
        <App />
      </ThemeProvider>
    </QueryClientProvider>
  </React.StrictMode>
);
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
import axios from 'axios';
|
||||
import { File, FileUploadResponse } from '../types/file';
|
||||
|
||||
const API_BASE_URL = 'http://localhost:8000/api/v1';
|
||||
|
||||
export const api = {
|
||||
uploadFile: async (file: globalThis.File): Promise<FileUploadResponse> => {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
const response = await axios.post(`${API_BASE_URL}/files/upload`, formData, {
|
||||
headers: {
|
||||
'Content-Type': 'multipart/form-data',
|
||||
},
|
||||
});
|
||||
return response.data;
|
||||
},
|
||||
|
||||
listFiles: async (): Promise<File[]> => {
|
||||
const response = await axios.get(`${API_BASE_URL}/files/files`);
|
||||
return response.data;
|
||||
},
|
||||
|
||||
getFile: async (fileId: string): Promise<File> => {
|
||||
const response = await axios.get(`${API_BASE_URL}/files/files/${fileId}`);
|
||||
return response.data;
|
||||
},
|
||||
|
||||
downloadFile: async (fileId: string): Promise<Blob> => {
|
||||
const response = await axios.get(`${API_BASE_URL}/files/files/${fileId}/download`, {
|
||||
responseType: 'blob',
|
||||
});
|
||||
return response.data;
|
||||
},
|
||||
};
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
// Processing lifecycle of an uploaded document; string values mirror the
// backend's status strings.
export enum FileStatus {
  NOT_STARTED = "not_started",
  PROCESSING = "processing",
  SUCCESS = "success",
  FAILED = "failed"
}

// A file record as returned by the list/detail endpoints.
export interface File {
  id: string;
  filename: string;
  status: FileStatus;
  // NOTE(review): presumably only populated when status is FAILED — confirm
  // against the backend model.
  error_message?: string;
  // Date strings parseable by `new Date(...)` (used that way by FileList).
  created_at: string;
  updated_at: string;
}

// Response returned immediately after a successful upload.
export interface FileUploadResponse {
  id: string;
  filename: string;
  status: FileStatus;
  created_at: string;
  updated_at: string;
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "es5",
|
||||
"lib": [
|
||||
"dom",
|
||||
"dom.iterable",
|
||||
"esnext"
|
||||
],
|
||||
"allowJs": true,
|
||||
"skipLibCheck": true,
|
||||
"esModuleInterop": true,
|
||||
"allowSyntheticDefaultImports": true,
|
||||
"strict": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"module": "esnext",
|
||||
"moduleResolution": "node",
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"noEmit": true,
|
||||
"jsx": "react-jsx"
|
||||
},
|
||||
"include": [
|
||||
"src"
|
||||
]
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
# settings.py
|
||||
|
||||
from pydantic_settings import BaseSettings
|
||||
from typing import Optional
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    # Storage paths
    OBJECT_STORAGE_PATH: str = ""  # directory watched for incoming documents
    TARGET_DIRECTORY_PATH: str = ""  # directory receiving processed output

    # Ollama API settings
    # NOTE(review): a local Ollama server usually listens on
    # http://localhost:11434 — confirm this hosted default is intended.
    OLLAMA_API_URL: str = "https://api.ollama.com"
    OLLAMA_API_KEY: str = ""
    OLLAMA_MODEL: str = "llama2"

    # File monitoring settings
    MONITOR_INTERVAL: int = 5  # intended scan interval, in seconds

    # Logging settings
    LOG_LEVEL: str = "INFO"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
    LOG_FILE: str = "app.log"

    class Config:
        # Read overrides from .env; extra keys in the environment are tolerated.
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "allow"

# Create settings instance
settings = Settings()
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
from document_handlers.processors.txt_processor import TxtDocumentProcessor
|
||||
from document_handlers.processors.docx_processor import DocxDocumentProcessor
|
||||
from document_handlers.processors.pdf_processor import PdfDocumentProcessor
|
||||
from document_handlers.processors.md_processor import MarkdownDocumentProcessor
|
||||
|
||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
22
src/main.py
22
src/main.py
|
|
@ -1,22 +0,0 @@
|
|||
from config.logging_config import setup_logging
|
||||
|
||||
def main():
    """Entry point: configure logging, then start the file monitor loop."""
    # Setup logging first
    setup_logging()

    # NOTE: these imports are deferred until after setup_logging() —
    # presumably so the configured handlers/levels apply to the modules'
    # loggers; confirm before hoisting them to the top of the file.
    from services.file_monitor import FileMonitor
    from config.settings import settings

    import logging
    logger = logging.getLogger(__name__)
    logger.info("Starting the application")
    logger.info(f"Monitoring directory: {settings.OBJECT_STORAGE_PATH}")
    logger.info(f"Target directory: {settings.TARGET_DIRECTORY_PATH}")
    # Initialize the file monitor
    file_monitor = FileMonitor(settings.OBJECT_STORAGE_PATH, settings.TARGET_DIRECTORY_PATH)

    # Start monitoring the directory for new files (blocks forever)
    file_monitor.start_monitoring()

if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
import logging
|
||||
import os
|
||||
from services.document_service import DocumentService
|
||||
from services.ollama_client import OllamaClient
|
||||
from config.settings import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class FileMonitor:
    """Polls a directory for newly added files and runs each through the
    document-processing pipeline, writing results to an output directory.
    """

    def __init__(self, input_directory: str, output_directory: str):
        """
        Args:
            input_directory: directory watched for incoming documents.
            output_directory: directory that receives processed output.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory

        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file, writing the result under the same filename in
        the output directory. Errors are logged, never raised, so a single
        bad file cannot stop the monitoring loop.
        """
        try:
            # Mirror the input filename in the output directory.
            filename = os.path.basename(file_path)
            output_path = os.path.join(self.output_directory, filename)

            # Fix: previous messages logged a literal placeholder instead of
            # the path. Lazy %-args avoid formatting when the level is off.
            logger.info("Processing file: %s", file_path)
            self.document_service.process_document(file_path, output_path)
            logger.info("File processed successfully: %s", output_path)

        except Exception:
            # logger.exception records the traceback along with the message.
            logger.exception("Error processing file %s", file_path)

    def start_monitoring(self):
        """Poll the input directory forever, processing files that appear
        after startup (entries present at startup are deliberately skipped).
        """
        import time

        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)

        already_seen = set(os.listdir(self.input_directory))
        while True:
            # Use the configured interval (was a hard-coded 1s that ignored
            # settings.MONITOR_INTERVAL).
            time.sleep(settings.MONITOR_INTERVAL)
            current_files = set(os.listdir(self.input_directory))
            new_files = current_files - already_seen

            for new_file in new_files:
                file_path = os.path.join(self.input_directory, new_file)
                # Skip subdirectories and other non-regular entries so they
                # are never handed to the document processor.
                if not os.path.isfile(file_path):
                    continue
                logger.info("New file found: %s", new_file)
                self.process_new_file(file_path)

            already_seen = current_files
|
||||
Loading…
Reference in New Issue