# legal-doc-masker/backend/Dockerfile

# Base image: Debian-based slim variant of the official Python 3.11 image
FROM python:3.11-slim

# All following relative paths (COPY destinations, RUN commands, CMD) resolve against /app
WORKDIR /app

# OS-level packages needed by the backend image.
# The package index is refreshed, the packages are installed, and the apt
# lists are removed within a single layer so the index never persists in the image.
RUN apt-get update \
    && apt-get install --yes \
        build-essential \
        git \
        libreoffice \
        wget \
    && rm -rf /var/lib/apt/lists/*


# Python dependencies.
# Layers are ordered from least to most frequently changing, so that editing
# requirements.txt does not force the large PyTorch download to be repeated.

# Upgrade the packaging toolchain. --no-cache-dir keeps pip's download cache
# out of the image layer.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Install the CPU-only PyTorch build from the official CPU wheel index.
# Using --index-url (rather than -f/--find-links, which leaves PyPI as the
# primary index) ensures the "+cpu" wheel is the one selected; the default
# PyPI wheel for x86-64 Linux pulls in the CUDA runtime packages and is far larger.
RUN pip install --no-cache-dir torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu

# Copy requirements only at this point: a change to requirements.txt then
# invalidates just the layers below, leaving the PyTorch layer cached.
COPY requirements.txt .

# Install the rest of the requirements
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download NER model during build (larger image but faster startup)
# RUN python -c "
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# model_name = 'uer/roberta-base-finetuned-cluener2020-chinese'
# print('Downloading NER model...')
# AutoTokenizer.from_pretrained(model_name)
# AutoModelForTokenClassification.from_pretrained(model_name)
# print('NER model downloaded successfully')
# "


# Bring the remaining build context into the working directory
COPY . ./

# Make sure the upload and processed-output directories exist in the image
RUN mkdir -p ./storage/uploads ./storage/processed

# Document the TCP port the server listens on (EXPOSE does not publish it)
EXPOSE 8000/tcp

# NOTE(review): no USER instruction is visible in this file, so the process
# presumably runs as root — consider a dedicated non-root user once volume
# ownership for storage/ has been checked.

# Default process: serve app.main:app with uvicorn on all interfaces, port 8000
CMD ["uvicorn", "app.main:app", \
     "--host", "0.0.0.0", \
     "--port", "8000"]