feat:在docker中集成mineru,并且修正下载文件名不正确的问题

This commit is contained in:
oliviamn 2025-05-26 23:07:10 +08:00
parent 345fd05a2b
commit dea3a6bd6a
3 changed files with 18 additions and 5 deletions

View File

@@ -6,14 +6,21 @@ WORKDIR /app
RUN apt-get update && apt-get install -y \
build-essential \
libreoffice \
wget \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install huggingface_hub
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
RUN python download_models_hf.py
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -U magic-pdf[full]
# Copy the rest of the application
COPY . .

View File

@@ -10,6 +10,7 @@ from ...services.file_service import process_file
from ...schemas.file import FileResponse as FileResponseSchema, FileList
import asyncio
from fastapi import WebSocketDisconnect
import uuid
router = APIRouter()
@@ -27,14 +28,20 @@ async def upload_file(
detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
)
-# Save file
-file_path = settings.UPLOAD_FOLDER / file.filename
+# Generate unique file ID
+file_id = str(uuid.uuid4())
+file_extension = os.path.splitext(file.filename)[1]
+unique_filename = f"{file_id}{file_extension}"
+# Save file with unique name
+file_path = settings.UPLOAD_FOLDER / unique_filename
with open(file_path, "wb") as buffer:
content = await file.read()
buffer.write(content)
# Create database entry
db_file = FileModel(
id=file_id,
filename=file.filename,
original_path=str(file_path),
status=FileStatus.NOT_STARTED

View File

@@ -31,9 +31,8 @@ def process_file(file_id: str):
# Process the file using your existing masking system
process_service = DocumentService()
-# Determine output path
-input_path = Path(file.original_path)
-output_filename = f"processed_{input_path.name}"
+# Determine output path using file_id with .md extension
+output_filename = f"{file_id}.md"
output_path = str(settings.PROCESSED_FOLDER / output_filename)
# Process document with both input and output paths