feat: 在 Docker 中集成 MinerU,并修正下载文件名不正确的问题
This commit is contained in:
parent
345fd05a2b
commit
dea3a6bd6a
|
|
@ -6,14 +6,21 @@ WORKDIR /app
|
|||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
libreoffice \
|
||||
wget \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Copy requirements first to leverage Docker cache
|
||||
COPY requirements.txt .
|
||||
RUN pip install huggingface_hub
|
||||
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||
RUN python download_models_hf.py
|
||||
|
||||
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install -U magic-pdf[full]
|
||||
|
||||
|
||||
# Copy the rest of the application
|
||||
COPY . .
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from ...services.file_service import process_file
|
|||
from ...schemas.file import FileResponse as FileResponseSchema, FileList
|
||||
import asyncio
|
||||
from fastapi import WebSocketDisconnect
|
||||
import uuid
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
|
@ -27,14 +28,20 @@ async def upload_file(
|
|||
detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
|
||||
)
|
||||
|
||||
# Save file
|
||||
file_path = settings.UPLOAD_FOLDER / file.filename
|
||||
# Generate unique file ID
|
||||
file_id = str(uuid.uuid4())
|
||||
file_extension = os.path.splitext(file.filename)[1]
|
||||
unique_filename = f"{file_id}{file_extension}"
|
||||
|
||||
# Save file with unique name
|
||||
file_path = settings.UPLOAD_FOLDER / unique_filename
|
||||
with open(file_path, "wb") as buffer:
|
||||
content = await file.read()
|
||||
buffer.write(content)
|
||||
|
||||
# Create database entry
|
||||
db_file = FileModel(
|
||||
id=file_id,
|
||||
filename=file.filename,
|
||||
original_path=str(file_path),
|
||||
status=FileStatus.NOT_STARTED
|
||||
|
|
|
|||
|
|
@ -31,9 +31,8 @@ def process_file(file_id: str):
|
|||
# Process the file using the existing masking system
|
||||
process_service = DocumentService()
|
||||
|
||||
# Determine output path
|
||||
input_path = Path(file.original_path)
|
||||
output_filename = f"processed_{input_path.name}"
|
||||
# Determine output path using file_id with .md extension
|
||||
output_filename = f"{file_id}.md"
|
||||
output_path = str(settings.PROCESSED_FOLDER / output_filename)
|
||||
|
||||
# Process document with both input and output paths
|
||||
|
|
|
|||
Loading…
Reference in New Issue