feat: 在 Docker 中集成 MinerU,并修正下载文件名不正确的问题
This commit is contained in:
parent
345fd05a2b
commit
dea3a6bd6a
|
|
@ -6,14 +6,21 @@ WORKDIR /app
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
build-essential \
|
build-essential \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
wget \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
# Copy requirements first to leverage Docker cache
|
# Copy requirements first to leverage Docker cache
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
RUN pip install huggingface_hub
|
||||||
|
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||||
|
RUN python download_models_hf.py
|
||||||
|
|
||||||
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
RUN pip install -U magic-pdf[full]
|
RUN pip install -U magic-pdf[full]
|
||||||
|
|
||||||
|
|
||||||
# Copy the rest of the application
|
# Copy the rest of the application
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from ...services.file_service import process_file
|
||||||
from ...schemas.file import FileResponse as FileResponseSchema, FileList
|
from ...schemas.file import FileResponse as FileResponseSchema, FileList
|
||||||
import asyncio
|
import asyncio
|
||||||
from fastapi import WebSocketDisconnect
|
from fastapi import WebSocketDisconnect
|
||||||
|
import uuid
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
@ -27,14 +28,20 @@ async def upload_file(
|
||||||
detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
|
detail=f"File type not allowed. Allowed types: {', '.join(settings.ALLOWED_EXTENSIONS)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save file
|
# Generate unique file ID
|
||||||
file_path = settings.UPLOAD_FOLDER / file.filename
|
file_id = str(uuid.uuid4())
|
||||||
|
file_extension = os.path.splitext(file.filename)[1]
|
||||||
|
unique_filename = f"{file_id}{file_extension}"
|
||||||
|
|
||||||
|
# Save file with unique name
|
||||||
|
file_path = settings.UPLOAD_FOLDER / unique_filename
|
||||||
with open(file_path, "wb") as buffer:
|
with open(file_path, "wb") as buffer:
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
buffer.write(content)
|
buffer.write(content)
|
||||||
|
|
||||||
# Create database entry
|
# Create database entry
|
||||||
db_file = FileModel(
|
db_file = FileModel(
|
||||||
|
id=file_id,
|
||||||
filename=file.filename,
|
filename=file.filename,
|
||||||
original_path=str(file_path),
|
original_path=str(file_path),
|
||||||
status=FileStatus.NOT_STARTED
|
status=FileStatus.NOT_STARTED
|
||||||
|
|
|
||||||
|
|
@ -31,9 +31,8 @@ def process_file(file_id: str):
|
||||||
# Process the file using your existing masking system
|
# Process the file using your existing masking system
|
||||||
process_service = DocumentService()
|
process_service = DocumentService()
|
||||||
|
|
||||||
# Determine output path
|
# Determine output path using file_id with .md extension
|
||||||
input_path = Path(file.original_path)
|
output_filename = f"{file_id}.md"
|
||||||
output_filename = f"processed_{input_path.name}"
|
|
||||||
output_path = str(settings.PROCESSED_FOLDER / output_filename)
|
output_path = str(settings.PROCESSED_FOLDER / output_filename)
|
||||||
|
|
||||||
# Process document with both input and output paths
|
# Process document with both input and output paths
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue