WIP: 暂时屏蔽docx,pdf解析

This commit is contained in:
oliviamn 2025-06-25 01:30:43 +08:00
parent 12c1b5f75e
commit ee95f1daa7
6 changed files with 22 additions and 18 deletions

View File

@@ -13,8 +13,10 @@ RUN apt-get update && apt-get install -y \
# Copy requirements first to leverage Docker cache # Copy requirements first to leverage Docker cache
COPY requirements.txt . COPY requirements.txt .
RUN pip install huggingface_hub RUN pip install huggingface_hub
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py # RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
RUN python download_models_hf.py # RUN wget https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.1/scripts/download_models_hf.py -O download_models_hf.py
# RUN python download_models_hf.py
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt

View File

@@ -3,8 +3,8 @@ from typing import Optional
from .document_processor import DocumentProcessor from .document_processor import DocumentProcessor
from .processors import ( from .processors import (
TxtDocumentProcessor, TxtDocumentProcessor,
DocxDocumentProcessor, # DocxDocumentProcessor,
PdfDocumentProcessor, # PdfDocumentProcessor,
MarkdownDocumentProcessor MarkdownDocumentProcessor
) )
@@ -15,9 +15,9 @@ class DocumentProcessorFactory:
processors = { processors = {
'.txt': TxtDocumentProcessor, '.txt': TxtDocumentProcessor,
'.docx': DocxDocumentProcessor, # '.docx': DocxDocumentProcessor,
'.doc': DocxDocumentProcessor, # '.doc': DocxDocumentProcessor,
'.pdf': PdfDocumentProcessor, # '.pdf': PdfDocumentProcessor,
'.md': MarkdownDocumentProcessor, '.md': MarkdownDocumentProcessor,
'.markdown': MarkdownDocumentProcessor '.markdown': MarkdownDocumentProcessor
} }

View File

@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Dict from typing import Any, Dict
from ..prompts.masking_prompts import get_masking_mapping_prompt from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
import logging import logging
import json import json
from ..services.ollama_client import OllamaClient from ..services.ollama_client import OllamaClient
@@ -77,7 +77,7 @@ class DocumentProcessor(ABC):
"""Build mapping for a single chunk of text with retry logic""" """Build mapping for a single chunk of text with retry logic"""
for attempt in range(self.max_retries): for attempt in range(self.max_retries):
try: try:
formatted_prompt = get_masking_mapping_prompt(chunk) formatted_prompt = get_ner_name_prompt(chunk)
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}") logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
response = self.ollama_client.generate(formatted_prompt) response = self.ollama_client.generate(formatted_prompt)
logger.info(f"Raw response from LLM: {response}") logger.info(f"Raw response from LLM: {response}")
@@ -175,16 +175,17 @@ class DocumentProcessor(ABC):
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
logger.info(f"Processing chunk {i+1}/{len(chunks)}") logger.info(f"Processing chunk {i+1}/{len(chunks)}")
chunk_mapping = self._build_mapping(chunk) chunk_mapping = self._build_mapping(chunk)
if chunk_mapping: # Only update if we got a valid mapping # if chunk_mapping: # Only update if we got a valid mapping
combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping) # combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
else: # else:
logger.warning(f"Failed to generate mapping for chunk {i+1}") # logger.warning(f"Failed to generate mapping for chunk {i+1}")
# Apply the combined mapping to the entire content # Apply the combined mapping to the entire content
masked_content = self._apply_mapping(content, combined_mapping) # masked_content = self._apply_mapping(content, combined_mapping)
logger.info("Successfully masked content") logger.info("Successfully masked content")
return masked_content # return masked_content
return ""
@abstractmethod @abstractmethod
def save_content(self, content: str) -> None: def save_content(self, content: str) -> None:

View File

@@ -1,6 +1,7 @@
from .txt_processor import TxtDocumentProcessor from .txt_processor import TxtDocumentProcessor
from .docx_processor import DocxDocumentProcessor # from .docx_processor import DocxDocumentProcessor
from .pdf_processor import PdfDocumentProcessor # from .pdf_processor import PdfDocumentProcessor
from .md_processor import MarkdownDocumentProcessor from .md_processor import MarkdownDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor'] # __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
__all__ = ['TxtDocumentProcessor', 'MarkdownDocumentProcessor']