WIP: 暂时屏蔽docx,pdf解析
This commit is contained in:
parent
12c1b5f75e
commit
ee95f1daa7
|
|
@ -13,8 +13,10 @@ RUN apt-get update && apt-get install -y \
|
|||
# Copy requirements first to leverage Docker cache
|
||||
COPY requirements.txt .
|
||||
RUN pip install huggingface_hub
|
||||
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||
RUN python download_models_hf.py
|
||||
# RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||
# RUN wget https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.1/scripts/download_models_hf.py -O download_models_hf.py
|
||||
|
||||
# RUN python download_models_hf.py
|
||||
|
||||
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@ from typing import Optional
|
|||
from .document_processor import DocumentProcessor
|
||||
from .processors import (
|
||||
TxtDocumentProcessor,
|
||||
DocxDocumentProcessor,
|
||||
PdfDocumentProcessor,
|
||||
# DocxDocumentProcessor,
|
||||
# PdfDocumentProcessor,
|
||||
MarkdownDocumentProcessor
|
||||
)
|
||||
|
||||
|
|
@ -15,9 +15,9 @@ class DocumentProcessorFactory:
|
|||
|
||||
processors = {
|
||||
'.txt': TxtDocumentProcessor,
|
||||
'.docx': DocxDocumentProcessor,
|
||||
'.doc': DocxDocumentProcessor,
|
||||
'.pdf': PdfDocumentProcessor,
|
||||
# '.docx': DocxDocumentProcessor,
|
||||
# '.doc': DocxDocumentProcessor,
|
||||
# '.pdf': PdfDocumentProcessor,
|
||||
'.md': MarkdownDocumentProcessor,
|
||||
'.markdown': MarkdownDocumentProcessor
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
||||
from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
|
||||
import logging
|
||||
import json
|
||||
from ..services.ollama_client import OllamaClient
|
||||
|
|
@ -77,7 +77,7 @@ class DocumentProcessor(ABC):
|
|||
"""Build mapping for a single chunk of text with retry logic"""
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
formatted_prompt = get_masking_mapping_prompt(chunk)
|
||||
formatted_prompt = get_ner_name_prompt(chunk)
|
||||
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
||||
response = self.ollama_client.generate(formatted_prompt)
|
||||
logger.info(f"Raw response from LLM: {response}")
|
||||
|
|
@ -175,16 +175,17 @@ class DocumentProcessor(ABC):
|
|||
for i, chunk in enumerate(chunks):
|
||||
logger.info(f"Processing chunk {i+1}/{len(chunks)}")
|
||||
chunk_mapping = self._build_mapping(chunk)
|
||||
if chunk_mapping: # Only update if we got a valid mapping
|
||||
combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
||||
else:
|
||||
logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
||||
# if chunk_mapping: # Only update if we got a valid mapping
|
||||
# combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
||||
# else:
|
||||
# logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
||||
|
||||
# Apply the combined mapping to the entire content
|
||||
masked_content = self._apply_mapping(content, combined_mapping)
|
||||
# masked_content = self._apply_mapping(content, combined_mapping)
|
||||
logger.info("Successfully masked content")
|
||||
|
||||
return masked_content
|
||||
# return masked_content
|
||||
return ""
|
||||
|
||||
@abstractmethod
|
||||
def save_content(self, content: str) -> None:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from .txt_processor import TxtDocumentProcessor
|
||||
from .docx_processor import DocxDocumentProcessor
|
||||
from .pdf_processor import PdfDocumentProcessor
|
||||
# from .docx_processor import DocxDocumentProcessor
|
||||
# from .pdf_processor import PdfDocumentProcessor
|
||||
from .md_processor import MarkdownDocumentProcessor
|
||||
|
||||
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
__all__ = ['TxtDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||
Loading…
Reference in New Issue