Compare commits

...

2 Commits

Author SHA1 Message Date
oliviamn 5ddef90e8b feat: run NER on names separately 2025-06-25 01:31:12 +08:00
oliviamn ee95f1daa7 WIP: temporarily disable docx/pdf parsing 2025-06-25 01:30:43 +08:00
7 changed files with 104 additions and 18 deletions

View File

@@ -13,8 +13,10 @@ RUN apt-get update && apt-get install -y \
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install huggingface_hub
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
RUN python download_models_hf.py
# RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
# RUN wget https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.1/scripts/download_models_hf.py -O download_models_hf.py
# RUN python download_models_hf.py
RUN pip install --no-cache-dir -r requirements.txt
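
This hunk keeps the huggingface_hub install but disables the MinerU model-download script. If the models need to be baked into the image again, one option is to fetch them directly with huggingface_hub instead of the wget'd script; a minimal sketch, where the repo id and target directory are placeholders rather than values taken from this repo:

# fetch_models.py -- hypothetical build-time helper, not part of this commit
from huggingface_hub import snapshot_download

# Pull a model snapshot into the image at build time.
snapshot_download(
    repo_id="opendatalab/PDF-Extract-Kit-1.0",  # placeholder repo id
    local_dir="/opt/models",                    # placeholder target directory
)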

View File

@@ -3,8 +3,8 @@ from typing import Optional
from .document_processor import DocumentProcessor
from .processors import (
    TxtDocumentProcessor,
    DocxDocumentProcessor,
    PdfDocumentProcessor,
    # DocxDocumentProcessor,
    # PdfDocumentProcessor,
    MarkdownDocumentProcessor
)
@@ -15,9 +15,9 @@ class DocumentProcessorFactory:
        processors = {
            '.txt': TxtDocumentProcessor,
            '.docx': DocxDocumentProcessor,
            '.doc': DocxDocumentProcessor,
            '.pdf': PdfDocumentProcessor,
            # '.docx': DocxDocumentProcessor,
            # '.doc': DocxDocumentProcessor,
            # '.pdf': PdfDocumentProcessor,
            '.md': MarkdownDocumentProcessor,
            '.markdown': MarkdownDocumentProcessor
        }
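
With the docx/doc/pdf entries commented out, only .txt, .md and .markdown still resolve to a processor class. The snippet below is a rough illustration of the resulting behaviour, using a hypothetical lookup function rather than the factory's own method, whose name and error handling are not shown in this diff:

import os

from .processors import TxtDocumentProcessor, MarkdownDocumentProcessor

# Hypothetical stand-in mirroring the 'processors' dict above.
def resolve_processor_class(path: str):
    processors = {
        '.txt': TxtDocumentProcessor,
        '.md': MarkdownDocumentProcessor,
        '.markdown': MarkdownDocumentProcessor,
    }
    ext = os.path.splitext(path)[1].lower()
    cls = processors.get(ext)
    if cls is None:
        # .docx, .doc and .pdf now fall through here while their parsers are disabled
        raise ValueError(f"Unsupported file type: {ext}")
    return cls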

View File

@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from typing import Any, Dict
from ..prompts.masking_prompts import get_masking_mapping_prompt
from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
import logging
import json
from ..services.ollama_client import OllamaClient
@@ -77,7 +77,7 @@ class DocumentProcessor(ABC):
"""Build mapping for a single chunk of text with retry logic"""
for attempt in range(self.max_retries):
try:
formatted_prompt = get_masking_mapping_prompt(chunk)
formatted_prompt = get_ner_name_prompt(chunk)
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
response = self.ollama_client.generate(formatted_prompt)
logger.info(f"Raw response from LLM: {response}")
@@ -175,16 +175,17 @@
        for i, chunk in enumerate(chunks):
            logger.info(f"Processing chunk {i+1}/{len(chunks)}")
            chunk_mapping = self._build_mapping(chunk)
            if chunk_mapping:  # Only update if we got a valid mapping
                combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
            else:
                logger.warning(f"Failed to generate mapping for chunk {i+1}")
            # if chunk_mapping:  # Only update if we got a valid mapping
            #     combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
            # else:
            #     logger.warning(f"Failed to generate mapping for chunk {i+1}")
        # Apply the combined mapping to the entire content
        masked_content = self._apply_mapping(content, combined_mapping)
        # masked_content = self._apply_mapping(content, combined_mapping)
        logger.info("Successfully masked content")
        return masked_content
        # return masked_content
        return ""

    @abstractmethod
    def save_content(self, content: str) -> None:
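
As committed, the per-chunk mappings are still generated (now from the NER prompt), but they are no longer merged or applied, and the method returns an empty string. When the apply step is re-enabled, a plain string-replacement version of it could look like the sketch below; this is an illustration only, not the class's actual _apply_mapping:

# Illustrative only: substitute each original value with its masked form.
def apply_mapping(content: str, mapping: dict) -> str:
    # Replace longer keys first so shorter substrings do not clobber them.
    for original in sorted(mapping, key=len, reverse=True):
        content = content.replace(original, mapping[original])
    return content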

View File

@@ -1,6 +1,7 @@
from .txt_processor import TxtDocumentProcessor
from .docx_processor import DocxDocumentProcessor
from .pdf_processor import PdfDocumentProcessor
# from .docx_processor import DocxDocumentProcessor
# from .pdf_processor import PdfDocumentProcessor
from .md_processor import MarkdownDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
__all__ = ['TxtDocumentProcessor', 'MarkdownDocumentProcessor']

View File

@@ -78,4 +78,86 @@ def get_masking_mapping_prompt(text: str) -> str:
    {{}}
    """)
    return prompt.format(text=text)

def get_masking_mapping_prompt_v2(text: str) -> str:
    """
    Returns a prompt that generates a mapping of original names/companies to their masked versions.

    Args:
        text (str): The input text to be analyzed for masking

    Returns:
        str: The formatted prompt that will generate a mapping dictionary
    """
    prompt = textwrap.dedent("""
    Analyze the given text according to the masking rules below and generate a masking map in JSON format.

    Masking rules:
    1. Person names: keep the surname and replace the given name with "某"; people sharing a surname are numbered "某1", "某2", and so on. Names of lawyers and adjudicators are not masked.
    2. Company names: replace with an uppercase letter, e.g. "A公司", "B公司". Law firm names are not masked.
    3. Addresses: keep only the district level and above; remove the detailed location.
    4. English person names: keep the initial letters and replace the rest with "***".
    5. English company names: replace with the company's industry name in uppercase English.
    6. Project names: replace with a lowercase letter, e.g. "a项目", "b项目".
    7. Case numbers: replace the specific number portion with "** *".
    8. ID card numbers: replace with six "X" characters.
    9. Unified Social Credit Codes: replace with eight "X" characters.

    Input text:
    {text}

    Output requirements:
    Generate a JSON map with the following structure:
    {{
        "original 1": "masked 1",
        "original 2": "masked 2",
        ...
    }}
    If there is nothing to map, output an empty JSON object:
    {{}}

    Notes:
    - Identify every category of sensitive information in the text accurately.
    - List all entities belonging to the same category.
    - A category may be omitted if the text contains no such information.
    - Make sure the masked output follows the rules above.
    """)
    return prompt.format(text=text)
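
As a concrete example of what this prompt asks for: under the rules above, a chunk mentioning the person 张三, the fictional company 甲乙科技有限公司 and an 18-digit ID number should yield a map shaped like this (values are illustrative only):

# Expected shape of the model's reply for one illustrative chunk.
expected_mapping = {
    "张三": "张某",                   # rule 1: keep surname, given name becomes 某
    "甲乙科技有限公司": "A公司",       # rule 2: company replaced with a letter
    "110101199001011234": "XXXXXX",  # rule 8: ID number becomes six X characters
}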

def get_ner_name_prompt(text: str) -> str:
    """
    Returns a prompt that extracts the person-name entities to be masked from the given text.

    Args:
        text (str): The input text to be analyzed for masking

    Returns:
        str: The formatted prompt that will generate a JSON list of entities
    """
    prompt = textwrap.dedent("""
    You are a professional legal-text entity recognition assistant. Extract every piece of sensitive information that needs to be masked from the text below and classify it into the specified categories. Output the result strictly in JSON format.

    Entity categories:
    - Person names (excluding lawyers, judges, court clerks, prosecutors and other public officials)
    - English person names

    Text to process:
    {text}

    Output format:
    {{
        "entities": [
            {{"text": "original text", "type": "entity category"}},
            ...
        ]
    }}

    Output the result strictly in JSON format.
    """)
    return prompt.format(text=text)
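
A rough sketch of how this prompt is consumed by _build_mapping upstream; the parsing here is simplified and omits the retry and logging logic shown in the document-processor hunk:

import json

def extract_name_entities(ollama_client, chunk: str) -> list:
    # Return the [{'text': ..., 'type': ...}] entities the model found in one chunk.
    prompt = get_ner_name_prompt(chunk)
    response = ollama_client.generate(prompt)  # raw LLM reply, expected to be JSON
    return json.loads(response).get("entities", [])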