Compare commits
No commits in common. "5ddef90e8b82ceb90c7f37275edf2ecae41e8736" and "12c1b5f75ebefaa63ee8e16bd10ce4163a5d48d2" have entirely different histories.
5ddef90e8b
...
12c1b5f75e
|
|
@@ -13,10 +13,8 @@ RUN apt-get update && apt-get install -y \
|
||||||
# Copy requirements first to leverage Docker cache
|
# Copy requirements first to leverage Docker cache
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install huggingface_hub
|
RUN pip install huggingface_hub
|
||||||
# RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
|
||||||
# RUN wget https://raw.githubusercontent.com/opendatalab/MinerU/refs/heads/release-1.3.1/scripts/download_models_hf.py -O download_models_hf.py
|
RUN python download_models_hf.py
|
||||||
|
|
||||||
# RUN python download_models_hf.py
|
|
||||||
|
|
||||||
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
|
||||||
|
|
@@ -3,8 +3,8 @@ from typing import Optional
|
||||||
from .document_processor import DocumentProcessor
|
from .document_processor import DocumentProcessor
|
||||||
from .processors import (
|
from .processors import (
|
||||||
TxtDocumentProcessor,
|
TxtDocumentProcessor,
|
||||||
# DocxDocumentProcessor,
|
DocxDocumentProcessor,
|
||||||
# PdfDocumentProcessor,
|
PdfDocumentProcessor,
|
||||||
MarkdownDocumentProcessor
|
MarkdownDocumentProcessor
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@@ -15,9 +15,9 @@ class DocumentProcessorFactory:
|
||||||
|
|
||||||
processors = {
|
processors = {
|
||||||
'.txt': TxtDocumentProcessor,
|
'.txt': TxtDocumentProcessor,
|
||||||
# '.docx': DocxDocumentProcessor,
|
'.docx': DocxDocumentProcessor,
|
||||||
# '.doc': DocxDocumentProcessor,
|
'.doc': DocxDocumentProcessor,
|
||||||
# '.pdf': PdfDocumentProcessor,
|
'.pdf': PdfDocumentProcessor,
|
||||||
'.md': MarkdownDocumentProcessor,
|
'.md': MarkdownDocumentProcessor,
|
||||||
'.markdown': MarkdownDocumentProcessor
|
'.markdown': MarkdownDocumentProcessor
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -1,6 +1,6 @@
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
from ..prompts.masking_prompts import get_masking_mapping_prompt_v2, get_ner_name_prompt
|
from ..prompts.masking_prompts import get_masking_mapping_prompt
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
from ..services.ollama_client import OllamaClient
|
from ..services.ollama_client import OllamaClient
|
||||||
|
|
@@ -77,7 +77,7 @@ class DocumentProcessor(ABC):
|
||||||
"""Build mapping for a single chunk of text with retry logic"""
|
"""Build mapping for a single chunk of text with retry logic"""
|
||||||
for attempt in range(self.max_retries):
|
for attempt in range(self.max_retries):
|
||||||
try:
|
try:
|
||||||
formatted_prompt = get_ner_name_prompt(chunk)
|
formatted_prompt = get_masking_mapping_prompt(chunk)
|
||||||
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
logger.info(f"Calling ollama to generate mapping for chunk (attempt {attempt + 1}/{self.max_retries}): {formatted_prompt}")
|
||||||
response = self.ollama_client.generate(formatted_prompt)
|
response = self.ollama_client.generate(formatted_prompt)
|
||||||
logger.info(f"Raw response from LLM: {response}")
|
logger.info(f"Raw response from LLM: {response}")
|
||||||
|
|
@@ -175,17 +175,16 @@ class DocumentProcessor(ABC):
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
logger.info(f"Processing chunk {i+1}/{len(chunks)}")
|
logger.info(f"Processing chunk {i+1}/{len(chunks)}")
|
||||||
chunk_mapping = self._build_mapping(chunk)
|
chunk_mapping = self._build_mapping(chunk)
|
||||||
# if chunk_mapping: # Only update if we got a valid mapping
|
if chunk_mapping: # Only update if we got a valid mapping
|
||||||
# combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
combined_mapping = self._merge_mappings(combined_mapping, chunk_mapping)
|
||||||
# else:
|
else:
|
||||||
# logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
logger.warning(f"Failed to generate mapping for chunk {i+1}")
|
||||||
|
|
||||||
# Apply the combined mapping to the entire content
|
# Apply the combined mapping to the entire content
|
||||||
# masked_content = self._apply_mapping(content, combined_mapping)
|
masked_content = self._apply_mapping(content, combined_mapping)
|
||||||
logger.info("Successfully masked content")
|
logger.info("Successfully masked content")
|
||||||
|
|
||||||
# return masked_content
|
return masked_content
|
||||||
return ""
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def save_content(self, content: str) -> None:
|
def save_content(self, content: str) -> None:
|
||||||
|
|
|
||||||
|
|
@@ -1,7 +1,6 @@
|
||||||
from .txt_processor import TxtDocumentProcessor
|
from .txt_processor import TxtDocumentProcessor
|
||||||
# from .docx_processor import DocxDocumentProcessor
|
from .docx_processor import DocxDocumentProcessor
|
||||||
# from .pdf_processor import PdfDocumentProcessor
|
from .pdf_processor import PdfDocumentProcessor
|
||||||
from .md_processor import MarkdownDocumentProcessor
|
from .md_processor import MarkdownDocumentProcessor
|
||||||
|
|
||||||
# __all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor', 'MarkdownDocumentProcessor']
|
||||||
__all__ = ['TxtDocumentProcessor', 'MarkdownDocumentProcessor']
|
|
||||||
|
|
@@ -79,85 +79,3 @@ def get_masking_mapping_prompt(text: str) -> str:
|
||||||
""")
|
""")
|
||||||
|
|
||||||
return prompt.format(text=text)
|
return prompt.format(text=text)
|
||||||
|
|
||||||
def get_masking_mapping_prompt_v2(text: str) -> str:
|
|
||||||
"""
|
|
||||||
Returns a prompt that generates a mapping of original names/companies to their masked versions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to be analyzed for masking
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The formatted prompt that will generate a mapping dictionary
|
|
||||||
"""
|
|
||||||
prompt = textwrap.dedent("""
|
|
||||||
请根据以下脱敏规则,对给定文本进行分析并生成脱敏映射map(JSON格式)。
|
|
||||||
|
|
||||||
脱敏规则说明:
|
|
||||||
|
|
||||||
1.人名:保留姓,名变为"某";同姓者按"某1"、"某2"依次编号;律师姓名、审判人员姓名不脱敏
|
|
||||||
2.公司名:替换为大写英文字母(A公司、B公司等);律师事务所不脱敏
|
|
||||||
3.地址:仅保留区级以上地址,删除详细位置
|
|
||||||
4.英文人名:保留姓名首字母,其余替换为"***"
|
|
||||||
5.英文公司名:替换为所属行业名称的英文大写形式
|
|
||||||
6.项目名:替换为小写英文字母(a项目、b项目等)
|
|
||||||
7.案号:具体案号部分替换为"** *"
|
|
||||||
8.身份证号:替换为6个"X"
|
|
||||||
9.统一社会信用代码:替换为8个"X"
|
|
||||||
输入文本:
|
|
||||||
{text}
|
|
||||||
|
|
||||||
输出要求:
|
|
||||||
请生成一个JSON格式的映射map,包含以下结构:
|
|
||||||
|
|
||||||
{{
|
|
||||||
"原文1": "脱敏后1",
|
|
||||||
"原文2": "脱敏后2",
|
|
||||||
...
|
|
||||||
}}
|
|
||||||
如无需要输出的映射,请输出空json,如下:
|
|
||||||
{{}}
|
|
||||||
注意事项:
|
|
||||||
|
|
||||||
请准确识别文本中的各类敏感信息
|
|
||||||
同一类别的多个实体,请在对应类别下全部列出
|
|
||||||
如果文本中没有某类敏感信息,可以省略该类别
|
|
||||||
请确保脱敏后的文本符合上述规则要求
|
|
||||||
|
|
||||||
|
|
||||||
""")
|
|
||||||
return prompt.format(text=text)
|
|
||||||
|
|
||||||
def get_ner_name_prompt(text: str) -> str:
|
|
||||||
"""
|
|
||||||
Returns a prompt that generates a mapping of original names/companies to their masked versions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to be analyzed for masking
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The formatted prompt that will generate a mapping dictionary
|
|
||||||
"""
|
|
||||||
prompt = textwrap.dedent("""
|
|
||||||
你是一个专业的法律文本实体识别助手。请从以下文本中抽取出所有需要脱敏的敏感信息,并按照指定的类别进行分类。请严格按照JSON格式输出结果。
|
|
||||||
|
|
||||||
实体类别包括:
|
|
||||||
- 人名 (不包括律师、法官、书记员、检察官等公职人员)
|
|
||||||
- 英文人名
|
|
||||||
|
|
||||||
|
|
||||||
待处理文本:
|
|
||||||
{text}
|
|
||||||
|
|
||||||
输出格式:
|
|
||||||
{{
|
|
||||||
"entities": [
|
|
||||||
{{"text": "原始文本内容", "type": "实体类别"}},
|
|
||||||
...
|
|
||||||
]
|
|
||||||
}}
|
|
||||||
|
|
||||||
请严格按照JSON格式输出结果。
|
|
||||||
|
|
||||||
""")
|
|
||||||
return prompt.format(text=text)
|
|
||||||
Loading…
Reference in New Issue