feature-ner-keyword-detect #1
19
.env.example
19
.env.example
|
|
@ -1,19 +0,0 @@
|
||||||
# Storage paths
|
|
||||||
OBJECT_STORAGE_PATH=/path/to/mounted/object/storage
|
|
||||||
TARGET_DIRECTORY_PATH=/path/to/target/directory
|
|
||||||
|
|
||||||
# Ollama API Configuration
|
|
||||||
OLLAMA_API_URL=https://api.ollama.com
|
|
||||||
OLLAMA_API_KEY=your_api_key_here
|
|
||||||
OLLAMA_MODEL=llama2
|
|
||||||
|
|
||||||
# Application Settings
|
|
||||||
MONITOR_INTERVAL=5
|
|
||||||
|
|
||||||
# Logging Configuration
|
|
||||||
LOG_LEVEL=INFO
|
|
||||||
LOG_FILE=app.log
|
|
||||||
|
|
||||||
# Optional: Additional security settings
|
|
||||||
# MAX_FILE_SIZE=10485760 # 10MB in bytes
|
|
||||||
# ALLOWED_FILE_TYPES=.txt,.doc,.docx,.pdf
|
|
||||||
48
Dockerfile
48
Dockerfile
|
|
@ -1,48 +0,0 @@
|
||||||
# Build stage
|
|
||||||
FROM python:3.12-slim AS builder
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Install build dependencies
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --no-install-recommends \
|
|
||||||
build-essential \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Copy requirements first to leverage Docker cache
|
|
||||||
COPY requirements.txt .
|
|
||||||
RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
|
|
||||||
|
|
||||||
# Final stage
|
|
||||||
FROM python:3.12-slim
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Create non-root user
|
|
||||||
RUN useradd -m -r appuser && \
|
|
||||||
chown appuser:appuser /app
|
|
||||||
|
|
||||||
# Copy wheels from builder
|
|
||||||
COPY --from=builder /app/wheels /wheels
|
|
||||||
COPY --from=builder /app/requirements.txt .
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
RUN pip install --no-cache /wheels/*
|
|
||||||
|
|
||||||
# Copy application code
|
|
||||||
COPY src/ ./src/
|
|
||||||
|
|
||||||
# Create directories for mounted volumes
|
|
||||||
RUN mkdir -p /data/input /data/output && \
|
|
||||||
chown -R appuser:appuser /data
|
|
||||||
|
|
||||||
# Switch to non-root user
|
|
||||||
USER appuser
|
|
||||||
|
|
||||||
# Environment variables
|
|
||||||
ENV PYTHONPATH=/app \
|
|
||||||
OBJECT_STORAGE_PATH=/data/input \
|
|
||||||
TARGET_DIRECTORY_PATH=/data/output
|
|
||||||
|
|
||||||
# Run the application
|
|
||||||
CMD ["python", "src/main.py"]
|
|
||||||
|
|
@ -8,12 +8,32 @@ from ...core.config import settings
|
||||||
from ..utils.json_extractor import LLMJsonExtractor
|
from ..utils.json_extractor import LLMJsonExtractor
|
||||||
import re
|
import re
|
||||||
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
||||||
|
from jsonschema import validate, ValidationError # pip install jsonschema
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DocumentProcessor(ABC):
|
class DocumentProcessor(ABC):
|
||||||
|
# JSON Schema for mapping validation
|
||||||
|
mapping_schema = {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"entities": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {"type": "string"},
|
||||||
|
"type": {"type": "string"}
|
||||||
|
},
|
||||||
|
"required": ["text", "type"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["entities"]
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||||
self.max_chunk_size = 1000 # Maximum number of characters per chunk
|
self.max_chunk_size = 1000 # Maximum number of characters per chunk
|
||||||
|
|
@ -51,30 +71,15 @@ class DocumentProcessor(ABC):
|
||||||
|
|
||||||
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
||||||
"""
|
"""
|
||||||
Validate that the mapping follows the required format:
|
Validate that the mapping follows the required JSON schema format.
|
||||||
{
|
|
||||||
"原文1": "脱敏后1",
|
|
||||||
"原文2": "脱敏后2",
|
|
||||||
...
|
|
||||||
}
|
|
||||||
"""
|
"""
|
||||||
if not isinstance(mapping, dict):
|
try:
|
||||||
logger.warning("Mapping is not a dictionary")
|
validate(instance=mapping, schema=self.mapping_schema)
|
||||||
|
return True
|
||||||
|
except ValidationError as e:
|
||||||
|
logger.warning(f"Mapping validation error: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Check if any key or value is not a string
|
|
||||||
for key, value in mapping.items():
|
|
||||||
if not isinstance(key, str) or not isinstance(value, str):
|
|
||||||
logger.warning(f"Invalid mapping format - key or value is not a string: {key}: {value}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check if the mapping has any nested structures
|
|
||||||
if any(isinstance(v, (dict, list)) for v in mapping.values()):
|
|
||||||
logger.warning("Invalid mapping format - contains nested structures")
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
||||||
"""Process a single entity type with retry logic"""
|
"""Process a single entity type with retry logic"""
|
||||||
for attempt in range(self.max_retries):
|
for attempt in range(self.max_retries):
|
||||||
|
|
|
||||||
|
|
@ -29,3 +29,4 @@ python-docx>=0.8.11
|
||||||
PyPDF2>=3.0.0
|
PyPDF2>=3.0.0
|
||||||
pandas>=2.0.0
|
pandas>=2.0.0
|
||||||
magic-pdf[full]
|
magic-pdf[full]
|
||||||
|
jsonschema>=4.20.0
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
# Base dependencies
|
|
||||||
pydantic-settings>=2.0.0
|
|
||||||
python-dotenv==1.0.0
|
|
||||||
watchdog==2.1.6
|
|
||||||
requests==2.28.1
|
|
||||||
|
|
||||||
# Document processing
|
|
||||||
python-docx>=0.8.11
|
|
||||||
PyPDF2>=3.0.0
|
|
||||||
pandas>=2.0.0
|
|
||||||
magic-pdf[full]
|
|
||||||
Loading…
Reference in New Issue