This commit is contained in:
oliviamn 2025-07-06 21:11:23 +08:00
parent a949902367
commit 1cf3c45cee
5 changed files with 27 additions and 99 deletions

View File

@ -1,19 +0,0 @@
# Storage paths
OBJECT_STORAGE_PATH=/path/to/mounted/object/storage
TARGET_DIRECTORY_PATH=/path/to/target/directory
# Ollama API Configuration
OLLAMA_API_URL=https://api.ollama.com
OLLAMA_API_KEY=your_api_key_here
OLLAMA_MODEL=llama2
# Application Settings
MONITOR_INTERVAL=5
# Logging Configuration
LOG_LEVEL=INFO
LOG_FILE=app.log
# Optional: Additional security settings
# MAX_FILE_SIZE=10485760 # 10MB in bytes
# ALLOWED_FILE_TYPES=.txt,.doc,.docx,.pdf

View File

@ -1,48 +0,0 @@
# Build stage: compile wheels for every requirement so the final image can
# install them without a compiler.
FROM python:3.12-slim AS builder
WORKDIR /app
# Install build dependencies (toolchain for packages without prebuilt wheels);
# the apt lists are removed in the same layer to keep it small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
# Build wheels for the requirements AND their transitive dependencies.
# (With --no-deps the wheel directory would be incomplete and the final stage,
# which has no compiler, would have to download/build the rest itself.)
RUN pip wheel --no-cache-dir --wheel-dir /app/wheels -r requirements.txt
# Final stage: runtime image without the build toolchain
FROM python:3.12-slim
WORKDIR /app
# Create non-root user that owns the application directory
RUN useradd -m -r appuser && \
    chown appuser:appuser /app
# Copy wheels and the requirements list from builder
COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .
# Install dependencies strictly from the prebuilt wheels (--no-index forbids
# falling back to the package index, so a missing wheel fails the build loudly).
RUN pip install --no-cache-dir --no-index --find-links=/wheels -r requirements.txt
# Copy application code
COPY src/ ./src/
# Create directories for mounted volumes and hand them to the non-root user
RUN mkdir -p /data/input /data/output && \
    chown -R appuser:appuser /data
# Switch to non-root user
USER appuser
# Environment variables: default storage paths point at the volume directories
ENV PYTHONPATH=/app \
    OBJECT_STORAGE_PATH=/data/input \
    TARGET_DIRECTORY_PATH=/data/output
# Run the application
CMD ["python", "src/main.py"]

View File

@ -8,12 +8,32 @@ from ...core.config import settings
from ..utils.json_extractor import LLMJsonExtractor
import re
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
from jsonschema import validate, ValidationError # pip install jsonschema
logger = logging.getLogger(__name__)
class DocumentProcessor(ABC):
# JSON Schema for mapping validation.
# Passed as `schema` to jsonschema's `validate` in `_validate_mapping_format`.
# A valid mapping is an object with a required "entities" array; every item of
# that array must be an object carrying string-valued "text" and "type" keys.
# No `additionalProperties` restriction is declared, so extra keys are accepted.
mapping_schema = {
"type": "object",
"properties": {
"entities": {
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {"type": "string"},
"type": {"type": "string"}
},
"required": ["text", "type"]
}
}
},
"required": ["entities"]
}
def __init__(self):
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
self.max_chunk_size = 1000 # Maximum number of characters per chunk
@ -51,29 +71,14 @@ class DocumentProcessor(ABC):
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
    """Check that *mapping* conforms to ``self.mapping_schema``.

    The schema expects an object of the form
    ``{"entities": [{"text": "...", "type": "..."}, ...]}``.

    Args:
        mapping: The parsed mapping to validate.

    Returns:
        True when the mapping satisfies the schema; False otherwise. A
        validation failure is logged as a warning instead of being raised.
    """
    # Schema validation supersedes the former flat "original -> masked" string
    # checks; those would reject the nested "entities" list the schema requires.
    try:
        validate(instance=mapping, schema=self.mapping_schema)
    except ValidationError as e:
        logger.warning("Mapping validation error: %s", e)
        return False
    return True
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
"""Process a single entity type with retry logic"""

View File

@ -29,3 +29,4 @@ python-docx>=0.8.11
PyPDF2>=3.0.0
pandas>=2.0.0
magic-pdf[full]
jsonschema>=4.20.0

View File

@ -1,11 +0,0 @@
# Base dependencies
pydantic-settings>=2.0.0
python-dotenv==1.0.0
watchdog==2.1.6
requests==2.28.1
# Document processing
python-docx>=0.8.11
PyPDF2>=3.0.0
pandas>=2.0.0
magic-pdf[full]