WIP
This commit is contained in:
parent
a949902367
commit
1cf3c45cee
19
.env.example
19
.env.example
|
|
@ -1,19 +0,0 @@
|
|||
# Storage paths
|
||||
OBJECT_STORAGE_PATH=/path/to/mounted/object/storage
|
||||
TARGET_DIRECTORY_PATH=/path/to/target/directory
|
||||
|
||||
# Ollama API Configuration
|
||||
OLLAMA_API_URL=https://api.ollama.com
|
||||
OLLAMA_API_KEY=your_api_key_here
|
||||
OLLAMA_MODEL=llama2
|
||||
|
||||
# Application Settings
|
||||
MONITOR_INTERVAL=5
|
||||
|
||||
# Logging Configuration
|
||||
LOG_LEVEL=INFO
|
||||
LOG_FILE=app.log
|
||||
|
||||
# Optional: Additional security settings
|
||||
# MAX_FILE_SIZE=10485760 # 10 MiB (10 * 1024 * 1024 bytes)
|
||||
# ALLOWED_FILE_TYPES=.txt,.doc,.docx,.pdf
|
||||
48
Dockerfile
48
Dockerfile
|
|
@ -1,48 +0,0 @@
|
|||
# Build stage
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy requirements first to leverage Docker cache
|
||||
COPY requirements.txt .
|
||||
RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
|
||||
|
||||
# Final stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Create non-root user
|
||||
RUN useradd -m -r appuser && \
|
||||
chown appuser:appuser /app
|
||||
|
||||
# Copy wheels from builder
|
||||
COPY --from=builder /app/wheels /wheels
|
||||
COPY --from=builder /app/requirements.txt .
|
||||
|
||||
# Install dependencies
|
||||
RUN pip install --no-cache /wheels/*
|
||||
|
||||
# Copy application code
|
||||
COPY src/ ./src/
|
||||
|
||||
# Create directories for mounted volumes
|
||||
RUN mkdir -p /data/input /data/output && \
|
||||
chown -R appuser:appuser /data
|
||||
|
||||
# Switch to non-root user
|
||||
USER appuser
|
||||
|
||||
# Environment variables
|
||||
ENV PYTHONPATH=/app \
|
||||
OBJECT_STORAGE_PATH=/data/input \
|
||||
TARGET_DIRECTORY_PATH=/data/output
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "src/main.py"]
|
||||
|
|
@ -8,12 +8,32 @@ from ...core.config import settings
|
|||
from ..utils.json_extractor import LLMJsonExtractor
|
||||
import re
|
||||
from .regs.entity_regex import extract_id_number_entities, extract_social_credit_code_entities
|
||||
from jsonschema import validate, ValidationError # pip install jsonschema
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor(ABC):
|
||||
# JSON Schema for mapping validation
|
||||
mapping_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"entities": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {"type": "string"},
|
||||
"type": {"type": "string"}
|
||||
},
|
||||
"required": ["text", "type"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["entities"]
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||
self.max_chunk_size = 1000 # Maximum number of characters per chunk
|
||||
|
|
@ -51,29 +71,14 @@ class DocumentProcessor(ABC):
|
|||
|
||||
def _validate_mapping_format(self, mapping: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Validate that the mapping follows the required format:
|
||||
{
|
||||
"原文1": "脱敏后1",
|
||||
"原文2": "脱敏后2",
|
||||
...
|
||||
}
|
||||
Validate that the mapping follows the required JSON schema format.
|
||||
"""
|
||||
if not isinstance(mapping, dict):
|
||||
logger.warning("Mapping is not a dictionary")
|
||||
try:
|
||||
validate(instance=mapping, schema=self.mapping_schema)
|
||||
return True
|
||||
except ValidationError as e:
|
||||
logger.warning(f"Mapping validation error: {e}")
|
||||
return False
|
||||
|
||||
# Check if any key or value is not a string
|
||||
for key, value in mapping.items():
|
||||
if not isinstance(key, str) or not isinstance(value, str):
|
||||
logger.warning(f"Invalid mapping format - key or value is not a string: {key}: {value}")
|
||||
return False
|
||||
|
||||
# Check if the mapping has any nested structures
|
||||
if any(isinstance(v, (dict, list)) for v in mapping.values()):
|
||||
logger.warning("Invalid mapping format - contains nested structures")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _process_entity_type(self, chunk: str, prompt_func, entity_type: str) -> Dict[str, str]:
|
||||
"""Process a single entity type with retry logic"""
|
||||
|
|
|
|||
|
|
@ -29,3 +29,4 @@ python-docx>=0.8.11
|
|||
PyPDF2>=3.0.0
|
||||
pandas>=2.0.0
|
||||
magic-pdf[full]
|
||||
jsonschema>=4.20.0
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
# Base dependencies
|
||||
pydantic-settings>=2.0.0
|
||||
python-dotenv==1.0.0
|
||||
watchdog==2.1.6
|
||||
requests==2.28.1
|
||||
|
||||
# Document processing
|
||||
python-docx>=0.8.11
|
||||
PyPDF2>=3.0.0
|
||||
pandas>=2.0.0
|
||||
magic-pdf[full]
|
||||
Loading…
Reference in New Issue