Enhance document processing with Ollama integration and update .gitignore

- Added OllamaClient for document processing in TxtDocumentProcessor.
- Updated process_content method to use Ollama API for content masking.
- Refactored FileMonitor to utilize DocumentService with OllamaClient.
- Removed unnecessary log files and Python cache files.
- Added test file for document processing validation.
2025-04-23 01:09:33 +08:00 · 2025-04-23 01:09:33 +08:00 · 592fb66f40
parent fc68c243bb
commit 592fb66f40
12 changed files with 165 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -62,4 +62,9 @@ temp/
 .env.local
 .env.development.local
 .env.test.local
-.env.production.local
+.env.production.local
 src_folder
 target_folder
 app.log
 __pycache__
--- a/app.log
+++ b/app.log
@ -1 +0,0 @@
 2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md
--- a/src/config/pycache/logging_config.cpython-311.pyc
+++ b/src/config/pycache/logging_config.cpython-311.pyc
--- a/src/config/pycache/logging_config.cpython-312.pyc
+++ b/src/config/pycache/logging_config.cpython-312.pyc
--- a/src/config/pycache/settings.cpython-311.pyc
+++ b/src/config/pycache/settings.cpython-311.pyc
--- a/src/config/pycache/settings.cpython-312.pyc
+++ b/src/config/pycache/settings.cpython-312.pyc
--- a/src/models/processors/txt_processor.py
+++ b/src/models/processors/txt_processor.py
@ -1,17 +1,45 @@
 from models.document_processor import DocumentProcessor
 from services.ollama_client import OllamaClient
 import textwrap
 import logging
 from config.settings import settings
 logger = logging.getLogger(__name__)
 class TxtDocumentProcessor(DocumentProcessor):
    def __init__(self, input_path: str, output_path: str):
        """Remember the source/destination paths and build the Ollama client.

        Args:
            input_path: Path of the text file to read.
            output_path: Path the processed text is saved to.
        """
        self.input_path, self.output_path = input_path, output_path
        # Model name and endpoint both come from the shared settings object.
        client_options = {
            "model_name": settings.OLLAMA_MODEL,
            "base_url": settings.OLLAMA_API_URL,
        }
        self.ollama_client = OllamaClient(**client_options)
    def read_content(self) -> str:
        """Return the entire input file as text, decoded as UTF-8."""
        with open(self.input_path, 'r', encoding='utf-8') as source:
            text = source.read()
        return text
    def process_content(self, content: str) -> str:
        """Mask names in ``content`` by prompting the Ollama model.

        The text is embedded in a Chinese-language prompt that asks the model
        to anonymize person names and company names and to output only the
        masked text. The model's reply is returned unchanged.

        Args:
            content: Text to be masked.

        Returns:
            The string returned by ``self.ollama_client.generate``.
        """
-        # Implementation for processing text content
+        prompt = textwrap.dedent("""
-        return content
+            您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理：
            规则：
            1. 人名：
               - 两字名改为"姓+某"（如：张三 → 张某）
               - 三字名改为"姓+某某"（如：张三丰 → 张某某）
            2. 公司名：
               - 保留地理位置信息（如：北京、上海等）
               - 保留公司类型（如：有限公司、股份公司等）
               - 用"某"替换核心名称
            3. 保持原文其他部分不变
            4. 确保脱敏后的文本保持原有的语言流畅性和可读性
            输入文本：
            {text}
            请直接输出脱敏后的文本，无需解释或其他备注。
        """)
        # {text} is the template's only placeholder; braces inside `content`
        # itself are inserted verbatim and are not interpreted by format().
        formatted_prompt = prompt.format(text=content)
        response = self.ollama_client.generate(formatted_prompt)
        logger.debug(f"Processed content: {response}")
        return response
    def save_content(self, content: str) -> None:
        with open(self.output_path, 'w', encoding='utf-8') as file:
--- a/src/services/pycache/file_monitor.cpython-312.pyc
+++ b/src/services/pycache/file_monitor.cpython-312.pyc
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@ -19,10 +19,10 @@ class DocumentService:
            content = processor.read_content()
            # Process with Ollama
-            processed_content = self.ollama_client.process_document(content)
+            masked_content = processor.process_content(content)
            # Save processed content
-            processor.save_content(processed_content)
+            processor.save_content(masked_content)
            return True
        except Exception as e:
--- a/src/services/file_monitor.py
+++ b/src/services/file_monitor.py
@ -1,24 +1,54 @@
 import logging
 import os
 from services.document_service import DocumentService
 from services.ollama_client import OllamaClient
 from config.settings import settings
 logger = logging.getLogger(__name__)
 class FileMonitor:
-    def __init__(self, directory, callback):
+    def __init__(self, input_directory: str, output_directory: str):
        """Store the watched directories and build the document service.

        Args:
            input_directory: Directory that is polled for new files.
            output_directory: Directory used to build each processed file's
                output path.
        """
-        self.directory = directory
+        self.input_directory = input_directory
-        self.callback = callback
+        self.output_directory = output_directory
        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)
    def process_new_file(self, file_path: str) -> None:
        """Process one input file into the output directory.

        The output file keeps the input's base name and is placed under
        ``self.output_directory``. Failures are logged and never raised, so
        a single bad file does not interrupt the caller.

        Args:
            file_path: Path of the file to process.
        """
        try:
            # Get the filename without directory path
            filename = os.path.basename(file_path)
            # Create output path
            output_path = os.path.join(self.output_directory, filename)
            logger.info(f"Processing file: {filename}")
            # The document service reports its outcome through the return
            # value (True on success), so success must not be logged
            # unconditionally.
            succeeded = self.document_service.process_document(file_path, output_path)
            if succeeded:
                logger.info(f"File processed successfully: {filename}")
            else:
                logger.error(
                    f"Error processing file {file_path}: document service reported failure"
                )
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")
    def start_monitoring(self):
        """Poll the input directory once per second and process new files.

        Entries already present when monitoring starts are treated as seen
        and are not processed. The loop has no exit condition, so this call
        blocks until the process is interrupted.
        """
        import time
-        import os
+        
-
+        # Ensure output directory exists
-        already_seen = set(os.listdir(self.directory))
+        os.makedirs(self.output_directory, exist_ok=True)
        already_seen = set(os.listdir(self.input_directory))
        while True:
            time.sleep(1)  # Check every second
-            current_files = set(os.listdir(self.directory))
+            current_files = set(os.listdir(self.input_directory))
            # Anything listed now that was absent from the previous snapshot.
            new_files = current_files - already_seen
            for new_file in new_files:
-                logger.info(f"monitor: new file found: {new_file}")
+                file_path = os.path.join(self.input_directory, new_file)
-                self.callback(os.path.join(self.directory, new_file))
+                logger.info(f"New file found: {new_file}")
                self.process_new_file(file_path)
            # The snapshot is replaced, not merged: a name that disappears and
            # later reappears is detected as new again.
            already_seen = current_files
--- a/src/services/ollama_client.py
+++ b/src/services/ollama_client.py
@ -1,15 +1,91 @@
 import requests
 import logging
 from typing import Dict, Any
 logger = logging.getLogger(__name__)
 class OllamaClient:
-    def __init__(self, model_name):
+    def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
        """Initialize Ollama client.

        Args:
            model_name (str): Name of the Ollama model to use
            base_url (str): Base URL of the Ollama server; endpoint paths
                such as "/api/generate" are appended directly to it
        """
        self.model_name = model_name
        self.base_url = base_url
        # Sent with every request made by this client.
        self.headers = {"Content-Type": "application/json"}
-    def process_document(self, document_text):
+    def generate(self, prompt: str, strip_think: bool = True) -> str:
-        # Here you would implement the logic to interact with the Ollama API
+        """Send a prompt to the Ollama generate endpoint and return the reply.
-        # and process the document text using the specified model.
+        
-        # This is a placeholder for the actual API call.
+        Args:
-        processed_text = self._mock_api_call(document_text)
+            prompt (str): The prompt text sent to the model
-        return processed_text
+            strip_think (bool): If True, return only the text after the model's </think> tag, whitespace-stripped
        Returns:
            str: Processed text response from the model
        Raises:
            RequestException: If the API call fails
        """
        try:
            url = f"{self.base_url}/api/generate"
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False
            }
            logger.debug(f"Sending request to Ollama API: {url}")
            # NOTE(review): no timeout is passed, so requests will wait
            # indefinitely on an unresponsive server -- confirm this is
            # intended for long-running generations.
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            result = response.json()
            logger.debug(f"Received response from Ollama API: {result}")
            if strip_think:
                # Remove the "thinking" part from the response
                # the response is expected to be <think>...</think>response_text
                # Check if the response contains <think> tag
                if "<think>" in result.get("response", ""):
                    # Split the response and take the part after </think>
                    # NOTE(review): split() has no maxsplit, so any text that
                    # follows a second "</think>" would be discarded.
                    response_parts = result["response"].split("</think>")
                    if len(response_parts) > 1:
                        # Return the part after </think>
                        return response_parts[1].strip()
                    else:
                        # If no closing tag, return the full response
                        return result.get("response", "").strip()
                else:
                    # If no <think> tag, return the full response
                    return result.get("response", "").strip()
            else:
                # If strip_think is False, return the full response
                return result.get("response", "")
-    def _mock_api_call(self, document_text):
+            
-        # Mock processing: In a real implementation, this would call the Ollama API.
+        except requests.exceptions.RequestException as e:
-        # For now, it just returns the input text with a note indicating it was processed.
+            logger.error(f"Error calling Ollama API: {str(e)}")
-        return f"Processed with {self.model_name}: {document_text}"
+            raise
    def get_model_info(self, *, timeout: float = 30.0) -> Dict[str, Any]:
        """Get information about the current model.

        Sends a POST request to ``{base_url}/api/show`` carrying the model
        name and returns the decoded JSON body.

        Args:
            timeout: Seconds to wait for the server before giving up.
                ``requests`` applies no timeout by default, which would let
                an unresponsive server block the caller forever.

        Returns:
            Dict[str, Any]: Model information

        Raises:
            RequestException: If the API call fails or times out
        """
        url = f"{self.base_url}/api/show"
        payload = {"name": self.model_name}
        try:
            response = requests.post(
                url,
                json=payload,
                headers=self.headers,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error getting model info: {str(e)}")
            raise
--- a/tests/test.txt
+++ b/tests/test.txt
@ -0,0 +1 @@
 关于张三天和北京易见天树有限公司的劳动纠纷
		`@ -1 +0,0 @@`
			`2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md`
		`@ -0,0 +1 @@`
							`关于张三天和北京易见天树有限公司的劳动纠纷`