Enhance document processing with Ollama integration and update .gitignore

- Added OllamaClient for document processing in TxtDocumentProcessor.
- Updated process_content method to use Ollama API for content masking.
- Refactored FileMonitor to utilize DocumentService with OllamaClient.
- Removed unnecessary log files and Python cache files.
- Added test file for document processing validation.
This commit is contained in:
tigermren 2025-04-23 01:09:33 +08:00
parent fc68c243bb
commit 592fb66f40
12 changed files with 165 additions and 26 deletions

5
.gitignore vendored
View File

@ -63,3 +63,8 @@ temp/
.env.development.local
.env.test.local
.env.production.local
src_folder
target_folder
app.log
__pycache__

View File

@ -1 +0,0 @@
2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md

View File

@ -1,17 +1,45 @@
from models.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient
import textwrap
import logging
from config.settings import settings
logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):
    """Store the I/O paths and build an Ollama client from global settings."""
    self.input_path = input_path
    self.output_path = output_path
    # Model name and API endpoint both come from the shared settings object.
    self.ollama_client = OllamaClient(
        base_url=settings.OLLAMA_API_URL,
        model_name=settings.OLLAMA_MODEL,
    )
def read_content(self) -> str:
    """Return the full text of the input file, decoded as UTF-8."""
    with open(self.input_path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text
def process_content(self, content: str) -> str:
    """Mask sensitive information in `content` via the Ollama model.

    Builds a Chinese-language anonymization prompt (names and company
    names are replaced per the rules embedded in the prompt), sends it to
    the configured Ollama client, and returns the model's response.

    Args:
        content: Raw document text to anonymize.

    Returns:
        The masked text produced by the model.
    """
    # BUG FIX: a stale placeholder (`return content`) preceded this code
    # and made the entire masking pipeline unreachable, so callers got
    # the input back unmodified. The dead early return is removed.
    #
    # NOTE(review): punctuation inside the prompt looks stripped in places
    # (e.g. the quotes around the masking character on the 公司名 rule) —
    # verify against the original prompt source.
    prompt = textwrap.dedent("""
        您是一位专业的法律文档脱敏专家请按照以下规则对文本进行脱敏处理
        规则
        1. 人名
        - 两字名改为"姓+某"张三 张某
        - 三字名改为"姓+某某"张三丰 张某某
        2. 公司名
        - 保留地理位置信息北京上海等
        - 保留公司类型有限公司股份公司等
        - ""替换核心名称
        3. 保持原文其他部分不变
        4. 确保脱敏后的文本保持原有的语言流畅性和可读性
        输入文本
        {text}
        请直接输出脱敏后的文本无需解释或其他备注
    """)
    formatted_prompt = prompt.format(text=content)
    response = self.ollama_client.generate(formatted_prompt)
    logger.debug(f"Processed content: {response}")
    return response
def save_content(self, content: str) -> None:
with open(self.output_path, 'w', encoding='utf-8') as file:

View File

@ -19,10 +19,10 @@ class DocumentService:
content = processor.read_content()
# Process with Ollama
processed_content = self.ollama_client.process_document(content)
masked_content = processor.process_content(content)
# Save processed content
processor.save_content(processed_content)
processor.save_content(masked_content)
return True
except Exception as e:

View File

@ -1,24 +1,54 @@
import logging
import os
from services.document_service import DocumentService
from services.ollama_client import OllamaClient
from config.settings import settings
logger = logging.getLogger(__name__)
class FileMonitor:
    """Polls an input directory and pushes each newly appearing file
    through the document service, writing results to an output directory.

    BUG FIXES applied to this block:
    - Removed a stale duplicate `__init__(self, directory, callback)` left
      over from an earlier revision; the surviving constructor never set
      `self.directory`/`self.callback`, so the stale statements that still
      referenced them in `start_monitoring` would raise AttributeError.
    - Removed duplicated pre-refactor statements inside `start_monitoring`.
    - The log messages read `"(unknown)"` where a variable clearly belonged
      (extraction damage) — restored the obvious interpolations.
    """

    def __init__(self, input_directory: str, output_directory: str):
        """Wire up a DocumentService backed by an OllamaClient from settings.

        Args:
            input_directory: Directory to watch for new files.
            output_directory: Directory where processed files are written.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory
        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file and write the result under the same name
        in the output directory. Errors are logged, not raised, so the
        monitoring loop keeps running.
        """
        try:
            # Mirror the input filename into the output directory.
            filename = os.path.basename(file_path)
            output_path = os.path.join(self.output_directory, filename)
            logger.info(f"Processing file: {file_path}")
            self.document_service.process_document(file_path, output_path)
            logger.info(f"File processed successfully: {output_path}")
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")

    def start_monitoring(self):
        """Poll the input directory forever (1s interval) and process
        every file not seen before. Blocks the calling thread.
        """
        import time  # local import: only needed by the polling loop

        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)
        # Files already present at startup are not reprocessed.
        already_seen = set(os.listdir(self.input_directory))
        while True:
            time.sleep(1)  # Check every second
            current_files = set(os.listdir(self.input_directory))
            new_files = current_files - already_seen
            for new_file in new_files:
                file_path = os.path.join(self.input_directory, new_file)
                logger.info(f"New file found: {new_file}")
                self.process_new_file(file_path)
            already_seen = current_files

View File

@ -1,15 +1,91 @@
import requests
import logging
from typing import Dict, Any
logger = logging.getLogger(__name__)
class OllamaClient:
    """Thin HTTP client for a local Ollama server.

    BUG FIXES applied to this block:
    - Removed a stale duplicate `__init__(self, model_name)` and the dead
      mock methods (`process_document` placeholder / `_mock_api_call`)
      whose lines were interleaved inside `generate`'s docstring.
    - Docstring documented nonexistent `host`/`port` parameters; it now
      matches the actual `base_url` parameter.
    - Collapsed the redundantly nested strip-think branching into one
      testable helper with identical behavior.
    """

    def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
        """Initialize Ollama client.

        Args:
            model_name: Name of the Ollama model to use.
            base_url: Base URL of the Ollama server.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json"}

    @staticmethod
    def _extract_answer(text: str) -> str:
        """Strip a leading ``<think>...</think>`` section from a model reply.

        Replies may arrive as ``<think>reasoning</think>answer``; return the
        answer part only. If there is no (closed) think section, return the
        whole text. Result is whitespace-stripped either way.
        """
        if "<think>" in text:
            parts = text.split("</think>")
            if len(parts) > 1:
                # Take everything after the first closing tag.
                return parts[1].strip()
        # No think tag, or tag never closed: fall back to the full reply.
        return text.strip()

    def generate(self, prompt: str, strip_think: bool = True) -> str:
        """Send a prompt to the Ollama generate endpoint.

        Args:
            prompt: The prompt text to send.
            strip_think: If True, remove a leading ``<think>...</think>``
                section from the response.

        Returns:
            str: The model's response text.

        Raises:
            requests.exceptions.RequestException: If the API call fails.
        """
        try:
            url = f"{self.base_url}/api/generate"
            # stream=False: request the whole reply in a single JSON body.
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False
            }
            logger.debug(f"Sending request to Ollama API: {url}")
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            result = response.json()
            logger.debug(f"Received response from Ollama API: {result}")
            answer = result.get("response", "")
            return self._extract_answer(answer) if strip_think else answer
        except requests.exceptions.RequestException as e:
            logger.error(f"Error calling Ollama API: {str(e)}")
            raise

    def process_document(self, document_text):
        """Backward-compatible alias for :meth:`generate`.

        The old mock implementation is gone; callers that still use the
        previous method name now hit the real API.
        """
        return self.generate(document_text)

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the current model.

        Returns:
            Dict[str, Any]: Model information.

        Raises:
            requests.exceptions.RequestException: If the API call fails.
        """
        try:
            url = f"{self.base_url}/api/show"
            payload = {"name": self.model_name}
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error getting model info: {str(e)}")
            raise

1
tests/test.txt Normal file
View File

@ -0,0 +1 @@
关于张三天和北京易见天树有限公司的劳动纠纷