diff --git a/.gitignore b/.gitignore
index 3775a45..20214e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,4 +62,9 @@ temp/
 .env.local
 .env.development.local
 .env.test.local
-.env.production.local
\ No newline at end of file
+.env.production.local
+
+src_folder
+target_folder
+app.log
+__pycache__
\ No newline at end of file
diff --git a/app.log b/app.log
deleted file mode 100644
index c40399a..0000000
--- a/app.log
+++ /dev/null
@@ -1 +0,0 @@
-2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md
diff --git a/src/config/__pycache__/logging_config.cpython-311.pyc b/src/config/__pycache__/logging_config.cpython-311.pyc
deleted file mode 100644
index 06116ba..0000000
Binary files a/src/config/__pycache__/logging_config.cpython-311.pyc and /dev/null differ
diff --git a/src/config/__pycache__/logging_config.cpython-312.pyc b/src/config/__pycache__/logging_config.cpython-312.pyc
deleted file mode 100644
index e6f5b95..0000000
Binary files a/src/config/__pycache__/logging_config.cpython-312.pyc and /dev/null differ
diff --git a/src/config/__pycache__/settings.cpython-311.pyc b/src/config/__pycache__/settings.cpython-311.pyc
deleted file mode 100644
index addb530..0000000
Binary files a/src/config/__pycache__/settings.cpython-311.pyc and /dev/null differ
diff --git a/src/config/__pycache__/settings.cpython-312.pyc b/src/config/__pycache__/settings.cpython-312.pyc
deleted file mode 100644
index c36ad4f..0000000
Binary files a/src/config/__pycache__/settings.cpython-312.pyc and /dev/null differ
diff --git a/src/models/processors/txt_processor.py b/src/models/processors/txt_processor.py
index 61e920e..413b0fc 100644
--- a/src/models/processors/txt_processor.py
+++ b/src/models/processors/txt_processor.py
@@ -1,17 +1,50 @@
 from models.document_processor import DocumentProcessor
+from services.ollama_client import OllamaClient
+import textwrap
+import logging
+from config.settings import settings
+logger = logging.getLogger(__name__)
 
 class TxtDocumentProcessor(DocumentProcessor):
     def __init__(self, input_path: str, output_path: str):
         self.input_path = input_path
         self.output_path = output_path
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
 
     def read_content(self) -> str:
         with open(self.input_path, 'r', encoding='utf-8') as file:
             return file.read()
 
     def process_content(self, content: str) -> str:
+        # Prompt (in Chinese), roughly: "You are a professional legal-document
+        # anonymization expert. Mask personal names (two-character names become
+        # surname + 某, three-character names become surname + 某某) and company
+        # names (keep the location and company type, replace the core name with
+        # 某); leave everything else unchanged and output only the masked text."
-        # Implementation for processing text content
-        return content
+        prompt = textwrap.dedent("""
+            您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理:
+
+            规则:
+            1. 人名:
+               - 两字名改为"姓+某"(如:张三 → 张某)
+               - 三字名改为"姓+某某"(如:张三丰 → 张某某)
+            2. 公司名:
+               - 保留地理位置信息(如:北京、上海等)
+               - 保留公司类型(如:有限公司、股份公司等)
+               - 用"某"替换核心名称
+            3. 保持原文其他部分不变
+            4. 确保脱敏后的文本保持原有的语言流畅性和可读性
+
+            输入文本:
+            {text}
+
+            请直接输出脱敏后的文本,无需解释或其他备注。
+            """)
+
+        formatted_prompt = prompt.format(text=content)
+        response = self.ollama_client.generate(formatted_prompt)
+        logger.debug(f"Processed content: {response}")
+        return response
 
     def save_content(self, content: str) -> None:
         with open(self.output_path, 'w', encoding='utf-8') as file:
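Note: `process_content` is now the single place where the masking prompt is built and handed to `OllamaClient.generate`. For reference, a minimal sketch of driving the processor directly, assuming a running Ollama server and configured `settings.OLLAMA_MODEL` / `settings.OLLAMA_API_URL`; the paths below are hypothetical:

```python
# Hypothetical driver for TxtDocumentProcessor; the paths and settings
# values are assumptions, not part of this change.
from models.processors.txt_processor import TxtDocumentProcessor

processor = TxtDocumentProcessor(
    input_path="tests/test.txt",
    output_path="target_folder/test.txt",
)
content = processor.read_content()           # raw document text
masked = processor.process_content(content)  # LLM-based anonymization
processor.save_content(masked)               # write the masked text
```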
diff --git a/src/services/__pycache__/file_monitor.cpython-312.pyc b/src/services/__pycache__/file_monitor.cpython-312.pyc
deleted file mode 100644
index cb48db2..0000000
Binary files a/src/services/__pycache__/file_monitor.cpython-312.pyc and /dev/null differ
diff --git a/src/services/document_service.py b/src/services/document_service.py
index 6af2377..6a42e62 100644
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@@ -19,10 +19,10 @@ class DocumentService:
             content = processor.read_content()
 
             # Process with Ollama
-            processed_content = self.ollama_client.process_document(content)
+            masked_content = processor.process_content(content)
 
             # Save processed content
-            processor.save_content(processed_content)
+            processor.save_content(masked_content)
 
             return True
         except Exception as e:
diff --git a/src/services/file_monitor.py b/src/services/file_monitor.py
index d6b8f8d..c44040d 100644
--- a/src/services/file_monitor.py
+++ b/src/services/file_monitor.py
@@ -1,24 +1,54 @@
 import logging
+import os
+from services.document_service import DocumentService
+from services.ollama_client import OllamaClient
+from config.settings import settings
 
 logger = logging.getLogger(__name__)
 
 class FileMonitor:
-    def __init__(self, directory, callback):
-        self.directory = directory
-        self.callback = callback
+    def __init__(self, input_directory: str, output_directory: str):
+        self.input_directory = input_directory
+        self.output_directory = output_directory
+
+        # Create OllamaClient instance using settings
+        ollama_client = OllamaClient(
+            model_name=settings.OLLAMA_MODEL,
+            base_url=settings.OLLAMA_API_URL
+        )
+        # Inject OllamaClient into DocumentService
+        self.document_service = DocumentService(ollama_client=ollama_client)
+
+    def process_new_file(self, file_path: str) -> None:
+        try:
+            # Get the filename without directory path
+            filename = os.path.basename(file_path)
+            # Create output path
+            output_path = os.path.join(self.output_directory, filename)
+
+            logger.info(f"Processing file: {filename}")
+            # Process the document using document service
+            self.document_service.process_document(file_path, output_path)
+            logger.info(f"File processed successfully: {filename}")
+
+        except Exception as e:
+            logger.error(f"Error processing file {file_path}: {str(e)}")
 
     def start_monitoring(self):
         import time
-        import os
-
-        already_seen = set(os.listdir(self.directory))
+
+        # Ensure output directory exists
+        os.makedirs(self.output_directory, exist_ok=True)
+
+        already_seen = set(os.listdir(self.input_directory))
 
         while True:
             time.sleep(1)  # Check every second
-            current_files = set(os.listdir(self.directory))
+            current_files = set(os.listdir(self.input_directory))
 
             new_files = current_files - already_seen
             for new_file in new_files:
-                logger.info(f"monitor: new file found: {new_file}")
-                self.callback(os.path.join(self.directory, new_file))
+                file_path = os.path.join(self.input_directory, new_file)
+                logger.info(f"New file found: {new_file}")
+                self.process_new_file(file_path)
             already_seen = current_files
\ No newline at end of file
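Note: `FileMonitor` no longer takes a callback; it now owns a `DocumentService` and routes every new file through `process_new_file`. A minimal sketch of starting the monitor, assuming the `src_folder` / `target_folder` directory names hinted at by the new `.gitignore` entries (the polling loop blocks forever, so run it in a dedicated process):

```python
# Hypothetical entry point; the directory names are assumptions inferred
# from the new .gitignore entries.
import logging
from services.file_monitor import FileMonitor

logging.basicConfig(level=logging.INFO)

monitor = FileMonitor(
    input_directory="src_folder",
    output_directory="target_folder",
)
monitor.start_monitoring()  # polls input_directory once per second
```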
diff --git a/src/services/ollama_client.py b/src/services/ollama_client.py
index e8c64ad..b1dfa96 100644
--- a/src/services/ollama_client.py
+++ b/src/services/ollama_client.py
@@ -1,15 +1,91 @@
+import requests
+import logging
+from typing import Dict, Any
+
+logger = logging.getLogger(__name__)
+
 class OllamaClient:
-    def __init__(self, model_name):
+    def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
+        """Initialize Ollama client.
+
+        Args:
+            model_name (str): Name of the Ollama model to use
+            base_url (str): Base URL of the Ollama server
+        """
         self.model_name = model_name
+        self.base_url = base_url
+        self.headers = {"Content-Type": "application/json"}
 
-    def process_document(self, document_text):
-        # Here you would implement the logic to interact with the Ollama API
-        # and process the document text using the specified model.
-        # This is a placeholder for the actual API call.
-        processed_text = self._mock_api_call(document_text)
-        return processed_text
+    def generate(self, prompt: str, strip_think: bool = True) -> str:
+        """Generate a response from the Ollama API.
+
+        Args:
+            prompt (str): The prompt to send to the model
+            strip_think (bool): Whether to strip a leading <think>...</think> block
+
+        Returns:
+            str: Response text from the model
+
+        Raises:
+            RequestException: If the API call fails
+        """
+        try:
+            url = f"{self.base_url}/api/generate"
+            payload = {
+                "model": self.model_name,
+                "prompt": prompt,
+                "stream": False
+            }
+
+            logger.debug(f"Sending request to Ollama API: {url}")
+            response = requests.post(url, json=payload, headers=self.headers)
+            response.raise_for_status()
+
+            result = response.json()
+            logger.debug(f"Received response from Ollama API: {result}")
+            if strip_think:
+                # Remove the "thinking" part from the response;
+                # the response is expected to be <think>...</think>response_text.
+                # Check if the response contains a </think> tag
+                if "</think>" in result.get("response", ""):
+                    # Split the response and take the part after </think>
+                    response_parts = result["response"].split("</think>")
+                    if len(response_parts) > 1:
+                        # Return the part after </think>
+                        return response_parts[1].strip()
+                    else:
+                        # If no closing tag, return the full response
+                        return result.get("response", "").strip()
+                else:
+                    # If no </think> tag, return the full response
+                    return result.get("response", "").strip()
+            else:
+                # If strip_think is False, return the full response
+                return result.get("response", "")
-
+
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error calling Ollama API: {str(e)}")
+            raise
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the current model.
+
+        Returns:
+            Dict[str, Any]: Model information
+
+        Raises:
+            RequestException: If the API call fails
+        """
+        try:
+            url = f"{self.base_url}/api/show"
+            payload = {"name": self.model_name}
+
+            response = requests.post(url, json=payload, headers=self.headers)
+            response.raise_for_status()
+
+            return response.json()
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error getting model info: {str(e)}")
+            raise
\ No newline at end of file
diff --git a/tests/test.txt b/tests/test.txt
new file mode 100644
index 0000000..c67c623
--- /dev/null
+++ b/tests/test.txt
@@ -0,0 +1 @@
+关于张三天和北京易见天树有限公司的劳动纠纷
\ No newline at end of file
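Note: with `strip_think=True` (the default), `generate` drops everything up to the closing `</think>` tag that reasoning models emit, so callers only receive the final answer. A minimal sketch of using the client on its own; the model name below is an assumption, not something this change pins down:

```python
# Hypothetical standalone usage of OllamaClient; "qwen3" is an assumed
# model name, substitute whatever settings.OLLAMA_MODEL actually points at.
from services.ollama_client import OllamaClient

client = OllamaClient(model_name="qwen3", base_url="http://localhost:11434")

# Reasoning models reply as "<think>...</think>final answer"; the default
# strip_think=True returns only the text after the closing </think> tag.
answer = client.generate("Reply with OK if you can read this.")
print(answer)

# Raw response, including any <think> block the model produced.
raw = client.generate("Reply with OK if you can read this.", strip_think=False)

# get_model_info() wraps the Ollama /api/show endpoint and returns the
# server's metadata for the configured model.
info = client.get_model_info()
```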