diff --git a/.gitignore b/.gitignore
index 3775a45..20214e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,4 +62,9 @@ temp/
.env.local
.env.development.local
.env.test.local
-.env.production.local
\ No newline at end of file
+.env.production.local
+
+src_folder
+target_folder
+app.log
+__pycache__
\ No newline at end of file
diff --git a/app.log b/app.log
deleted file mode 100644
index c40399a..0000000
--- a/app.log
+++ /dev/null
@@ -1 +0,0 @@
-2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md
diff --git a/src/config/__pycache__/logging_config.cpython-311.pyc b/src/config/__pycache__/logging_config.cpython-311.pyc
deleted file mode 100644
index 06116ba..0000000
Binary files a/src/config/__pycache__/logging_config.cpython-311.pyc and /dev/null differ
diff --git a/src/config/__pycache__/logging_config.cpython-312.pyc b/src/config/__pycache__/logging_config.cpython-312.pyc
deleted file mode 100644
index e6f5b95..0000000
Binary files a/src/config/__pycache__/logging_config.cpython-312.pyc and /dev/null differ
diff --git a/src/config/__pycache__/settings.cpython-311.pyc b/src/config/__pycache__/settings.cpython-311.pyc
deleted file mode 100644
index addb530..0000000
Binary files a/src/config/__pycache__/settings.cpython-311.pyc and /dev/null differ
diff --git a/src/config/__pycache__/settings.cpython-312.pyc b/src/config/__pycache__/settings.cpython-312.pyc
deleted file mode 100644
index c36ad4f..0000000
Binary files a/src/config/__pycache__/settings.cpython-312.pyc and /dev/null differ
diff --git a/src/models/processors/txt_processor.py b/src/models/processors/txt_processor.py
index 61e920e..413b0fc 100644
--- a/src/models/processors/txt_processor.py
+++ b/src/models/processors/txt_processor.py
@@ -1,17 +1,45 @@
from models.document_processor import DocumentProcessor
+from services.ollama_client import OllamaClient
+import textwrap
+import logging
+from config.settings import settings
+logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):
self.input_path = input_path
self.output_path = output_path
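+        # Create a per-processor Ollama client from application settings (model name and API URL)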
+ self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
def read_content(self) -> str:
with open(self.input_path, 'r', encoding='utf-8') as file:
return file.read()
def process_content(self, content: str) -> str:
- # Implementation for processing text content
- return content
+        prompt = textwrap.dedent("""
+            You are a professional legal-document anonymization expert. De-identify the text according to the following rules:
+
+            Rules:
+            1. Person names:
+               - Replace two-character names with the surname plus 某 (e.g. 张三 → 张某)
+               - Replace three-character names with the surname plus 某某 (e.g. 张三丰 → 张某某)
+            2. Company names:
+               - Keep geographic information (e.g. 北京, 上海)
+               - Keep the company type (e.g. 有限公司, 股份公司)
+               - Replace the core name with 某
+            3. Leave all other parts of the text unchanged
+            4. Make sure the de-identified text stays fluent and readable
+
+            Input text:
+            {text}
+
+            Output the de-identified text directly, with no explanation or other remarks.
+            """)
+
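+        # Fill the {text} placeholder, then ask the local model for the de-identified text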
+ formatted_prompt = prompt.format(text=content)
+ response = self.ollama_client.generate(formatted_prompt)
+ logger.debug(f"Processed content: {response}")
+ return response
def save_content(self, content: str) -> None:
with open(self.output_path, 'w', encoding='utf-8') as file:
diff --git a/src/services/__pycache__/file_monitor.cpython-312.pyc b/src/services/__pycache__/file_monitor.cpython-312.pyc
deleted file mode 100644
index cb48db2..0000000
Binary files a/src/services/__pycache__/file_monitor.cpython-312.pyc and /dev/null differ
diff --git a/src/services/document_service.py b/src/services/document_service.py
index 6af2377..6a42e62 100644
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@@ -19,10 +19,10 @@ class DocumentService:
content = processor.read_content()
# Process with Ollama
- processed_content = self.ollama_client.process_document(content)
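+            # Delegate to the processor, which builds its own masking prompt and calls Ollama itself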
+ masked_content = processor.process_content(content)
# Save processed content
- processor.save_content(processed_content)
+ processor.save_content(masked_content)
return True
except Exception as e:
diff --git a/src/services/file_monitor.py b/src/services/file_monitor.py
index d6b8f8d..c44040d 100644
--- a/src/services/file_monitor.py
+++ b/src/services/file_monitor.py
@@ -1,24 +1,54 @@
import logging
+import os
+from services.document_service import DocumentService
+from services.ollama_client import OllamaClient
+from config.settings import settings
logger = logging.getLogger(__name__)
class FileMonitor:
- def __init__(self, directory, callback):
- self.directory = directory
- self.callback = callback
+ def __init__(self, input_directory: str, output_directory: str):
+ self.input_directory = input_directory
+ self.output_directory = output_directory
+
+ # Create OllamaClient instance using settings
+ ollama_client = OllamaClient(
+ model_name=settings.OLLAMA_MODEL,
+ base_url=settings.OLLAMA_API_URL
+ )
+ # Inject OllamaClient into DocumentService
+ self.document_service = DocumentService(ollama_client=ollama_client)
+
+ def process_new_file(self, file_path: str) -> None:
+ try:
+ # Get the filename without directory path
+ filename = os.path.basename(file_path)
+ # Create output path
+ output_path = os.path.join(self.output_directory, filename)
+
+ logger.info(f"Processing file: {filename}")
+ # Process the document using document service
+ self.document_service.process_document(file_path, output_path)
+ logger.info(f"File processed successfully: {filename}")
+
+ except Exception as e:
+ logger.error(f"Error processing file {file_path}: {str(e)}")
def start_monitoring(self):
import time
- import os
-
- already_seen = set(os.listdir(self.directory))
+
+ # Ensure output directory exists
+ os.makedirs(self.output_directory, exist_ok=True)
+
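+        # Snapshot the files already present so only files added after startup get processed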
+ already_seen = set(os.listdir(self.input_directory))
while True:
time.sleep(1) # Check every second
- current_files = set(os.listdir(self.directory))
+ current_files = set(os.listdir(self.input_directory))
new_files = current_files - already_seen
for new_file in new_files:
- logger.info(f"monitor: new file found: {new_file}")
- self.callback(os.path.join(self.directory, new_file))
+ file_path = os.path.join(self.input_directory, new_file)
+ logger.info(f"New file found: {new_file}")
+ self.process_new_file(file_path)
already_seen = current_files
\ No newline at end of file
diff --git a/src/services/ollama_client.py b/src/services/ollama_client.py
index e8c64ad..b1dfa96 100644
--- a/src/services/ollama_client.py
+++ b/src/services/ollama_client.py
@@ -1,15 +1,91 @@
+import requests
+import logging
+from typing import Dict, Any
+
+logger = logging.getLogger(__name__)
+
class OllamaClient:
- def __init__(self, model_name):
+ def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
+        """Initialize the Ollama client.
+
+        Args:
+            model_name (str): Name of the Ollama model to use
+            base_url (str): Base URL of the Ollama server
+                (defaults to http://localhost:11434)
+        """
self.model_name = model_name
+ self.base_url = base_url
+ self.headers = {"Content-Type": "application/json"}
- def process_document(self, document_text):
- # Here you would implement the logic to interact with the Ollama API
- # and process the document text using the specified model.
- # This is a placeholder for the actual API call.
- processed_text = self._mock_api_call(document_text)
- return processed_text
+ def generate(self, prompt: str, strip_think: bool = True) -> str:
+        """Generate a completion for a prompt using the Ollama API.
+
+        Args:
+            prompt (str): The prompt to send to the model
+            strip_think (bool): If True, drop the model's <think>...</think>
+                reasoning block from the response
+
+        Returns:
+            str: Text response from the model
+
+        Raises:
+            RequestException: If the API call fails
+        """
+ try:
+ url = f"{self.base_url}/api/generate"
+ payload = {
+ "model": self.model_name,
+ "prompt": prompt,
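+                # stream=False: the API returns a single JSON object with the full completion in its "response" field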
+ "stream": False
+ }
+
+ logger.debug(f"Sending request to Ollama API: {url}")
+ response = requests.post(url, json=payload, headers=self.headers)
+ response.raise_for_status()
+
+ result = response.json()
+ logger.debug(f"Received response from Ollama API: {result}")
+            if strip_think:
+                # Remove the model's "thinking" part from the response;
+                # the response is expected to be <think>...</think>response_text
+                # Check if the response contains a </think> tag
+                if "</think>" in result.get("response", ""):
+                    # Split the response and take the part after </think>
+                    response_parts = result["response"].split("</think>")
+                    if len(response_parts) > 1:
+                        # Return the part after </think>
+                        return response_parts[1].strip()
+                    else:
+                        # If no closing tag, return the full response
+                        return result.get("response", "").strip()
+                else:
+                    # If no <think> tag, return the full response
+                    return result.get("response", "").strip()
+            else:
+                # If strip_think is False, return the full response
+                return result.get("response", "")
- def _mock_api_call(self, document_text):
- # Mock processing: In a real implementation, this would call the Ollama API.
- # For now, it just returns the input text with a note indicating it was processed.
- return f"Processed with {self.model_name}: {document_text}"
\ No newline at end of file
+
+ except requests.exceptions.RequestException as e:
+ logger.error(f"Error calling Ollama API: {str(e)}")
+ raise
+
+ def get_model_info(self) -> Dict[str, Any]:
+ """Get information about the current model.
+
+ Returns:
+ Dict[str, Any]: Model information
+
+ Raises:
+ RequestException: If the API call fails
+ """
+ try:
+ url = f"{self.base_url}/api/show"
+ payload = {"name": self.model_name}
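+            # /api/show returns model metadata such as the modelfile, parameters, and template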
+
+ response = requests.post(url, json=payload, headers=self.headers)
+ response.raise_for_status()
+
+ return response.json()
+
+ except requests.exceptions.RequestException as e:
+ logger.error(f"Error getting model info: {str(e)}")
+ raise
\ No newline at end of file
diff --git a/tests/test.txt b/tests/test.txt
new file mode 100644
index 0000000..c67c623
--- /dev/null
+++ b/tests/test.txt
@@ -0,0 +1 @@
+关于张三天和北京易见天树有限公司的劳动纠纷
\ No newline at end of file