Enhance document processing with Ollama integration and update .gitignore

- Added OllamaClient for document processing in TxtDocumentProcessor.
- Updated process_content method to use Ollama API for content masking.
- Refactored FileMonitor to utilize DocumentService with OllamaClient.
- Removed unnecessary log files and Python cache files.
- Added test file for document processing validation.
This commit is contained in:
tigermren 2025-04-23 01:09:33 +08:00
parent fc68c243bb
commit 592fb66f40
12 changed files with 165 additions and 26 deletions

5
.gitignore vendored
View File

@ -63,3 +63,8 @@ temp/
.env.development.local
.env.test.local
.env.production.local
src_folder
target_folder
app.log
__pycache__

View File

@ -1 +0,0 @@
2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md

View File

@ -1,17 +1,45 @@
from models.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient
import textwrap
import logging
from config.settings import settings
logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):
    """Store the I/O paths and build an Ollama client from global settings."""
    self.input_path = input_path
    self.output_path = output_path
    # Model name and API endpoint both come from the shared settings object.
    self.ollama_client = OllamaClient(
        base_url=settings.OLLAMA_API_URL,
        model_name=settings.OLLAMA_MODEL,
    )
def read_content(self) -> str:
    """Return the full text of the input file, decoded as UTF-8."""
    with open(self.input_path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text
def process_content(self, content: str) -> str:
    """Mask sensitive information in `content` via the Ollama model.

    Builds a Chinese-language anonymization prompt (names and company
    names are replaced per the rules embedded in the prompt), sends it to
    the configured Ollama client, and returns the model's response.

    Args:
        content: Raw document text to anonymize.

    Returns:
        The masked text produced by the model.
    """
    # BUG FIX: a stale placeholder (`return content`) preceded this code
    # and made the entire masking pipeline unreachable, so callers got
    # the input back unmodified. The dead early return is removed.
    #
    # NOTE(review): punctuation inside the prompt looks stripped in places
    # (e.g. the quotes around the masking character on the 公司名 rule) —
    # verify against the original prompt source.
    prompt = textwrap.dedent("""
        您是一位专业的法律文档脱敏专家请按照以下规则对文本进行脱敏处理
        规则
        1. 人名
        - 两字名改为"姓+某"张三 张某
        - 三字名改为"姓+某某"张三丰 张某某
        2. 公司名
        - 保留地理位置信息北京上海等
        - 保留公司类型有限公司股份公司等
        - ""替换核心名称
        3. 保持原文其他部分不变
        4. 确保脱敏后的文本保持原有的语言流畅性和可读性
        输入文本
        {text}
        请直接输出脱敏后的文本无需解释或其他备注
    """)
    formatted_prompt = prompt.format(text=content)
    response = self.ollama_client.generate(formatted_prompt)
    logger.debug(f"Processed content: {response}")
    return response
def save_content(self, content: str) -> None:
with open(self.output_path, 'w', encoding='utf-8') as file:

View File

@ -19,10 +19,10 @@ class DocumentService:
content = processor.read_content()
# Process with Ollama
processed_content = self.ollama_client.process_document(content)
masked_content = processor.process_content(content)
# Save processed content
processor.save_content(processed_content)
processor.save_content(masked_content)
return True
except Exception as e:

View File

@ -1,24 +1,54 @@
import logging
import os
from services.document_service import DocumentService
from services.ollama_client import OllamaClient
from config.settings import settings
logger = logging.getLogger(__name__)
class FileMonitor:
    """Polls an input directory and pushes each newly appearing file
    through the document service, writing results to an output directory.

    BUG FIXES applied to this block:
    - Removed a stale duplicate `__init__(self, directory, callback)` left
      over from an earlier revision; the surviving constructor never set
      `self.directory`/`self.callback`, so the stale statements that still
      referenced them in `start_monitoring` would raise AttributeError.
    - Removed duplicated pre-refactor statements inside `start_monitoring`.
    - The log messages read `"(unknown)"` where a variable clearly belonged
      (extraction damage) — restored the obvious interpolations.
    """

    def __init__(self, input_directory: str, output_directory: str):
        """Wire up a DocumentService backed by an OllamaClient from settings.

        Args:
            input_directory: Directory to watch for new files.
            output_directory: Directory where processed files are written.
        """
        self.input_directory = input_directory
        self.output_directory = output_directory
        # Create OllamaClient instance using settings
        ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL
        )
        # Inject OllamaClient into DocumentService
        self.document_service = DocumentService(ollama_client=ollama_client)

    def process_new_file(self, file_path: str) -> None:
        """Process one file and write the result under the same name
        in the output directory. Errors are logged, not raised, so the
        monitoring loop keeps running.
        """
        try:
            # Mirror the input filename into the output directory.
            filename = os.path.basename(file_path)
            output_path = os.path.join(self.output_directory, filename)
            logger.info(f"Processing file: {file_path}")
            self.document_service.process_document(file_path, output_path)
            logger.info(f"File processed successfully: {output_path}")
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")

    def start_monitoring(self):
        """Poll the input directory forever (1s interval) and process
        every file not seen before. Blocks the calling thread.
        """
        import time  # local import: only needed by the polling loop

        # Ensure output directory exists
        os.makedirs(self.output_directory, exist_ok=True)
        # Files already present at startup are not reprocessed.
        already_seen = set(os.listdir(self.input_directory))
        while True:
            time.sleep(1)  # Check every second
            current_files = set(os.listdir(self.input_directory))
            new_files = current_files - already_seen
            for new_file in new_files:
                file_path = os.path.join(self.input_directory, new_file)
                logger.info(f"New file found: {new_file}")
                self.process_new_file(file_path)
            already_seen = current_files

View File

@ -1,15 +1,91 @@
import requests
import logging
from typing import Dict, Any
logger = logging.getLogger(__name__)
class OllamaClient:
    """Thin HTTP client for a local Ollama server.

    BUG FIXES applied to this block:
    - Removed a stale duplicate `__init__(self, model_name)` and the dead
      mock methods (`process_document` placeholder / `_mock_api_call`)
      whose lines were interleaved inside `generate`'s docstring.
    - Docstring documented nonexistent `host`/`port` parameters; it now
      matches the actual `base_url` parameter.
    - Collapsed the redundantly nested strip-think branching into one
      testable helper with identical behavior.
    """

    def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
        """Initialize Ollama client.

        Args:
            model_name: Name of the Ollama model to use.
            base_url: Base URL of the Ollama server.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json"}

    @staticmethod
    def _extract_answer(text: str) -> str:
        """Strip a leading ``<think>...</think>`` section from a model reply.

        Replies may arrive as ``<think>reasoning</think>answer``; return the
        answer part only. If there is no (closed) think section, return the
        whole text. Result is whitespace-stripped either way.
        """
        if "<think>" in text:
            parts = text.split("</think>")
            if len(parts) > 1:
                # Take everything after the first closing tag.
                return parts[1].strip()
        # No think tag, or tag never closed: fall back to the full reply.
        return text.strip()

    def generate(self, prompt: str, strip_think: bool = True) -> str:
        """Send a prompt to the Ollama generate endpoint.

        Args:
            prompt: The prompt text to send.
            strip_think: If True, remove a leading ``<think>...</think>``
                section from the response.

        Returns:
            str: The model's response text.

        Raises:
            requests.exceptions.RequestException: If the API call fails.
        """
        try:
            url = f"{self.base_url}/api/generate"
            # stream=False: request the whole reply in a single JSON body.
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False
            }
            logger.debug(f"Sending request to Ollama API: {url}")
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            result = response.json()
            logger.debug(f"Received response from Ollama API: {result}")
            answer = result.get("response", "")
            return self._extract_answer(answer) if strip_think else answer
        except requests.exceptions.RequestException as e:
            logger.error(f"Error calling Ollama API: {str(e)}")
            raise

    def process_document(self, document_text):
        """Backward-compatible alias for :meth:`generate`.

        The old mock implementation is gone; callers that still use the
        previous method name now hit the real API.
        """
        return self.generate(document_text)

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the current model.

        Returns:
            Dict[str, Any]: Model information.

        Raises:
            requests.exceptions.RequestException: If the API call fails.
        """
        try:
            url = f"{self.base_url}/api/show"
            payload = {"name": self.model_name}
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error getting model info: {str(e)}")
            raise

1
tests/test.txt Normal file
View File

@ -0,0 +1 @@
关于张三天和北京易见天树有限公司的劳动纠纷