Enhance document processing with Ollama integration and update .gitignore
- Added OllamaClient for document processing in TxtDocumentProcessor.
- Updated process_content method to use the Ollama API for content masking.
- Refactored FileMonitor to use DocumentService with OllamaClient.
- Removed unnecessary log files and Python cache files.
- Added test file for document processing validation.
parent fc68c243bb
commit 592fb66f40
.gitignore:

@@ -63,3 +63,8 @@ temp/
 .env.development.local
 .env.test.local
 .env.production.local
+
+src_folder
+target_folder
+app.log
+__pycache__

app.log (deleted):

@@ -1 +0,0 @@
-2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md
Binary files not shown (4 files).
TxtDocumentProcessor:

@@ -1,17 +1,45 @@
 from models.document_processor import DocumentProcessor
+from services.ollama_client import OllamaClient
+import textwrap
+import logging
+from config.settings import settings
+
+logger = logging.getLogger(__name__)
 
 class TxtDocumentProcessor(DocumentProcessor):
     def __init__(self, input_path: str, output_path: str):
         self.input_path = input_path
         self.output_path = output_path
+        self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
 
     def read_content(self) -> str:
         with open(self.input_path, 'r', encoding='utf-8') as file:
             return file.read()
 
     def process_content(self, content: str) -> str:
-        # Implementation for processing text content
-        return content
+        prompt = textwrap.dedent("""
+            You are a professional legal-document anonymization expert. Anonymize the text according to the following rules:
+
+            Rules:
+            1. Person names:
+               - Two-character names become "surname + 某" (e.g. 张三 → 张某)
+               - Three-character names become "surname + 某某" (e.g. 张三丰 → 张某某)
+            2. Company names:
+               - Keep geographic information (e.g. 北京, 上海)
+               - Keep the company type (e.g. 有限公司, 股份公司)
+               - Replace the core name with "某"
+            3. Leave all other parts of the text unchanged
+            4. Make sure the anonymized text keeps the original fluency and readability
+
+            Input text:
+            {text}
+
+            Output the anonymized text directly, with no explanation or other remarks.
+            """)
+
+        formatted_prompt = prompt.format(text=content)
+        response = self.ollama_client.generate(formatted_prompt)
+        logger.debug(f"Processed content: {response}")
+        return response
 
     def save_content(self, content: str) -> None:
         with open(self.output_path, 'w', encoding='utf-8') as file:
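With this change, process_content performs one Ollama round-trip per document: it builds the masking prompt, calls OllamaClient.generate, and returns the model's output. A minimal usage sketch, assuming settings provides OLLAMA_MODEL and OLLAMA_API_URL and an Ollama server is reachable (file paths are illustrative):

    processor = TxtDocumentProcessor("src_folder/case.txt", "target_folder/case.txt")
    content = processor.read_content()
    masked = processor.process_content(content)  # prompt formatting + OllamaClient.generate
    processor.save_content(masked)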
Binary file not shown.
DocumentService:

@@ -19,10 +19,10 @@ class DocumentService:
             content = processor.read_content()
 
             # Process with Ollama
-            processed_content = self.ollama_client.process_document(content)
+            masked_content = processor.process_content(content)
 
             # Save processed content
-            processor.save_content(processed_content)
+            processor.save_content(masked_content)
             return True
 
         except Exception as e:
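The rename from processed_content to masked_content tracks the substantive change: DocumentService now delegates masking to the processor's own process_content instead of calling the injected ollama_client directly, so each processor type can own its masking strategy. A sketch of the resulting shape of process_document; everything outside this hunk (processor selection, the error handler) is assumed rather than shown:

    class DocumentService:
        def __init__(self, ollama_client):
            self.ollama_client = ollama_client  # still injected; TxtDocumentProcessor now builds its own

        def process_document(self, input_path: str, output_path: str) -> bool:
            try:
                processor = TxtDocumentProcessor(input_path, output_path)  # assumed selection logic
                content = processor.read_content()
                masked_content = processor.process_content(content)
                processor.save_content(masked_content)
                return True
            except Exception as e:
                logger.error(f"Error processing document {input_path}: {e}")  # assumed handler
                return False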
FileMonitor:

@@ -1,24 +1,54 @@
 import logging
+import os
 from services.document_service import DocumentService
+from services.ollama_client import OllamaClient
+from config.settings import settings
 
 logger = logging.getLogger(__name__)
 
 class FileMonitor:
-    def __init__(self, directory, callback):
-        self.directory = directory
-        self.callback = callback
+    def __init__(self, input_directory: str, output_directory: str):
+        self.input_directory = input_directory
+        self.output_directory = output_directory
+
+        # Create OllamaClient instance using settings
+        ollama_client = OllamaClient(
+            model_name=settings.OLLAMA_MODEL,
+            base_url=settings.OLLAMA_API_URL
+        )
+        # Inject OllamaClient into DocumentService
+        self.document_service = DocumentService(ollama_client=ollama_client)
+
+    def process_new_file(self, file_path: str) -> None:
+        try:
+            # Get the filename without the directory path
+            filename = os.path.basename(file_path)
+            # Create the output path
+            output_path = os.path.join(self.output_directory, filename)
+
+            logger.info(f"Processing file: {filename}")
+            # Process the document using the document service
+            self.document_service.process_document(file_path, output_path)
+            logger.info(f"File processed successfully: {filename}")
+
+        except Exception as e:
+            logger.error(f"Error processing file {file_path}: {str(e)}")
 
     def start_monitoring(self):
         import time
-        import os
 
-        already_seen = set(os.listdir(self.directory))
+        # Ensure the output directory exists
+        os.makedirs(self.output_directory, exist_ok=True)
+
+        already_seen = set(os.listdir(self.input_directory))
         while True:
             time.sleep(1)  # Check every second
-            current_files = set(os.listdir(self.directory))
+            current_files = set(os.listdir(self.input_directory))
             new_files = current_files - already_seen
 
             for new_file in new_files:
-                logger.info(f"monitor: new file found: {new_file}")
-                self.callback(os.path.join(self.directory, new_file))
+                file_path = os.path.join(self.input_directory, new_file)
+                logger.info(f"New file found: {new_file}")
+                self.process_new_file(file_path)
+
             already_seen = current_files
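A minimal entry point for the refactored monitor, assuming the src_folder and target_folder directory names that this commit adds to .gitignore:

    import logging
    from services.file_monitor import FileMonitor

    logging.basicConfig(level=logging.INFO)

    monitor = FileMonitor("src_folder", "target_folder")
    monitor.start_monitoring()  # blocks, polling input_directory once per second

Note that because the monitor polls os.listdir, a file that is still being written when it first appears may be read before it is complete.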
OllamaClient:

@@ -1,15 +1,91 @@
+import requests
+import logging
+from typing import Dict, Any
+
+logger = logging.getLogger(__name__)
+
 class OllamaClient:
-    def __init__(self, model_name):
+    def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
+        """Initialize Ollama client.
+
+        Args:
+            model_name (str): Name of the Ollama model to use
+            base_url (str): Base URL of the Ollama server
+        """
         self.model_name = model_name
+        self.base_url = base_url
+        self.headers = {"Content-Type": "application/json"}
 
-    def process_document(self, document_text):
-        # Here you would implement the logic to interact with the Ollama API
-        # and process the document text using the specified model.
-        # This is a placeholder for the actual API call.
-        processed_text = self._mock_api_call(document_text)
-        return processed_text
-
-    def _mock_api_call(self, document_text):
-        # Mock processing: In a real implementation, this would call the Ollama API.
-        # For now, it just returns the input text with a note indicating it was processed.
-        return f"Processed with {self.model_name}: {document_text}"
+    def generate(self, prompt: str, strip_think: bool = True) -> str:
+        """Generate a response for a prompt using the Ollama API.
+
+        Args:
+            prompt (str): The prompt to send to the model
+            strip_think (bool): Whether to strip a leading <think>...</think> block
+
+        Returns:
+            str: Processed text response from the model
+
+        Raises:
+            RequestException: If the API call fails
+        """
+        try:
+            url = f"{self.base_url}/api/generate"
+            payload = {
+                "model": self.model_name,
+                "prompt": prompt,
+                "stream": False
+            }
+
+            logger.debug(f"Sending request to Ollama API: {url}")
+            response = requests.post(url, json=payload, headers=self.headers)
+            response.raise_for_status()
+
+            result = response.json()
+            logger.debug(f"Received response from Ollama API: {result}")
+            if strip_think:
+                # Reasoning models may prefix the answer with <think>...</think>;
+                # keep only the text after the closing tag.
+                if "<think>" in result.get("response", ""):
+                    response_parts = result["response"].split("</think>")
+                    if len(response_parts) > 1:
+                        # Return the part after </think>
+                        return response_parts[1].strip()
+                    else:
+                        # No closing tag: return the full response
+                        return result.get("response", "").strip()
+                else:
+                    # No <think> tag: return the full response
+                    return result.get("response", "").strip()
+            else:
+                # strip_think is False: return the full response
+                return result.get("response", "")
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error calling Ollama API: {str(e)}")
+            raise
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the current model.
+
+        Returns:
+            Dict[str, Any]: Model information
+
+        Raises:
+            RequestException: If the API call fails
+        """
+        try:
+            url = f"{self.base_url}/api/show"
+            payload = {"name": self.model_name}
+
+            response = requests.post(url, json=payload, headers=self.headers)
+            response.raise_for_status()
+
+            return response.json()
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error getting model info: {str(e)}")
+            raise
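A quick usage sketch for the new client. The model name is illustrative; strip_think only matters for reasoning models that prefix their answer with a <think>...</think> block:

    client = OllamaClient(model_name="qwen2.5:7b", base_url="http://localhost:11434")
    info = client.get_model_info()  # POST {base_url}/api/show
    print(info.get("details", {}))
    answer = client.generate("Reply with the single word: pong")  # POST {base_url}/api/generate
    print(answer)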
Test file for document processing validation (new, 1 line; the text reads "a labor dispute between 张三天 and 北京易见天树有限公司"):

@@ -0,0 +1 @@
+关于张三天和北京易见天树有限公司的劳动纠纷
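Applying the prompt's masking rules to this input by hand gives the following expectation; these values are derived from the rules, not from a recorded model run:

    # Expected masking per the prompt rules (not an actual transcript):
    #   张三天 (three-character name)  -> 张某某
    #   北京易见天树有限公司 (company)  -> 北京某有限公司
    text_in  = "关于张三天和北京易见天树有限公司的劳动纠纷"
    expected = "关于张某某和北京某有限公司的劳动纠纷"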