Enhance document processing with Ollama integration and update .gitignore
- Added OllamaClient for document processing in TxtDocumentProcessor. - Updated process_content method to use Ollama API for content masking. - Refactored FileMonitor to utilize DocumentService with OllamaClient. - Removed unnecessary log files and Python cache files. - Added test file for document processing validation.
This commit is contained in:
parent
fc68c243bb
commit
592fb66f40
|
|
@ -62,4 +62,9 @@ temp/
|
||||||
.env.local
|
.env.local
|
||||||
.env.development.local
|
.env.development.local
|
||||||
.env.test.local
|
.env.test.local
|
||||||
.env.production.local
|
.env.production.local
|
||||||
|
|
||||||
|
src_folder
|
||||||
|
target_folder
|
||||||
|
app.log
|
||||||
|
__pycache__
|
||||||
1
app.log
1
app.log
|
|
@ -1 +0,0 @@
|
||||||
2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -1,17 +1,45 @@
|
||||||
from models.document_processor import DocumentProcessor
|
from models.document_processor import DocumentProcessor
|
||||||
|
from services.ollama_client import OllamaClient
|
||||||
|
import textwrap
|
||||||
|
import logging
|
||||||
|
from config.settings import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
class TxtDocumentProcessor(DocumentProcessor):
|
class TxtDocumentProcessor(DocumentProcessor):
|
||||||
def __init__(self, input_path: str, output_path: str):
|
def __init__(self, input_path: str, output_path: str):
|
||||||
self.input_path = input_path
|
self.input_path = input_path
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
|
self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL)
|
||||||
|
|
||||||
def read_content(self) -> str:
|
def read_content(self) -> str:
|
||||||
with open(self.input_path, 'r', encoding='utf-8') as file:
|
with open(self.input_path, 'r', encoding='utf-8') as file:
|
||||||
return file.read()
|
return file.read()
|
||||||
|
|
||||||
def process_content(self, content: str) -> str:
|
def process_content(self, content: str) -> str:
|
||||||
# Implementation for processing text content
|
prompt = textwrap.dedent("""
|
||||||
return content
|
您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理:
|
||||||
|
|
||||||
|
规则:
|
||||||
|
1. 人名:
|
||||||
|
- 两字名改为"姓+某"(如:张三 → 张某)
|
||||||
|
- 三字名改为"姓+某某"(如:张三丰 → 张某某)
|
||||||
|
2. 公司名:
|
||||||
|
- 保留地理位置信息(如:北京、上海等)
|
||||||
|
- 保留公司类型(如:有限公司、股份公司等)
|
||||||
|
- 用"某"替换核心名称
|
||||||
|
3. 保持原文其他部分不变
|
||||||
|
4. 确保脱敏后的文本保持原有的语言流畅性和可读性
|
||||||
|
|
||||||
|
输入文本:
|
||||||
|
{text}
|
||||||
|
|
||||||
|
请直接输出脱敏后的文本,无需解释或其他备注。
|
||||||
|
""")
|
||||||
|
|
||||||
|
formatted_prompt = prompt.format(text=content)
|
||||||
|
response = self.ollama_client.generate(formatted_prompt)
|
||||||
|
logger.debug(f"Processed content: {response}")
|
||||||
|
return response
|
||||||
|
|
||||||
def save_content(self, content: str) -> None:
|
def save_content(self, content: str) -> None:
|
||||||
with open(self.output_path, 'w', encoding='utf-8') as file:
|
with open(self.output_path, 'w', encoding='utf-8') as file:
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -19,10 +19,10 @@ class DocumentService:
|
||||||
content = processor.read_content()
|
content = processor.read_content()
|
||||||
|
|
||||||
# Process with Ollama
|
# Process with Ollama
|
||||||
processed_content = self.ollama_client.process_document(content)
|
masked_content = processor.process_content(content)
|
||||||
|
|
||||||
# Save processed content
|
# Save processed content
|
||||||
processor.save_content(processed_content)
|
processor.save_content(masked_content)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -1,24 +1,54 @@
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
from services.document_service import DocumentService
|
||||||
|
from services.ollama_client import OllamaClient
|
||||||
|
from config.settings import settings
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class FileMonitor:
|
class FileMonitor:
|
||||||
def __init__(self, directory, callback):
|
def __init__(self, input_directory: str, output_directory: str):
|
||||||
self.directory = directory
|
self.input_directory = input_directory
|
||||||
self.callback = callback
|
self.output_directory = output_directory
|
||||||
|
|
||||||
|
# Create OllamaClient instance using settings
|
||||||
|
ollama_client = OllamaClient(
|
||||||
|
model_name=settings.OLLAMA_MODEL,
|
||||||
|
base_url=settings.OLLAMA_API_URL
|
||||||
|
)
|
||||||
|
# Inject OllamaClient into DocumentService
|
||||||
|
self.document_service = DocumentService(ollama_client=ollama_client)
|
||||||
|
|
||||||
|
def process_new_file(self, file_path: str) -> None:
|
||||||
|
try:
|
||||||
|
# Get the filename without directory path
|
||||||
|
filename = os.path.basename(file_path)
|
||||||
|
# Create output path
|
||||||
|
output_path = os.path.join(self.output_directory, filename)
|
||||||
|
|
||||||
|
logger.info(f"Processing file: {filename}")
|
||||||
|
# Process the document using document service
|
||||||
|
self.document_service.process_document(file_path, output_path)
|
||||||
|
logger.info(f"File processed successfully: {filename}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing file {file_path}: {str(e)}")
|
||||||
|
|
||||||
def start_monitoring(self):
|
def start_monitoring(self):
|
||||||
import time
|
import time
|
||||||
import os
|
|
||||||
|
# Ensure output directory exists
|
||||||
already_seen = set(os.listdir(self.directory))
|
os.makedirs(self.output_directory, exist_ok=True)
|
||||||
|
|
||||||
|
already_seen = set(os.listdir(self.input_directory))
|
||||||
while True:
|
while True:
|
||||||
time.sleep(1) # Check every second
|
time.sleep(1) # Check every second
|
||||||
current_files = set(os.listdir(self.directory))
|
current_files = set(os.listdir(self.input_directory))
|
||||||
new_files = current_files - already_seen
|
new_files = current_files - already_seen
|
||||||
|
|
||||||
for new_file in new_files:
|
for new_file in new_files:
|
||||||
logger.info(f"monitor: new file found: {new_file}")
|
file_path = os.path.join(self.input_directory, new_file)
|
||||||
self.callback(os.path.join(self.directory, new_file))
|
logger.info(f"New file found: {new_file}")
|
||||||
|
self.process_new_file(file_path)
|
||||||
|
|
||||||
already_seen = current_files
|
already_seen = current_files
|
||||||
|
|
@ -1,15 +1,91 @@
|
||||||
|
import requests
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class OllamaClient:
|
class OllamaClient:
|
||||||
def __init__(self, model_name):
|
def __init__(self, model_name: str, base_url: str = "http://localhost:11434"):
|
||||||
|
"""Initialize Ollama client.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model_name (str): Name of the Ollama model to use
|
||||||
|
base_url (str): Base URL of the Ollama server, including scheme, host and
|
||||||
|
port (defaults to "http://localhost:11434")
|
||||||
|
"""
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
self.base_url = base_url
|
||||||
|
self.headers = {"Content-Type": "application/json"}
|
||||||
|
|
||||||
def process_document(self, document_text):
|
def generate(self, prompt: str, strip_think: bool = True) -> str:
|
||||||
# Here you would implement the logic to interact with the Ollama API
|
"""Process a document using the Ollama API.
|
||||||
# and process the document text using the specified model.
|
|
||||||
# This is a placeholder for the actual API call.
|
Args:
|
||||||
processed_text = self._mock_api_call(document_text)
|
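prompt (str): The prompt text to send to the model
strip_think (bool): If True (default), drop a leading <think>...</think> section and return only the stripped text after </think>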
|
||||||
return processed_text
|
|
||||||
|
Returns:
|
||||||
|
str: Processed text response from the model
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RequestException: If the API call fails
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
url = f"{self.base_url}/api/generate"
|
||||||
|
payload = {
|
||||||
|
"model": self.model_name,
|
||||||
|
"prompt": prompt,
|
||||||
|
"stream": False
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(f"Sending request to Ollama API: {url}")
|
||||||
|
response = requests.post(url, json=payload, headers=self.headers)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
logger.debug(f"Received response from Ollama API: {result}")
|
||||||
|
if strip_think:
|
||||||
|
# Remove the "thinking" part from the response
|
||||||
|
# the response is expected to be <think>...</think>response_text
|
||||||
|
# Check if the response contains <think> tag
|
||||||
|
if "<think>" in result.get("response", ""):
|
||||||
|
# Split the response and take the part after </think>
|
||||||
|
response_parts = result["response"].split("</think>")
|
||||||
|
if len(response_parts) > 1:
|
||||||
|
# Return the part after </think>
|
||||||
|
return response_parts[1].strip()
|
||||||
|
else:
|
||||||
|
# If no closing tag, return the full response
|
||||||
|
return result.get("response", "").strip()
|
||||||
|
else:
|
||||||
|
# If no <think> tag, return the full response
|
||||||
|
return result.get("response", "").strip()
|
||||||
|
else:
|
||||||
|
# If strip_think is False, return the full response
|
||||||
|
return result.get("response", "")
|
||||||
|
|
||||||
def _mock_api_call(self, document_text):
|
|
||||||
# Mock processing: In a real implementation, this would call the Ollama API.
|
except requests.exceptions.RequestException as e:
|
||||||
# For now, it just returns the input text with a note indicating it was processed.
|
logger.error(f"Error calling Ollama API: {str(e)}")
|
||||||
return f"Processed with {self.model_name}: {document_text}"
|
raise
|
||||||
|
|
||||||
|
def get_model_info(self) -> Dict[str, Any]:
|
||||||
|
"""Get information about the current model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: Model information
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RequestException: If the API call fails
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
url = f"{self.base_url}/api/show"
|
||||||
|
payload = {"name": self.model_name}
|
||||||
|
|
||||||
|
response = requests.post(url, json=payload, headers=self.headers)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error getting model info: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
关于张三天和北京易见天树有限公司的劳动纠纷
|
||||||
Loading…
Reference in New Issue