From 592fb66f405d5fb01f3b0e430ec84b0ec9344c31 Mon Sep 17 00:00:00 2001 From: tigermren Date: Wed, 23 Apr 2025 01:09:33 +0800 Subject: [PATCH] Enhance document processing with Ollama integration and update .gitignore - Added OllamaClient for document processing in TxtDocumentProcessor. - Updated process_content method to use Ollama API for content masking. - Refactored FileMonitor to utilize DocumentService with OllamaClient. - Removed unnecessary log files and Python cache files. - Added test file for document processing validation. --- .gitignore | 7 +- app.log | 1 - .../logging_config.cpython-311.pyc | Bin 1213 -> 0 bytes .../logging_config.cpython-312.pyc | Bin 1201 -> 0 bytes .../__pycache__/settings.cpython-311.pyc | Bin 1599 -> 0 bytes .../__pycache__/settings.cpython-312.pyc | Bin 1440 -> 0 bytes src/models/processors/txt_processor.py | 32 +++++- .../__pycache__/file_monitor.cpython-312.pyc | Bin 1462 -> 0 bytes src/services/document_service.py | 4 +- src/services/file_monitor.py | 48 +++++++-- src/services/ollama_client.py | 98 ++++++++++++++++-- tests/test.txt | 1 + 12 files changed, 165 insertions(+), 26 deletions(-) delete mode 100644 app.log delete mode 100644 src/config/__pycache__/logging_config.cpython-311.pyc delete mode 100644 src/config/__pycache__/logging_config.cpython-312.pyc delete mode 100644 src/config/__pycache__/settings.cpython-311.pyc delete mode 100644 src/config/__pycache__/settings.cpython-312.pyc delete mode 100644 src/services/__pycache__/file_monitor.cpython-312.pyc create mode 100644 tests/test.txt diff --git a/.gitignore b/.gitignore index 3775a45..20214e2 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,9 @@ temp/ .env.local .env.development.local .env.test.local -.env.production.local \ No newline at end of file +.env.production.local + +src_folder +target_folder +app.log +__pycache__ \ No newline at end of file diff --git a/app.log b/app.log deleted file mode 100644 index c40399a..0000000 --- a/app.log +++ /dev/null @@ -1 +0,0 @@ -2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md diff --git a/src/config/__pycache__/logging_config.cpython-311.pyc b/src/config/__pycache__/logging_config.cpython-311.pyc deleted file mode 100644 index 06116baa3a9b6dc6d2f892349e200b1f691f70f4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1213 zcmZWn&1(}u6rcUfCN&@ZNZNw2K`nKWWbrB@)Y{ZSO+l-NUc#E0rrU0^WoBAzP9FRp zdWldxDB?d;CE_x85X3`o3q9${H=AsLnlRLn5PUJIxDI?i17 zLSAajn7~*12F~_XoWprsXsZzYn?H%C`Wn=o?ifOcFiQrP8Wr^9hUw`88XUN7=)g8a>WEP01zjtm#)=u8lHX5_A}uh+3oZbX&qV{^1o3eIp*=asc%7=r>EAaW{~m#48!+Km7bq9zFeEbAYw?wsCUP5_ zYisL`wdTrZqrSfO0bNK5zaKfz7&Pf{4|*kC=OHWlEOYXGm%PkrKbZfMk(cL(%Ui37BP|^ptItK%!HYnNQ?80Eyi43iniI4=%m^PyVDpkYz8)30LC@zJiTT$AWfw+VmD&77YnyU=v z>~Ll_nlLAUAR(WdG$SOTg3=S={PolN+1%|>srho0(zC#npqT|Ty0b8)MhM_lE_$G- LN-aYFL)`lx3UwfF-+RCJe)D@T^D&XI5uDR6+U_q6q3<#n zod|*8>9$u#OF)?OCIy^Dksa4ChIN&6esbI2LtX`U@Dk z*uwFaj_v>9z?!_EYXV7SbVXX$m#V4E+ul@ik145f= za?EeJE<0gG#&LMa9bFjJz++6r2Nax8VFcue1j1xdZ*zLYRq_KeWD<(!7*wv7#-)+# z6xI$DG`uKDaEk_g9?m!kBw8sx==sBSM)U zHbf5l%zGY?3OV+foWCOF04VISVn(Or{jniZg9lK*^<O7 zhnpk!gIqiu1;HPqHVf+wpZi|mH%Vc*q5{mji3%XN?K z6Y4TrbqCb$P()=A77tED0^ZEwL9{W@KnuXW06#VZ<^}qpPrNR_EO+#LThF%|r}{Fz z0)S(SomD1+pM|ZbGO8P5zw^m1qxJ+!UK3m-oR#dmX zu}UvPTO@_%r6;eX^ODNC4bSEo$l;>XVz`cx_v{-*gegdHhXE5UGC0Z>t^zu%wjvU*v;PP$Bh}gmqgj6e}G(} zlXKcRr#my-pD-uwekwU>_6jII(Ve^cw)82tbWSpt@5l56Tw$io{>}L*^+odRo}wzn GXof!_!ZmRK diff --git a/src/config/__pycache__/settings.cpython-311.pyc b/src/config/__pycache__/settings.cpython-311.pyc deleted file mode 100644 index addb530c967e658ea9ed4cd82c40a405694dc929..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1599 zcmZux&rjP(6dotzU?31uXn+z*swxs`l~{s3R1sPcAcb^IT!l~xUo3AtglruWzw?IQ7kh@I&31H=o}(GjIIfn`i$qF)@aq zt^S3N@?nJjWn|dIYvr;G$~$BtQ?O7|&;)_$AuH4jYheL}&^9u|zakS8;T+ER0AtWJ z96|)21NktVLYiobdngn6$YwGEjcyno*(1K+u@d%)SW7UOJ&F`kHrkch;22ttN2XxfxH3n9-@X&7gFNp2bbk?cEi z(>b<%V#?0jp?CdqiIY%0BM8#k&&PKb4fH;Ly#$Qh50u>)SJmEA(M9Zw%*)6Hz!Or>}7>D_#K zkH*;YR-sxP?in#!t(@i5sl*LSO7ULN2-o?)d<4pE(Xf&+LmGx+5lsviBVNUZ7n9p` z?88RAjdR34X^Y4HVfFq(@->4Ip*zy)Rfh(;yytk*aMe2r?HJw_v zX?SmmD|>EDu5s^V@A`AewN7bd;IW&41Lt9Qj{!P2y+1wQmHNr2fi&Nrn(d0LyGyXe zeF9L<#N9bYn5E0oum@9v$P)I+4s`{hUa<@ulJv<1$h4bYJlOn*28NT z5wx({DW7cycmS6 zlFeoh9(0hg7r(Xt!U*hf|HNJbelZ7w!Jhh3x}~Ggr{10IZh7p1zWROi`{{ST)BR4r z$6^t{#;f1p&5Y;30+!@}wq!^Ww?j^-EgP}~LSP42@-tvTES<3& zd>AkUFgzOrFqE#655MG;muPg;^zl9p0^4o*j8(l8*q&=T%}Kbp!3kXa%8cg#14FWa z5jtQ+7&3-fu|jv4$C8b(ASg&NW(0)=%^0d6RnV+4D<~pp&WPYSmL)1EW<-T=UQpbK z30e@8Fy;ka5tKCIf>NBs(yy|mr_Vy;dX0tdVm?FF>H2z~mkxYP{8C`IFu`tV8=sUc zyJh=Nx+MpYMi}TMe||(Cg)$3c6oz?DoF=m}G%T zS~!^IyV@w%cPb6EU9GdUdi`*mG5t6QI{rqfWOnSL=Qw8DEH=G1o!2$3yjMo$`&D#M z*C@F3Dv|UCGQ!!u&39mYz|bdH7fN7WsSmW?XFH|HGKzZl?N5A z_gQYu^qYa*#(6(m$mZ5u^MmQ&6YTtt+t~Nb77L^ip5U%t-zztIiQHi!*DmC&Y;JcW zx3`hor*ZDyE;lOEm>ILvDLP(@s^Z0}R=Lz;(s<9nMTQyw^%(CkD`Y^{TS^z^m3Q&! zZWgyZ_sDMbU=h0~J>@t!Dtti|({a3SYxyu$5#nVdL?Z~bJ?q%vb`+tf$EH{iBbB|0 zj%)|h#MDCAZF&|v4OGG31;qRXT=2r)ZGOydv(_m!MRg^Aa<+<{E)9<>CTW&%iCz8` zCeOeqxpbR~JtjAq)ba_!KL_E_~QaO}{DO zKR6l=x*fJ(g#V;O^;G>3aCvQ1y)$lvugDq;<{jsM!sN9qNz%VzAbt7=SbhW3Qs(sg SH-O##UI)?C&l^9eqLTLy!GN~_ diff --git a/src/models/processors/txt_processor.py b/src/models/processors/txt_processor.py index 61e920e..413b0fc 100644 --- a/src/models/processors/txt_processor.py +++ b/src/models/processors/txt_processor.py @@ -1,17 +1,45 @@ from models.document_processor import DocumentProcessor +from services.ollama_client import OllamaClient +import textwrap +import logging +from config.settings import settings +logger = logging.getLogger(__name__) class TxtDocumentProcessor(DocumentProcessor): def __init__(self, input_path: str, output_path: str): self.input_path = input_path self.output_path = output_path + self.ollama_client = OllamaClient(model_name=settings.OLLAMA_MODEL, base_url=settings.OLLAMA_API_URL) def read_content(self) -> str: with open(self.input_path, 'r', encoding='utf-8') as file: return file.read() def process_content(self, content: str) -> str: - # Implementation for processing text content - return content + prompt = textwrap.dedent(""" + 您是一位专业的法律文档脱敏专家。请按照以下规则对文本进行脱敏处理: + + 规则: + 1. 人名: + - 两字名改为"姓+某"(如:张三 → 张某) + - 三字名改为"姓+某某"(如:张三丰 → 张某某) + 2. 公司名: + - 保留地理位置信息(如:北京、上海等) + - 保留公司类型(如:有限公司、股份公司等) + - 用"某"替换核心名称 + 3. 保持原文其他部分不变 + 4. 确保脱敏后的文本保持原有的语言流畅性和可读性 + + 输入文本: + {text} + + 请直接输出脱敏后的文本,无需解释或其他备注。 + """) + + formatted_prompt = prompt.format(text=content) + response = self.ollama_client.generate(formatted_prompt) + logger.debug(f"Processed content: {response}") + return response def save_content(self, content: str) -> None: with open(self.output_path, 'w', encoding='utf-8') as file: diff --git a/src/services/__pycache__/file_monitor.cpython-312.pyc b/src/services/__pycache__/file_monitor.cpython-312.pyc deleted file mode 100644 index cb48db24c3a91ec2b2af5200b71c04d6310f7911..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1462 zcmb_c&2JM&6o30+XE*-9VA_P>!h)dE9>7vXTjW+bwCNWjse0(e5?U?06SK}{9cDIE zw$@b+Ik-})l9LlvRp11OLwn?Z=mq4Wx)TnS;F4Q}50!fAo3%Hw(jNN`Z{ECp-|sj3 zYc4klNPDl;TYty^FU2JxbtDYtQ0RgSuH-`%WFSyS+oMF}$I?(&Rb==KTzMT_wr(fW?(5LRBqnyuww z(U(^+n`L>}fn~K$jSv1zIcRDf_#?5A~VwC;)#kZ%kD0AsPS8n_k1%y2dImI;>|j4OxF7URzuTL7uaoWPPxP~eAFh3O?e6W3`4^?L_v8nY_a`4-`FZxy zY|p6lN*{FQ`_mU6W}i)8>3s3Bbn&kCqVQ%nvzd8%=KUwyFRiDgkA5qB+%KH$<~DO% zH+O2=wH<%kA4Yg`wRdLjdEt}&6ukBJz6K?;bA7)E$BI+{cEL=E=N8z@#IpFLh26QU z_!=#qwXEB#wm8Lyz diff --git a/src/services/document_service.py b/src/services/document_service.py index 6af2377..6a42e62 100644 --- a/src/services/document_service.py +++ b/src/services/document_service.py @@ -19,10 +19,10 @@ class DocumentService: content = processor.read_content() # Process with Ollama - processed_content = self.ollama_client.process_document(content) + masked_content = processor.process_content(content) # Save processed content - processor.save_content(processed_content) + processor.save_content(masked_content) return True except Exception as e: diff --git a/src/services/file_monitor.py b/src/services/file_monitor.py index d6b8f8d..c44040d 100644 --- a/src/services/file_monitor.py +++ b/src/services/file_monitor.py @@ -1,24 +1,54 @@ import logging +import os +from services.document_service import DocumentService +from services.ollama_client import OllamaClient +from config.settings import settings logger = logging.getLogger(__name__) class FileMonitor: - def __init__(self, directory, callback): - self.directory = directory - self.callback = callback + def __init__(self, input_directory: str, output_directory: str): + self.input_directory = input_directory + self.output_directory = output_directory + + # Create OllamaClient instance using settings + ollama_client = OllamaClient( + model_name=settings.OLLAMA_MODEL, + base_url=settings.OLLAMA_API_URL + ) + # Inject OllamaClient into DocumentService + self.document_service = DocumentService(ollama_client=ollama_client) + + def process_new_file(self, file_path: str) -> None: + try: + # Get the filename without directory path + filename = os.path.basename(file_path) + # Create output path + output_path = os.path.join(self.output_directory, filename) + + logger.info(f"Processing file: {filename}") + # Process the document using document service + self.document_service.process_document(file_path, output_path) + logger.info(f"File processed successfully: {filename}") + + except Exception as e: + logger.error(f"Error processing file {file_path}: {str(e)}") def start_monitoring(self): import time - import os - - already_seen = set(os.listdir(self.directory)) + + # Ensure output directory exists + os.makedirs(self.output_directory, exist_ok=True) + + already_seen = set(os.listdir(self.input_directory)) while True: time.sleep(1) # Check every second - current_files = set(os.listdir(self.directory)) + current_files = set(os.listdir(self.input_directory)) new_files = current_files - already_seen for new_file in new_files: - logger.info(f"monitor: new file found: {new_file}") - self.callback(os.path.join(self.directory, new_file)) + file_path = os.path.join(self.input_directory, new_file) + logger.info(f"New file found: {new_file}") + self.process_new_file(file_path) already_seen = current_files \ No newline at end of file diff --git a/src/services/ollama_client.py b/src/services/ollama_client.py index e8c64ad..b1dfa96 100644 --- a/src/services/ollama_client.py +++ b/src/services/ollama_client.py @@ -1,15 +1,91 @@ +import requests +import logging +from typing import Dict, Any + +logger = logging.getLogger(__name__) + class OllamaClient: - def __init__(self, model_name): + def __init__(self, model_name: str, base_url: str = "http://localhost:11434"): + """Initialize Ollama client. + + Args: + model_name (str): Name of the Ollama model to use + host (str): Ollama server host address + port (int): Ollama server port + """ self.model_name = model_name + self.base_url = base_url + self.headers = {"Content-Type": "application/json"} - def process_document(self, document_text): - # Here you would implement the logic to interact with the Ollama API - # and process the document text using the specified model. - # This is a placeholder for the actual API call. - processed_text = self._mock_api_call(document_text) - return processed_text + def generate(self, prompt: str, strip_think: bool = True) -> str: + """Process a document using the Ollama API. + + Args: + document_text (str): The text content to process + + Returns: + str: Processed text response from the model + + Raises: + RequestException: If the API call fails + """ + try: + url = f"{self.base_url}/api/generate" + payload = { + "model": self.model_name, + "prompt": prompt, + "stream": False + } + + logger.debug(f"Sending request to Ollama API: {url}") + response = requests.post(url, json=payload, headers=self.headers) + response.raise_for_status() + + result = response.json() + logger.debug(f"Received response from Ollama API: {result}") + if strip_think: + # Remove the "thinking" part from the response + # the response is expected to be ...response_text + # Check if the response contains tag + if "" in result.get("response", ""): + # Split the response and take the part after + response_parts = result["response"].split("") + if len(response_parts) > 1: + # Return the part after + return response_parts[1].strip() + else: + # If no closing tag, return the full response + return result.get("response", "").strip() + else: + # If no tag, return the full response + return result.get("response", "").strip() + else: + # If strip_think is False, return the full response + return result.get("response", "") - def _mock_api_call(self, document_text): - # Mock processing: In a real implementation, this would call the Ollama API. - # For now, it just returns the input text with a note indicating it was processed. - return f"Processed with {self.model_name}: {document_text}" \ No newline at end of file + + except requests.exceptions.RequestException as e: + logger.error(f"Error calling Ollama API: {str(e)}") + raise + + def get_model_info(self) -> Dict[str, Any]: + """Get information about the current model. + + Returns: + Dict[str, Any]: Model information + + Raises: + RequestException: If the API call fails + """ + try: + url = f"{self.base_url}/api/show" + payload = {"name": self.model_name} + + response = requests.post(url, json=payload, headers=self.headers) + response.raise_for_status() + + return response.json() + + except requests.exceptions.RequestException as e: + logger.error(f"Error getting model info: {str(e)}") + raise \ No newline at end of file diff --git a/tests/test.txt b/tests/test.txt new file mode 100644 index 0000000..c67c623 --- /dev/null +++ b/tests/test.txt @@ -0,0 +1 @@ +关于张三天和北京易见天树有限公司的劳动纠纷 \ No newline at end of file