From 0904ab50730ac1c0c2e7a51d76ae496410d3242f Mon Sep 17 00:00:00 2001 From: tigermren Date: Wed, 23 Apr 2025 00:02:10 +0800 Subject: [PATCH] Initial commit: Document processing app with Ollama integration --- .dockerignore | 13 ++++ .env.example | 19 ++++++ Dockerfile | 48 +++++++++++++++ README.md | 58 ++++++++++++++++++ app.log | 1 + requirements.txt | 10 +++ .../logging_config.cpython-311.pyc | Bin 0 -> 1213 bytes .../logging_config.cpython-312.pyc | Bin 0 -> 1201 bytes .../__pycache__/settings.cpython-311.pyc | Bin 0 -> 1599 bytes .../__pycache__/settings.cpython-312.pyc | Bin 0 -> 1440 bytes src/config/logging_config.py | 39 ++++++++++++ src/config/settings.py | 31 ++++++++++ src/main.py | 17 +++++ src/models/document.py | 12 ++++ src/models/document_factory.py | 25 ++++++++ src/models/document_processor.py | 18 ++++++ src/models/processors/__init__.py | 5 ++ src/models/processors/docx_processor.py | 20 ++++++ src/models/processors/pdf_processor.py | 20 ++++++ src/models/processors/txt_processor.py | 18 ++++++ .../__pycache__/file_monitor.cpython-312.pyc | Bin 0 -> 1462 bytes src/services/document_service.py | 30 +++++++++ src/services/file_monitor.py | 24 ++++++++ src/services/ollama_client.py | 15 +++++ src/utils/file_utils.py | 20 ++++++ src_folder/README.md | 58 ++++++++++++++++++ 26 files changed, 501 insertions(+) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 app.log create mode 100644 requirements.txt create mode 100644 src/config/__pycache__/logging_config.cpython-311.pyc create mode 100644 src/config/__pycache__/logging_config.cpython-312.pyc create mode 100644 src/config/__pycache__/settings.cpython-311.pyc create mode 100644 src/config/__pycache__/settings.cpython-312.pyc create mode 100644 src/config/logging_config.py create mode 100644 src/config/settings.py create mode 100644 src/main.py create mode 100644 src/models/document.py create mode 100644 
src/models/document_factory.py create mode 100644 src/models/document_processor.py create mode 100644 src/models/processors/__init__.py create mode 100644 src/models/processors/docx_processor.py create mode 100644 src/models/processors/pdf_processor.py create mode 100644 src/models/processors/txt_processor.py create mode 100644 src/services/__pycache__/file_monitor.cpython-312.pyc create mode 100644 src/services/document_service.py create mode 100644 src/services/file_monitor.py create mode 100644 src/services/ollama_client.py create mode 100644 src/utils/file_utils.py create mode 100644 src_folder/README.md diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d4c881d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env/ +venv/ +.env +*.log +.git +.gitignore +.pytest_cache +tests/ \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9d89af2 --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +# Storage paths +OBJECT_STORAGE_PATH=/path/to/mounted/object/storage +TARGET_DIRECTORY_PATH=/path/to/target/directory + +# Ollama API Configuration +OLLAMA_API_URL=https://api.ollama.com +OLLAMA_API_KEY=your_api_key_here +OLLAMA_MODEL=llama2 + +# Application Settings +MONITOR_INTERVAL=5 + +# Logging Configuration +LOG_LEVEL=INFO +LOG_FILE=app.log + +# Optional: Additional security settings +# MAX_FILE_SIZE=10485760 # 10MB in bytes +# ALLOWED_FILE_TYPES=.txt,.doc,.docx,.pdf \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ecc62ad --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +# Build stage +FROM python:3.12-slim AS builder + +WORKDIR /app + +# Install build dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first to leverage Docker cache +COPY requirements.txt . 
+RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt + +# Final stage +FROM python:3.12-slim + +WORKDIR /app + +# Create non-root user +RUN useradd -m -r appuser && \ + chown appuser:appuser /app + +# Copy wheels from builder +COPY --from=builder /app/wheels /wheels +COPY --from=builder /app/requirements.txt . + +# Install dependencies +RUN pip install --no-cache /wheels/* + +# Copy application code +COPY src/ ./src/ + +# Create directories for mounted volumes +RUN mkdir -p /data/input /data/output && \ + chown -R appuser:appuser /data + +# Switch to non-root user +USER appuser + +# Environment variables +ENV PYTHONPATH=/app \ + OBJECT_STORAGE_PATH=/data/input \ + TARGET_DIRECTORY_PATH=/data/output + +# Run the application +CMD ["python", "src/main.py"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..dc3df6e --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +# README.md + +# Document Processing App + +This project is designed to process legal documents by hiding sensitive information such as names and company names. It utilizes the Ollama API with selected models for text processing. The application monitors a specified directory for new files, processes them automatically, and saves the results to a target path. 
+ +## Project Structure + +``` +doc-processing-app +├── src +│ ├── main.py # Entry point of the application +│ ├── config +│ │ └── settings.py # Configuration settings for paths +│ ├── services +│ │ ├── file_monitor.py # Monitors directory for new files +│ │ ├── document_service.py # Handles document processing logic +│ │ └── ollama_client.py # Interacts with the Ollama API +│ ├── utils +│ │ └── file_utils.py # Utility functions for file operations +│ └── models +│ └── document.py # Represents the structure of a document +├── tests +│ └── test_document_processor.py # Unit tests for DocumentProcessor +├── requirements.txt # Project dependencies +├── .env.example # Example environment variables +└── README.md # Project documentation +``` + +## Setup Instructions + +1. Clone the repository: + ``` + git clone <repository-url> + cd doc-processing-app + ``` + +2. Install the required dependencies: + ``` + pip install -r requirements.txt + ``` + +3. Configure the application by editing the `src/config/settings.py` file to set the paths for the object storage and target directory. + +4. Create a `.env` file based on the `.env.example` file to set up necessary environment variables. + +## Usage + +To run the application, execute the following command: +``` +python src/main.py +``` + +The application will start monitoring the specified directory for new documents. Once a new document is added, it will be processed automatically. + +## Contributing + +Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes. 
\ No newline at end of file diff --git a/app.log b/app.log new file mode 100644 index 0000000..c40399a --- /dev/null +++ b/app.log @@ -0,0 +1 @@ +2025-04-20 20:14:00 - services.file_monitor - INFO - monitor: new file found: README.md diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c5280ad --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# Base dependencies +pydantic-settings>=2.0.0 +python-dotenv==1.0.0 +watchdog==2.1.6 +requests==2.26.0 + +# Document processing +python-docx>=0.8.11 +PyPDF2>=3.0.0 +pandas>=2.0.0 diff --git a/src/config/__pycache__/logging_config.cpython-311.pyc b/src/config/__pycache__/logging_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06116baa3a9b6dc6d2f892349e200b1f691f70f4 GIT binary patch literal 1213 zcmZWn&1(}u6rcUfCN&@ZNZNw2K`nKWWbrB@)Y{ZSO+l-NUc#E0rrU0^WoBAzP9FRp zdWldxDB?d;CE_x85X3`o3q9${H=AsLnlRLn5PUJIxDI?i17 zLSAajn7~*12F~_XoWprsXsZzYn?H%C`Wn=o?ifOcFiQrP8Wr^9hUw`88XUN7=)g8a>WEP01zjtm#)=u8lHX5_A}uh+3oZbX&qV{^1o3eIp*=asc%7=r>EAaW{~m#48!+Km7bq9zFeEbAYw?wsCUP5_ zYisL`wdTrZqrSfO0bNK5zaKfz7&Pf{4|*kC=OHWlEOYXGm%PkrKbZfMk(cL(%Ui37BP|^ptItK%!HYnNQ?80Eyi43iniI4=%m^PyVDpkYz8)30LC@zJiTT$AWfw+VmD&77YnyU=v z>~Ll_nlLAUAR(WdG$SOTg3=S={PolN+1%|>srho0(zC#npqT|Ty0b8)MhM_lE_$G- LN-aYFL)`lx3UwfF-+RCJe)D@T^D&XI5uDR6+U_q6q3<#n zod|*8>9$u#OF)?OCIy^Dksa4ChIN&6esbI2LtX`U@Dk z*uwFaj_v>9z?!_EYXV7SbVXX$m#V4E+ul@ik145f= za?EeJE<0gG#&LMa9bFjJz++6r2Nax8VFcue1j1xdZ*zLYRq_KeWD<(!7*wv7#-)+# z6xI$DG`uKDaEk_g9?m!kBw8sx==sBSM)U zHbf5l%zGY?3OV+foWCOF04VISVn(Or{jniZg9lK*^<O7 zhnpk!gIqiu1;HPqHVf+wpZi|mH%Vc*q5{mji3%XN?K z6Y4TrbqCb$P()=A77tED0^ZEwL9{W@KnuXW06#VZ<^}qpPrNR_EO+#LThF%|r}{Fz z0)S(SomD1+pM|ZbGO8P5zw^m1qxJ+!UK3m-oR#dmX zu}UvPTO@_%r6;eX^ODNC4bSEo$l;>XVz`cx_v{-*gegdHhXE5UGC0Z>t^zu%wjvU*v;PP$Bh}gmqgj6e}G(} zlXKcRr#my-pD-uwekwU>_6jII(Ve^cw)82tbWSpt@5l56Tw$io{>}L*^+odRo}wzn GXof!_!ZmRK literal 0 HcmV?d00001 diff --git a/src/config/__pycache__/settings.cpython-311.pyc 
b/src/config/__pycache__/settings.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..addb530c967e658ea9ed4cd82c40a405694dc929 GIT binary patch literal 1599 zcmZux&rjP(6dotzU?31uXn+z*swxs`l~{s3R1sPcAcb^IT!l~xUo3AtglruWzw?IQ7kh@I&31H=o}(GjIIfn`i$qF)@aq zt^S3N@?nJjWn|dIYvr;G$~$BtQ?O7|&;)_$AuH4jYheL}&^9u|zakS8;T+ER0AtWJ z96|)21NktVLYiobdngn6$YwGEjcyno*(1K+u@d%)SW7UOJ&F`kHrkch;22ttN2XxfxH3n9-@X&7gFNp2bbk?cEi z(>b<%V#?0jp?CdqiIY%0BM8#k&&PKb4fH;Ly#$Qh50u>)SJmEA(M9Zw%*)6Hz!Or>}7>D_#K zkH*;YR-sxP?in#!t(@i5sl*LSO7ULN2-o?)d<4pE(Xf&+LmGx+5lsviBVNUZ7n9p` z?88RAjdR34X^Y4HVfFq(@->4Ip*zy)Rfh(;yytk*aMe2r?HJw_v zX?SmmD|>EDu5s^V@A`AewN7bd;IW&41Lt9Qj{!P2y+1wQmHNr2fi&Nrn(d0LyGyXe zeF9L<#N9bYn5E0oum@9v$P)I+4s`{hUa<@ulJv<1$h4bYJlOn*28NT z5wx({DW7cycmS6 zlFeoh9(0hg7r(Xt!U*hf|HNJbelZ7w!Jhh3x}~Ggr{10IZh7p1zWROi`{{ST)BR4r z$6^t{#;f1p&5Y;30+!@}wq!^Ww?j^-EgP}~LSP42@-tvTES<3& zd>AkUFgzOrFqE#655MG;muPg;^zl9p0^4o*j8(l8*q&=T%}Kbp!3kXa%8cg#14FWa z5jtQ+7&3-fu|jv4$C8b(ASg&NW(0)=%^0d6RnV+4D<~pp&WPYSmL)1EW<-T=UQpbK z30e@8Fy;ka5tKCIf>NBs(yy|mr_Vy;dX0tdVm?FF>H2z~mkxYP{8C`IFu`tV8=sUc zyJh=Nx+MpYMi}TMe||(Cg)$3c6oz?DoF=m}G%T zS~!^IyV@w%cPb6EU9GdUdi`*mG5t6QI{rqfWOnSL=Qw8DEH=G1o!2$3yjMo$`&D#M z*C@F3Dv|UCGQ!!u&39mYz|bdH7fN7WsSmW?XFH|HGKzZl?N5A z_gQYu^qYa*#(6(m$mZ5u^MmQ&6YTtt+t~Nb77L^ip5U%t-zztIiQHi!*DmC&Y;JcW zx3`hor*ZDyE;lOEm>ILvDLP(@s^Z0}R=Lz;(s<9nMTQyw^%(CkD`Y^{TS^z^m3Q&! 
zZWgyZ_sDMbU=h0~J>@t!Dtti|({a3SYxyu$5#nVdL?Z~bJ?q%vb`+tf$EH{iBbB|0 zj%)|h#MDCAZF&|v4OGG31;qRXT=2r)ZGOydv(_m!MRg^Aa<+<{E)9<>CTW&%iCz8` zCeOeqxpbR~JtjAq)ba_!KL_E_~QaO}{DO zKR6l=x*fJ(g#V;O^;G>3aCvQ1y)$lvugDq;<{jsM!sN9qNz%VzAbt7=SbhW3Qs(sg SH-O##UI)?C&l^9eqLTLy!GN~_ literal 0 HcmV?d00001 diff --git a/src/config/logging_config.py b/src/config/logging_config.py new file mode 100644 index 0000000..e30f704 --- /dev/null +++ b/src/config/logging_config.py @@ -0,0 +1,39 @@ +import logging.config +from config.settings import settings + +LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": settings.LOG_FORMAT, + "datefmt": settings.LOG_DATE_FORMAT + }, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "standard", + "level": settings.LOG_LEVEL, + "stream": "ext://sys.stdout" + }, + "file": { + "class": "logging.FileHandler", + "formatter": "standard", + "level": settings.LOG_LEVEL, + "filename": settings.LOG_FILE, + "mode": "a", + } + }, + "loggers": { + "": { # root logger + "handlers": ["console", "file"], + "level": settings.LOG_LEVEL, + "propagate": True + } + } +} + +def setup_logging(): + """Initialize logging configuration""" + logging.config.dictConfig(LOGGING_CONFIG) \ No newline at end of file diff --git a/src/config/settings.py b/src/config/settings.py new file mode 100644 index 0000000..4476db6 --- /dev/null +++ b/src/config/settings.py @@ -0,0 +1,31 @@ +# settings.py + +from pydantic_settings import BaseSettings +from typing import Optional + +class Settings(BaseSettings): + # Storage paths + OBJECT_STORAGE_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/src_folder" + TARGET_DIRECTORY_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/target_folder" + + # Ollama API settings + OLLAMA_API_URL: str = "https://api.ollama.com" + OLLAMA_API_KEY: str = "" + OLLAMA_MODEL: str = "llama2" + + # File monitoring settings + MONITOR_INTERVAL: int = 5 + + # 
Logging settings + LOG_LEVEL: str = "INFO" + LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S" + LOG_FILE: str = "app.log" + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + extra = "allow" + +# Create settings instance +settings = Settings() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..c292bd0 --- /dev/null +++ b/src/main.py @@ -0,0 +1,17 @@ +from config.logging_config import setup_logging + +def main(): + # Setup logging first + setup_logging() + + from services.file_monitor import FileMonitor + from config.settings import settings + + # Initialize the file monitor + file_monitor = FileMonitor(settings.OBJECT_STORAGE_PATH, settings.TARGET_DIRECTORY_PATH) + + # Start monitoring the directory for new files + file_monitor.start_monitoring() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/models/document.py b/src/models/document.py new file mode 100644 index 0000000..d68b501 --- /dev/null +++ b/src/models/document.py @@ -0,0 +1,12 @@ +class Document: + def __init__(self, file_path): + self.file_path = file_path + self.content = "" + + def load(self): + with open(self.file_path, 'r') as file: + self.content = file.read() + + def save(self, target_path): + with open(target_path, 'w') as file: + file.write(self.content) \ No newline at end of file diff --git a/src/models/document_factory.py b/src/models/document_factory.py new file mode 100644 index 0000000..4b3803e --- /dev/null +++ b/src/models/document_factory.py @@ -0,0 +1,25 @@ +import os +from typing import Optional +from models.document_processor import DocumentProcessor +from models.processors import ( + TxtDocumentProcessor, + DocxDocumentProcessor, + PdfDocumentProcessor +) + +class DocumentProcessorFactory: + @staticmethod + def create_processor(input_path: str, output_path: str) -> Optional[DocumentProcessor]: + file_extension = 
os.path.splitext(input_path)[1].lower() + + processors = { + '.txt': TxtDocumentProcessor, + '.docx': DocxDocumentProcessor, + '.doc': DocxDocumentProcessor, + '.pdf': PdfDocumentProcessor + } + + processor_class = processors.get(file_extension) + if processor_class: + return processor_class(input_path, output_path) + return None \ No newline at end of file diff --git a/src/models/document_processor.py b/src/models/document_processor.py new file mode 100644 index 0000000..a23336e --- /dev/null +++ b/src/models/document_processor.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from typing import Any + +class DocumentProcessor(ABC): + @abstractmethod + def read_content(self) -> str: + """Read document content""" + pass + + @abstractmethod + def process_content(self, content: str) -> str: + """Process document content""" + pass + + @abstractmethod + def save_content(self, content: str) -> None: + """Save processed content""" + pass \ No newline at end of file diff --git a/src/models/processors/__init__.py b/src/models/processors/__init__.py new file mode 100644 index 0000000..76b85c6 --- /dev/null +++ b/src/models/processors/__init__.py @@ -0,0 +1,5 @@ +from models.processors.txt_processor import TxtDocumentProcessor +from models.processors.docx_processor import DocxDocumentProcessor +from models.processors.pdf_processor import PdfDocumentProcessor + +__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor'] \ No newline at end of file diff --git a/src/models/processors/docx_processor.py b/src/models/processors/docx_processor.py new file mode 100644 index 0000000..bfbfba2 --- /dev/null +++ b/src/models/processors/docx_processor.py @@ -0,0 +1,20 @@ +import docx +from models.document_processor import DocumentProcessor + +class DocxDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + self.input_path = input_path + self.output_path = output_path + + def read_content(self) -> str: + doc = 
docx.Document(self.input_path) + return '\n'.join([paragraph.text for paragraph in doc.paragraphs]) + + def process_content(self, content: str) -> str: + # Implementation for processing docx content + return content + + def save_content(self, content: str) -> None: + doc = docx.Document() + doc.add_paragraph(content) + doc.save(self.output_path) \ No newline at end of file diff --git a/src/models/processors/pdf_processor.py b/src/models/processors/pdf_processor.py new file mode 100644 index 0000000..4d73d54 --- /dev/null +++ b/src/models/processors/pdf_processor.py @@ -0,0 +1,20 @@ +import PyPDF2 +from models.document_processor import DocumentProcessor + +class PdfDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + self.input_path = input_path + self.output_path = output_path + + def read_content(self) -> str: + with open(self.input_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + return ' '.join([page.extract_text() for page in pdf_reader.pages]) + + def process_content(self, content: str) -> str: + # Implementation for processing PDF content + return content + + def save_content(self, content: str) -> None: + # Implementation for saving as PDF + pass \ No newline at end of file diff --git a/src/models/processors/txt_processor.py b/src/models/processors/txt_processor.py new file mode 100644 index 0000000..61e920e --- /dev/null +++ b/src/models/processors/txt_processor.py @@ -0,0 +1,18 @@ +from models.document_processor import DocumentProcessor + +class TxtDocumentProcessor(DocumentProcessor): + def __init__(self, input_path: str, output_path: str): + self.input_path = input_path + self.output_path = output_path + + def read_content(self) -> str: + with open(self.input_path, 'r', encoding='utf-8') as file: + return file.read() + + def process_content(self, content: str) -> str: + # Implementation for processing text content + return content + + def save_content(self, content: str) -> None: + with 
open(self.output_path, 'w', encoding='utf-8') as file: + file.write(content) \ No newline at end of file diff --git a/src/services/__pycache__/file_monitor.cpython-312.pyc b/src/services/__pycache__/file_monitor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb48db24c3a91ec2b2af5200b71c04d6310f7911 GIT binary patch literal 1462 zcmb_c&2JM&6o30+XE*-9VA_P>!h)dE9>7vXTjW+bwCNWjse0(e5?U?06SK}{9cDIE zw$@b+Ik-})l9LlvRp11OLwn?Z=mq4Wx)TnS;F4Q}50!fAo3%Hw(jNN`Z{ECp-|sj3 zYc4klNPDl;TYty^FU2JxbtDYtQ0RgSuH-`%WFSyS+oMF}$I?(&Rb==KTzMT_wr(fW?(5LRBqnyuww z(U(^+n`L>}fn~K$jSv1zIcRDf_#?5A~VwC;)#kZ%kD0AsPS8n_k1%y2dImI;>|j4OxF7URzuTL7uaoWPPxP~eAFh3O?e6W3`4^?L_v8nY_a`4-`FZxy zY|p6lN*{FQ`_mU6W}i)8>3s3Bbn&kCqVQ%nvzd8%=KUwyFRiDgkA5qB+%KH$<~DO% zH+O2=wH<%kA4Yg`wRdLjdEt}&6ukBJz6K?;bA7)E$BI+{cEL=E=N8z@#IpFLh26QU z_!=#qwXEB#wm8Lyz literal 0 HcmV?d00001 diff --git a/src/services/document_service.py b/src/services/document_service.py new file mode 100644 index 0000000..6af2377 --- /dev/null +++ b/src/services/document_service.py @@ -0,0 +1,30 @@ +import logging +from models.document_factory import DocumentProcessorFactory +from services.ollama_client import OllamaClient + +logger = logging.getLogger(__name__) + +class DocumentService: + def __init__(self, ollama_client: OllamaClient): + self.ollama_client = ollama_client + + def process_document(self, input_path: str, output_path: str) -> bool: + try: + processor = DocumentProcessorFactory.create_processor(input_path, output_path) + if not processor: + logger.error(f"Unsupported file format: {input_path}") + return False + + # Read content + content = processor.read_content() + + # Process with Ollama + processed_content = self.ollama_client.process_document(content) + + # Save processed content + processor.save_content(processed_content) + return True + + except Exception as e: + logger.error(f"Error processing document {input_path}: {str(e)}") + return False \ No newline at end of file diff --git 
a/src/services/file_monitor.py b/src/services/file_monitor.py new file mode 100644 index 0000000..d6b8f8d --- /dev/null +++ b/src/services/file_monitor.py @@ -0,0 +1,24 @@ +import logging + +logger = logging.getLogger(__name__) + +class FileMonitor: + def __init__(self, directory, callback): + self.directory = directory + self.callback = callback + + def start_monitoring(self): + import time + import os + + already_seen = set(os.listdir(self.directory)) + while True: + time.sleep(1) # Check every second + current_files = set(os.listdir(self.directory)) + new_files = current_files - already_seen + + for new_file in new_files: + logger.info(f"monitor: new file found: {new_file}") + self.callback(os.path.join(self.directory, new_file)) + + already_seen = current_files \ No newline at end of file diff --git a/src/services/ollama_client.py b/src/services/ollama_client.py new file mode 100644 index 0000000..e8c64ad --- /dev/null +++ b/src/services/ollama_client.py @@ -0,0 +1,15 @@ +class OllamaClient: + def __init__(self, model_name): + self.model_name = model_name + + def process_document(self, document_text): + # Here you would implement the logic to interact with the Ollama API + # and process the document text using the specified model. + # This is a placeholder for the actual API call. + processed_text = self._mock_api_call(document_text) + return processed_text + + def _mock_api_call(self, document_text): + # Mock processing: In a real implementation, this would call the Ollama API. + # For now, it just returns the input text with a note indicating it was processed. 
+ return f"Processed with {self.model_name}: {document_text}" \ No newline at end of file diff --git a/src/utils/file_utils.py b/src/utils/file_utils.py new file mode 100644 index 0000000..f2c6935 --- /dev/null +++ b/src/utils/file_utils.py @@ -0,0 +1,20 @@ +def read_file(file_path): + with open(file_path, 'r') as file: + return file.read() + +def write_file(file_path, content): + with open(file_path, 'w') as file: + file.write(content) + +def file_exists(file_path): + import os + return os.path.isfile(file_path) + +def delete_file(file_path): + import os + if file_exists(file_path): + os.remove(file_path) + +def list_files_in_directory(directory_path): + import os + return [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))] \ No newline at end of file diff --git a/src_folder/README.md b/src_folder/README.md new file mode 100644 index 0000000..dc3df6e --- /dev/null +++ b/src_folder/README.md @@ -0,0 +1,58 @@ +# README.md + +# Document Processing App + +This project is designed to process legal documents by hiding sensitive information such as names and company names. It utilizes the Ollama API with selected models for text processing. The application monitors a specified directory for new files, processes them automatically, and saves the results to a target path. 
+ +## Project Structure + +``` +doc-processing-app +├── src +│ ├── main.py # Entry point of the application +│ ├── config +│ │ └── settings.py # Configuration settings for paths +│ ├── services +│ │ ├── file_monitor.py # Monitors directory for new files +│ │ ├── document_service.py # Handles document processing logic +│ │ └── ollama_client.py # Interacts with the Ollama API +│ ├── utils +│ │ └── file_utils.py # Utility functions for file operations +│ └── models +│ └── document.py # Represents the structure of a document +├── tests +│ └── test_document_processor.py # Unit tests for DocumentProcessor +├── requirements.txt # Project dependencies +├── .env.example # Example environment variables +└── README.md # Project documentation +``` + +## Setup Instructions + +1. Clone the repository: + ``` + git clone <repository-url> + cd doc-processing-app + ``` + +2. Install the required dependencies: + ``` + pip install -r requirements.txt + ``` + +3. Configure the application by editing the `src/config/settings.py` file to set the paths for the object storage and target directory. + +4. Create a `.env` file based on the `.env.example` file to set up necessary environment variables. + +## Usage + +To run the application, execute the following command: +``` +python src/main.py +``` + +The application will start monitoring the specified directory for new documents. Once a new document is added, it will be processed automatically. + +## Contributing + +Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes. \ No newline at end of file