diff --git a/.gitignore b/.gitignore index 20214e2..1ea05df 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,7 @@ temp/ src_folder target_folder app.log -__pycache__ \ No newline at end of file +__pycache__ +data/doc_dest +data/doc_src +data/doc_intermediate \ No newline at end of file diff --git a/data/doc/20220707_na_decision-2.pdf b/data/doc/20220707_na_decision-2.pdf new file mode 100644 index 0000000..7dd9f29 Binary files /dev/null and b/data/doc/20220707_na_decision-2.pdf differ diff --git a/src/config/settings.py b/src/config/settings.py index 4476db6..4a144fa 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -5,8 +5,8 @@ from typing import Optional class Settings(BaseSettings): # Storage paths - OBJECT_STORAGE_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/src_folder" - TARGET_DIRECTORY_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/target_folder" + OBJECT_STORAGE_PATH: str = "" + TARGET_DIRECTORY_PATH: str = "" # Ollama API settings OLLAMA_API_URL: str = "https://api.ollama.com" diff --git a/src/models/document.py b/src/document_handlers/document.py similarity index 100% rename from src/models/document.py rename to src/document_handlers/document.py diff --git a/src/models/document_factory.py b/src/document_handlers/document_factory.py similarity index 84% rename from src/models/document_factory.py rename to src/document_handlers/document_factory.py index 4b3803e..01aa6a6 100644 --- a/src/models/document_factory.py +++ b/src/document_handlers/document_factory.py @@ -1,7 +1,7 @@ import os from typing import Optional -from models.document_processor import DocumentProcessor -from models.processors import ( +from document_handlers.document_processor import DocumentProcessor +from document_handlers.processors import ( TxtDocumentProcessor, DocxDocumentProcessor, PdfDocumentProcessor diff --git a/src/models/document_processor.py b/src/document_handlers/document_processor.py similarity index 100% rename from src/models/document_processor.py rename to src/document_handlers/document_processor.py diff --git a/src/document_handlers/processors/__init__.py b/src/document_handlers/processors/__init__.py new file mode 100644 index 0000000..115b63e --- /dev/null +++ b/src/document_handlers/processors/__init__.py @@ -0,0 +1,5 @@ +from document_handlers.processors.txt_processor import TxtDocumentProcessor +from document_handlers.processors.docx_processor import DocxDocumentProcessor +from document_handlers.processors.pdf_processor import PdfDocumentProcessor + +__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor'] \ No newline at end of file diff --git a/src/models/processors/docx_processor.py b/src/document_handlers/processors/docx_processor.py similarity index 85% rename from src/models/processors/docx_processor.py rename to src/document_handlers/processors/docx_processor.py index bfbfba2..780caf3 100644 --- a/src/models/processors/docx_processor.py +++ b/src/document_handlers/processors/docx_processor.py @@ -1,5 +1,5 @@ import docx -from models.document_processor import DocumentProcessor +from document_handlers.document_processor import DocumentProcessor class DocxDocumentProcessor(DocumentProcessor): def __init__(self, input_path: str, output_path: str): diff --git a/src/models/processors/pdf_processor.py b/src/document_handlers/processors/pdf_processor.py similarity index 96% rename from src/models/processors/pdf_processor.py rename to src/document_handlers/processors/pdf_processor.py index 7ffb326..8bbfec4 100644 --- a/src/models/processors/pdf_processor.py +++ b/src/document_handlers/processors/pdf_processor.py @@ -1,6 +1,6 @@ import os import PyPDF2 -from models.document_processor import DocumentProcessor +from document_handlers.document_processor import DocumentProcessor from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze diff --git a/src/models/processors/txt_processor.py b/src/document_handlers/processors/txt_processor.py similarity index 94% rename from src/models/processors/txt_processor.py rename to src/document_handlers/processors/txt_processor.py index 413b0fc..c5e5f9a 100644 --- a/src/models/processors/txt_processor.py +++ b/src/document_handlers/processors/txt_processor.py @@ -1,4 +1,4 @@ -from models.document_processor import DocumentProcessor +from document_handlers.document_processor import DocumentProcessor from services.ollama_client import OllamaClient import textwrap import logging diff --git a/src/main.py b/src/main.py index c292bd0..090e557 100644 --- a/src/main.py +++ b/src/main.py @@ -7,6 +7,11 @@ def main(): from services.file_monitor import FileMonitor from config.settings import settings + import logging + logger = logging.getLogger(__name__) + logger.info("Starting the application") + logger.info(f"Monitoring directory: {settings.OBJECT_STORAGE_PATH}") + logger.info(f"Target directory: {settings.TARGET_DIRECTORY_PATH}") # Initialize the file monitor file_monitor = FileMonitor(settings.OBJECT_STORAGE_PATH, settings.TARGET_DIRECTORY_PATH) diff --git a/src/models/processors/__init__.py b/src/models/processors/__init__.py deleted file mode 100644 index 76b85c6..0000000 --- a/src/models/processors/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from models.processors.txt_processor import TxtDocumentProcessor -from models.processors.docx_processor import DocxDocumentProcessor -from models.processors.pdf_processor import PdfDocumentProcessor - -__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor'] \ No newline at end of file diff --git a/src/services/document_service.py b/src/services/document_service.py index 6a42e62..0a6c981 100644 --- a/src/services/document_service.py +++ b/src/services/document_service.py @@ -1,5 +1,5 @@ import logging -from models.document_factory import DocumentProcessorFactory +from document_handlers.document_factory import DocumentProcessorFactory from services.ollama_client import OllamaClient logger = logging.getLogger(__name__) diff --git a/src_folder/README.md b/src_folder/README.md deleted file mode 100644 index dc3df6e..0000000 --- a/src_folder/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# README.md - -# Document Processing App - -This project is designed to process legal documents by hiding sensitive information such as names and company names. It utilizes the Ollama API with selected models for text processing. The application monitors a specified directory for new files, processes them automatically, and saves the results to a target path. - -## Project Structure - -``` -doc-processing-app -├── src -│ ├── main.py # Entry point of the application -│ ├── config -│ │ └── settings.py # Configuration settings for paths -│ ├── services -│ │ ├── file_monitor.py # Monitors directory for new files -│ │ ├── document_processor.py # Handles document processing logic -│ │ └── ollama_client.py # Interacts with the Ollama API -│ ├── utils -│ │ └── file_utils.py # Utility functions for file operations -│ └── models -│ └── document.py # Represents the structure of a document -├── tests -│ └── test_document_processor.py # Unit tests for DocumentProcessor -├── requirements.txt # Project dependencies -├── .env.example # Example environment variables -└── README.md # Project documentation -``` - -## Setup Instructions - -1. Clone the repository: - ``` - git clone - cd doc-processing-app - ``` - -2. Install the required dependencies: - ``` - pip install -r requirements.txt - ``` - -3. Configure the application by editing the `src/config/settings.py` file to set the paths for the object storage and target directory. - -4. Create a `.env` file based on the `.env.example` file to set up necessary environment variables. - -## Usage - -To run the application, execute the following command: -``` -python src/main.py -``` - -The application will start monitoring the specified directory for new documents. Once a new document is added, it will be processed automatically. - -## Contributing - -Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes. \ No newline at end of file