调整目录结构

This commit is contained in:
oliviamn 2025-05-05 20:33:08 +08:00
parent edca9a87a0
commit e6fb9b9a83
14 changed files with 22 additions and 72 deletions

5
.gitignore vendored
View File

@ -67,4 +67,7 @@ temp/
src_folder
target_folder
app.log
__pycache__
__pycache__
data/doc_dest
data/doc_src
data/doc_intermediate

Binary file not shown.

View File

@ -5,8 +5,8 @@ from typing import Optional
class Settings(BaseSettings):
# Storage paths
OBJECT_STORAGE_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/src_folder"
TARGET_DIRECTORY_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/target_folder"
OBJECT_STORAGE_PATH: str = ""
TARGET_DIRECTORY_PATH: str = ""
# Ollama API settings
OLLAMA_API_URL: str = "https://api.ollama.com"

View File

@ -1,7 +1,7 @@
import os
from typing import Optional
from models.document_processor import DocumentProcessor
from models.processors import (
from document_handlers.document_processor import DocumentProcessor
from document_handlers.processors import (
TxtDocumentProcessor,
DocxDocumentProcessor,
PdfDocumentProcessor

View File

@ -0,0 +1,5 @@
from document_handlers.processors.txt_processor import TxtDocumentProcessor
from document_handlers.processors.docx_processor import DocxDocumentProcessor
from document_handlers.processors.pdf_processor import PdfDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor']

View File

@ -1,5 +1,5 @@
import docx
from models.document_processor import DocumentProcessor
from document_handlers.document_processor import DocumentProcessor
class DocxDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):

View File

@ -1,6 +1,6 @@
import os
import PyPDF2
from models.document_processor import DocumentProcessor
from document_handlers.document_processor import DocumentProcessor
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

View File

@ -1,4 +1,4 @@
from models.document_processor import DocumentProcessor
from document_handlers.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient
import textwrap
import logging

View File

@ -7,6 +7,11 @@ def main():
from services.file_monitor import FileMonitor
from config.settings import settings
import logging
logger = logging.getLogger(__name__)
logger.info("Starting the application")
logger.info(f"Monitoring directory: {settings.OBJECT_STORAGE_PATH}")
logger.info(f"Target directory: {settings.TARGET_DIRECTORY_PATH}")
# Initialize the file monitor
file_monitor = FileMonitor(settings.OBJECT_STORAGE_PATH, settings.TARGET_DIRECTORY_PATH)

View File

@ -1,5 +0,0 @@
from models.processors.txt_processor import TxtDocumentProcessor
from models.processors.docx_processor import DocxDocumentProcessor
from models.processors.pdf_processor import PdfDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor']

View File

@ -1,5 +1,5 @@
import logging
from models.document_factory import DocumentProcessorFactory
from document_handlers.document_factory import DocumentProcessorFactory
from services.ollama_client import OllamaClient
logger = logging.getLogger(__name__)

View File

@ -1,58 +0,0 @@
# README.md
# Document Processing App
This project processes legal documents and masks sensitive information such as personal names and company names. It uses the Ollama API with selected models for text processing. The application monitors a specified directory for new files, processes them automatically, and saves the results to a target directory.
## Project Structure
```
doc-processing-app
├── src
│ ├── main.py # Entry point of the application
│ ├── config
│ │ └── settings.py # Configuration settings for paths
│ ├── services
│ │ ├── file_monitor.py # Monitors directory for new files
│ │ ├── document_processor.py # Handles document processing logic
│ │ └── ollama_client.py # Interacts with the Ollama API
│ ├── utils
│ │ └── file_utils.py # Utility functions for file operations
│ └── models
│ └── document.py # Represents the structure of a document
├── tests
│ └── test_document_processor.py # Unit tests for DocumentProcessor
├── requirements.txt # Project dependencies
├── .env.example # Example environment variables
└── README.md # Project documentation
```
## Setup Instructions
1. Clone the repository:
```
git clone <repository-url>
cd doc-processing-app
```
2. Install the required dependencies:
```
pip install -r requirements.txt
```
3. Configure the application by editing the `src/config/settings.py` file: set `OBJECT_STORAGE_PATH` to the directory that is monitored for new documents and `TARGET_DIRECTORY_PATH` to the directory where processed results are saved.
4. Create a `.env` file based on the `.env.example` file to set up necessary environment variables.
## Usage
To run the application, execute the following command:
```
python src/main.py
```
The application will start monitoring the configured directory for new documents. When a new document is added, it is processed automatically and the result is saved to the target directory.
## Contributing
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.