Compare commits

...

5 Commits

Author SHA1 Message Date
oliviamn 7d0be5aa8a 将题词抽象出来 2025-05-06 00:13:19 +08:00
oliviamn 815427a509 文件写入output folder的.work隐藏目录下 2025-05-05 23:34:10 +08:00
oliviamn e6fb9b9a83 调整目录结构 2025-05-05 20:33:08 +08:00
oliviamn edca9a87a0 Refactor PdfDocumentProcessor to enhance PDF content processing
- Updated read_content method to return raw bytes instead of extracted text.
- Modified process_content method to handle bytes and generate multiple output files including markdown, JSON, and processed PDFs.
- Implemented directory setup for image storage and output management.
- Integrated PymuDocDataset for PDF classification and processing based on OCR capabilities.
2025-05-05 19:15:03 +08:00
oliviamn 6acf3e5423 Update requirements.txt to upgrade requests and add magic-pdf dependency 2025-05-05 18:53:22 +08:00
19 changed files with 247 additions and 137 deletions

5
.gitignore vendored
View File

@ -67,4 +67,7 @@ temp/
src_folder
target_folder
app.log
__pycache__
__pycache__
data/doc_dest
data/doc_src
data/doc_intermediate

Binary file not shown.

67
download_models.py Normal file
View File

@ -0,0 +1,67 @@
import json
import shutil
import os
import requests
from modelscope import snapshot_download
def download_json(url, timeout=30):
    """Download ``url`` and return its body decoded as JSON.

    Args:
        url: Address of the JSON document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
# Local configs older than this version are discarded and re-downloaded.
_MIN_CONFIG_VERSION = (1, 2, 0)


def _parse_config_version(version):
    """Turn a dotted version such as ``'1.10.0'`` into the tuple ``(1, 10, 0)``.

    Only the leading ASCII digits of each component are used; a component
    without any counts as 0, so a malformed value sorts as "old" rather than
    raising.
    """
    numbers = []
    for piece in str(version).split('.'):
        leading = ''
        for ch in piece.strip():
            if ch not in '0123456789':
                break
            leading += ch
        numbers.append(int(leading) if leading else 0)
    return tuple(numbers)


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply ``modifications`` to it.

    The existing file at ``local_filename`` is reused when its
    ``config_version`` is at least 1.2.0; otherwise (or when the file does not
    exist) a fresh copy is downloaded from ``url``. The top-level keys in
    ``modifications`` are then overwritten and the result is saved back to
    ``local_filename`` as UTF-8.

    Args:
        url: Location of the template JSON.
        local_filename: Path of the config file to read and (re)write.
        modifications: Mapping of top-level keys to the values to set.
    """
    data = None
    if os.path.exists(local_filename):
        # Read with the same encoding the file is written with below, and
        # close the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        # Compare numerically: as plain strings '1.10.0' < '1.2.0' is True,
        # which would wrongly discard a newer config.
        local_version = _parse_config_version(data.get('config_version', '0.0.0'))
        if local_version < _MIN_CONFIG_VERSION:
            data = None

    if data is None:
        data = download_json(url)

    # Apply the caller's overrides on top of the loaded/downloaded template.
    data.update(modifications)

    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Restrict the PDF-Extract-Kit download to these model folders; the
    # commented-out entries are currently not fetched.
    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]

    # Fetch both model snapshots first, then derive the models sub-directory.
    kit_snapshot_dir = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=mineru_patterns,
    )
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
    model_dir = kit_snapshot_dir + '/models'

    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
    # if os.path.exists(user_paddleocr_dir):
    #     shutil.rmtree(user_paddleocr_dir)
    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)

    # The config file is written to the user's home directory.
    config_file = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json'

    # Point the config at the directories that were just downloaded.
    download_and_modify_json(
        json_url,
        config_file,
        {
            'models-dir': model_dir,
            'layoutreader-model-dir': layoutreader_model_dir,
        },
    )
    print(f'The configuration file has been configured successfully, the path is: {config_file}')

View File

@ -2,9 +2,10 @@
pydantic-settings>=2.0.0
python-dotenv==1.0.0
watchdog==2.1.6
requests==2.26.0
requests==2.28.1
# Document processing
python-docx>=0.8.11
PyPDF2>=3.0.0
pandas>=2.0.0
magic-pdf[full]

View File

@ -5,8 +5,8 @@ from typing import Optional
class Settings(BaseSettings):
# Storage paths
OBJECT_STORAGE_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/src_folder"
TARGET_DIRECTORY_PATH: str = "/Users/tigeren/Dev/digisky/legal-doc-masker/target_folder"
OBJECT_STORAGE_PATH: str = ""
TARGET_DIRECTORY_PATH: str = ""
# Ollama API settings
OLLAMA_API_URL: str = "https://api.ollama.com"

View File

@ -1,7 +1,7 @@
import os
from typing import Optional
from models.document_processor import DocumentProcessor
from models.processors import (
from document_handlers.document_processor import DocumentProcessor
from document_handlers.processors import (
TxtDocumentProcessor,
DocxDocumentProcessor,
PdfDocumentProcessor

View File

@ -0,0 +1,5 @@
from document_handlers.processors.txt_processor import TxtDocumentProcessor
from document_handlers.processors.docx_processor import DocxDocumentProcessor
from document_handlers.processors.pdf_processor import PdfDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor']

View File

@ -1,5 +1,5 @@
import docx
from models.document_processor import DocumentProcessor
from document_handlers.document_processor import DocumentProcessor
class DocxDocumentProcessor(DocumentProcessor):
def __init__(self, input_path: str, output_path: str):

View File

@ -0,0 +1,98 @@
import os
import PyPDF2
from document_handlers.document_processor import DocumentProcessor
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from prompts.masking_prompts import get_masking_prompt
import logging
from services.ollama_client import OllamaClient
from config.settings import settings
logger = logging.getLogger(__name__)
class PdfDocumentProcessor(DocumentProcessor):
    """Mask sensitive information in a PDF via its markdown rendering.

    The PDF is parsed with magic-pdf, intermediate artifacts are written to a
    hidden ``.work/<input name>`` directory next to the output file, and the
    extracted markdown is sent to Ollama together with the masking prompt.
    """

    def __init__(self, input_path: str, output_path: str):
        """Record the paths and create the output/work directories.

        Args:
            input_path: Path of the PDF to process.
            output_path: Path the masked result is derived from (its
                directory and base name are used; see ``save_content``).
        """
        self.input_path = input_path
        self.output_path = output_path
        self.output_dir = os.path.dirname(output_path)
        self.name_without_suff = os.path.splitext(os.path.basename(input_path))[0]

        # Image directory next to the final output.
        self.local_image_dir = os.path.join(self.output_dir, "images")
        self.image_dir = os.path.basename(self.local_image_dir)
        os.makedirs(self.local_image_dir, exist_ok=True)

        # Per-document work directory: <output dir>/.work/<input base name>.
        self.work_dir = os.path.join(self.output_dir, ".work", self.name_without_suff)
        os.makedirs(self.work_dir, exist_ok=True)

        self.work_local_image_dir = os.path.join(self.work_dir, "images")
        self.work_image_dir = os.path.basename(self.work_local_image_dir)
        os.makedirs(self.work_local_image_dir, exist_ok=True)

        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> bytes:
        """Return the raw bytes of the input PDF."""
        with open(self.input_path, 'rb') as file:
            return file.read()

    def process_content(self, content: bytes) -> str:
        """Parse the PDF bytes, dump intermediate files and mask the markdown.

        Args:
            content: Raw PDF bytes as returned by ``read_content``.

        Returns:
            The response of ``OllamaClient.generate`` for the masking prompt;
            ``save_content`` writes it out as text.
        """
        logger.info("Starting PDF content processing")

        image_writer = FileBasedDataWriter(self.work_local_image_dir)
        md_writer = FileBasedDataWriter(self.work_dir)

        ds = PymuDocDataset(content)

        # Classify once and reuse the result for both logging and branching.
        parse_method = ds.classify()
        logger.info("Classifying PDF type: %s", parse_method)

        if parse_method == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        logger.info("Generating all outputs")
        name = self.name_without_suff

        # Model / layout / span renderings go into the work directory.
        infer_result.draw_model(os.path.join(self.work_dir, f"{name}_model.pdf"))
        pipe_result.draw_layout(os.path.join(self.work_dir, f"{name}_layout.pdf"))
        pipe_result.draw_span(os.path.join(self.work_dir, f"{name}_spans.pdf"))

        # The markdown is what gets masked; it and the JSON dumps are also
        # persisted in the work directory.
        md_content = pipe_result.get_markdown(self.work_image_dir)
        pipe_result.dump_md(md_writer, f"{name}.md", self.work_image_dir)
        pipe_result.dump_content_list(md_writer, f"{name}_content_list.json", self.work_image_dir)
        pipe_result.dump_middle_json(md_writer, f'{name}_middle.json')

        logger.info("Masking content")
        formatted_prompt = get_masking_prompt(md_content)
        logger.info("Calling ollama to generate response")
        response = self.ollama_client.generate(formatted_prompt)
        logger.info("Response generated")
        return response

    def save_content(self, content: str) -> None:
        """Write the masked text next to ``output_path`` with a ``.md`` suffix.

        Args:
            content: Masked text to write.
        """
        # Swap whatever extension output_path has for .md.
        output_dir = os.path.dirname(self.output_path)
        base_name = os.path.splitext(os.path.basename(self.output_path))[0]
        md_output_path = os.path.join(output_dir, f"{base_name}.md")

        logger.info("Saving masked content to: %s", md_output_path)
        with open(md_output_path, 'w', encoding='utf-8') as file:
            file.write(content)

View File

@ -0,0 +1,27 @@
from document_handlers.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient
import logging
from prompts.masking_prompts import get_masking_prompt
from config.settings import settings
logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor):
    """Mask sensitive information in a UTF-8 text file using Ollama."""

    def __init__(self, input_path: str, output_path: str):
        """Store the source/destination paths and build the Ollama client."""
        self.input_path = input_path
        self.output_path = output_path
        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> str:
        """Return the whole input file decoded as UTF-8."""
        with open(self.input_path, 'r', encoding='utf-8') as source:
            text = source.read()
        return text

    def process_content(self, content: str) -> str:
        """Wrap ``content`` in the masking prompt and return Ollama's reply."""
        response = self.ollama_client.generate(get_masking_prompt(content))
        logger.debug(f"Processed content: {response}")
        return response

    def save_content(self, content: str) -> None:
        """Write ``content`` to the output path as UTF-8."""
        with open(self.output_path, 'w', encoding='utf-8') as target:
            target.write(content)

View File

@ -7,6 +7,11 @@ def main():
from services.file_monitor import FileMonitor
from config.settings import settings
import logging
logger = logging.getLogger(__name__)
logger.info("Starting the application")
logger.info(f"Monitoring directory: {settings.OBJECT_STORAGE_PATH}")
logger.info(f"Target directory: {settings.TARGET_DIRECTORY_PATH}")
# Initialize the file monitor
file_monitor = FileMonitor(settings.OBJECT_STORAGE_PATH, settings.TARGET_DIRECTORY_PATH)

View File

@ -1,5 +0,0 @@
from models.processors.txt_processor import TxtDocumentProcessor
from models.processors.docx_processor import DocxDocumentProcessor
from models.processors.pdf_processor import PdfDocumentProcessor
__all__ = ['TxtDocumentProcessor', 'DocxDocumentProcessor', 'PdfDocumentProcessor']

View File

@ -1,20 +0,0 @@
import PyPDF2
from models.document_processor import DocumentProcessor
class PdfDocumentProcessor(DocumentProcessor):
    """PDF processor that extracts page text with PyPDF2 and returns it unchanged."""

    def __init__(self, input_path: str, output_path: str):
        """Remember where to read the PDF from and where output would go."""
        self.input_path, self.output_path = input_path, output_path

    def read_content(self) -> str:
        """Return the text of every page, joined by single spaces."""
        with open(self.input_path, 'rb') as pdf_file:
            pages = PyPDF2.PdfReader(pdf_file).pages
            return ' '.join(page.extract_text() for page in pages)

    def process_content(self, content: str) -> str:
        """Return ``content`` as-is; no PDF-specific processing is done here."""
        return content

    def save_content(self, content: str) -> None:
        """Do nothing; saving as PDF is not implemented in this class."""
        return None

View File

@ -1,46 +0,0 @@
from models.document_processor import DocumentProcessor
from services.ollama_client import OllamaClient
import textwrap
import logging
from config.settings import settings
logger = logging.getLogger(__name__)
class TxtDocumentProcessor(DocumentProcessor):
    """Mask sensitive information in a UTF-8 text file using Ollama."""

    def __init__(self, input_path: str, output_path: str):
        """Store the source/destination paths and build the Ollama client."""
        self.input_path = input_path
        self.output_path = output_path
        self.ollama_client = OllamaClient(
            model_name=settings.OLLAMA_MODEL,
            base_url=settings.OLLAMA_API_URL,
        )

    def read_content(self) -> str:
        """Return the whole input file decoded as UTF-8."""
        with open(self.input_path, 'r', encoding='utf-8') as source:
            text = source.read()
        return text

    def process_content(self, content: str) -> str:
        """Fill the masking prompt with ``content`` and return Ollama's reply."""
        # The template text is kept flush-left so the literal stays exactly as
        # it was; ``{text}`` is the only placeholder.
        template = textwrap.dedent("""
您是一位专业的法律文档脱敏专家请按照以下规则对文本进行脱敏处理
规则
1. 人名
- 两字名改为"姓+某"张三 张某
- 三字名改为"姓+某某"张三丰 张某某
2. 公司名
- 保留地理位置信息北京上海等
- 保留公司类型有限公司股份公司等
- ""替换核心名称
3. 保持原文其他部分不变
4. 确保脱敏后的文本保持原有的语言流畅性和可读性
输入文本
{text}
请直接输出脱敏后的文本无需解释或其他备注
""")
        response = self.ollama_client.generate(template.format(text=content))
        logger.debug(f"Processed content: {response}")
        return response

    def save_content(self, content: str) -> None:
        """Write ``content`` to the output path as UTF-8."""
        with open(self.output_path, 'w', encoding='utf-8') as target:
            target.write(content)

View File

@ -0,0 +1,33 @@
import textwrap
# Prompt template for the masking request; ``{text}`` is the only placeholder.
# The template text is kept flush-left so the literal stays exactly as it was.
_MASKING_PROMPT_TEMPLATE = textwrap.dedent("""
您是一位专业的法律文档脱敏专家请按照以下规则对文本进行脱敏处理
规则
1. 人名
- 两字名改为"姓+某"张三 张某
- 三字名改为"姓+某某"张三丰 张某某
2. 公司名
- 保留地理位置信息北京上海等
- 保留公司类型有限公司股份公司等
- ""替换核心名称
3. 保持原文其他部分不变
4. 确保脱敏后的文本保持原有的语言流畅性和可读性
输入文本
{text}
请直接输出脱敏后的文本无需解释或其他备注
""")


def get_masking_prompt(text: str) -> str:
    """Build the prompt that asks the model to mask a legal document.

    Args:
        text: The document text to embed in the prompt.

    Returns:
        The prompt template with ``text`` substituted for its placeholder.
    """
    return _MASKING_PROMPT_TEMPLATE.format(text=text)

View File

@ -1,5 +1,5 @@
import logging
from models.document_factory import DocumentProcessorFactory
from document_handlers.document_factory import DocumentProcessorFactory
from services.ollama_client import OllamaClient
logger = logging.getLogger(__name__)

View File

@ -1,58 +0,0 @@
# README.md
# Document Processing App
This project is designed to process legal documents by hiding sensitive information such as names and company names. It utilizes the Ollama API with selected models for text processing. The application monitors a specified directory for new files, processes them automatically, and saves the results to a target path.
## Project Structure
```
doc-processing-app
├── src
│ ├── main.py # Entry point of the application
│ ├── config
│ │ └── settings.py # Configuration settings for paths
│ ├── services
│ │ ├── file_monitor.py # Monitors directory for new files
│ │ ├── document_processor.py # Handles document processing logic
│ │ └── ollama_client.py # Interacts with the Ollama API
│ ├── utils
│ │ └── file_utils.py # Utility functions for file operations
│ └── models
│ └── document.py # Represents the structure of a document
├── tests
│ └── test_document_processor.py # Unit tests for DocumentProcessor
├── requirements.txt # Project dependencies
├── .env.example # Example environment variables
└── README.md # Project documentation
```
## Setup Instructions
1. Clone the repository:
```
git clone <repository-url>
cd doc-processing-app
```
2. Install the required dependencies:
```
pip install -r requirements.txt
```
3. Configure the application by editing the `src/config/settings.py` file to set the paths for the object storage and target directory.
4. Create a `.env` file based on the `.env.example` file to set up necessary environment variables.
## Usage
To run the application, execute the following command:
```
python src/main.py
```
The application will start monitoring the specified directory for new documents. Once a new document is added, it will be processed automatically.
## Contributing
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.