# legal-doc-masker/mineru/test_mineru_api.py

#!/usr/bin/env python3
"""
Test script for Mineru API endpoints
"""

import json
from pathlib import Path
from urllib.parse import quote

import requests

# API base URL
BASE_URL = "http://localhost:8000/api/v1/mineru"

def test_health_check():
    """Call the health check endpoint and print the outcome.

    Sends a GET request to ``{BASE_URL}/health`` and prints the HTTP status
    code followed by the response body. The body is shown as decoded JSON
    when it parses, and as raw text otherwise.
    """
    print("Testing health check...")
    # Bound the wait so an unresponsive server cannot hang the script forever.
    response = requests.get(f"{BASE_URL}/health", timeout=10)
    print(f"Status: {response.status_code}")
    try:
        body = response.json()
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) should still be displayed
        # instead of aborting the script with a decode error.
        body = response.text
    print(f"Response: {body}")
    print()

def test_parse_document(file_path: str):
    """Upload a document to the parse endpoint and print the result.

    The file is sent as a multipart upload to ``{BASE_URL}/parse`` together
    with a fixed set of query parameters. On HTTP 200 the reported file name,
    output directory and generated output paths are printed; on any other
    status the raw response text is printed.

    Args:
        file_path: Path of the document to upload. If it does not refer to an
            existing regular file, an error is printed and nothing is sent.
    """
    print(f"Testing document parsing with file: {file_path}")

    source = Path(file_path)
    # is_file() also rejects directories, which exists() would let through
    # only for open() to fail on them afterwards.
    if not source.is_file():
        print(f"Error: File {file_path} not found")
        return

    # Query parameters sent alongside the upload.
    params = {
        'lang': 'ch',
        'backend': 'pipeline',
        'method': 'auto',
        'formula_enable': True,
        'table_enable': True,
        'draw_layout_bbox': True,
        'draw_span_bbox': True,
        'dump_md': True,
        'dump_middle_json': True,
        'dump_model_output': True,
        'dump_orig_pdf': True,
        'dump_content_list': True,
        'make_md_mode': 'MM_MD',
    }

    with source.open('rb') as f:
        files = {'file': (source.name, f, 'application/pdf')}
        # Fail fast if the server is unreachable (10 s connect timeout), but
        # leave the read side unbounded because parsing can take a long time.
        response = requests.post(
            f"{BASE_URL}/parse",
            files=files,
            params=params,
            timeout=(10, None),
        )

    print(f"Status: {response.status_code}")
    if response.status_code == 200:
        result = response.json()
        print("Parse successful!")
        print(f"File name: {result['file_name']}")
        print(f"Output directory: {result['output_directory']}")
        print("Generated outputs:")
        for output_type, output_path in result['outputs'].items():
            print(f"  - {output_type}: {output_path}")
    else:
        print(f"Error: {response.text}")
    print()

def test_download_file(file_path: str):
    """Download a file from the download endpoint and save it locally.

    Requests ``{BASE_URL}/download/<file_path>``. On HTTP 200 the response
    body is written to ``downloaded_<basename>`` in the current working
    directory; on any other status the raw response text is printed.

    Args:
        file_path: Path of the file to fetch, appended to the download URL.
    """
    print(f"Testing file download: {file_path}")

    # Percent-encode the path so characters such as '#' or '?' in a file name
    # are not interpreted as a URL fragment or query string; '/' separators
    # are left intact by quote()'s default safe set.
    url = f"{BASE_URL}/download/{quote(file_path)}"
    # Bound the wait so a stalled server cannot hang the script forever.
    response = requests.get(url, timeout=60)
    print(f"Status: {response.status_code}")

    if response.status_code == 200:
        # Only the final path component is used, so the file always lands in
        # the current working directory.
        output_filename = f"downloaded_{Path(file_path).name}"
        with open(output_filename, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded successfully as: {output_filename}")
    else:
        print(f"Error: {response.text}")
    print()

if __name__ == "__main__":
    # Banner
    print("Mineru API Test Script")
    print("=" * 50)

    # The health check needs no input, so it is the only test run by default.
    test_health_check()

    # Parsing needs a real document on disk.
    # Uncomment the call below and point it at your own file to enable it.
    # test_parse_document("path/to/your/document.pdf")

    # Downloading needs a path reported by an earlier parse run.
    # test_download_file("some_uuid/document_name.md")

    print("Test completed!")

    # Usage hints for the two tests that are disabled by default.
    usage_hints = (
        "\nTo test document parsing:",
        "1. Uncomment the test_parse_document line above",
        "2. Provide a valid PDF file path",
        "3. Run the script again",
        "\nTo test file download:",
        "1. First run a parse operation to get file paths",
        "2. Use the output paths from the parse result",
        "3. Uncomment and modify the test_download_file line",
    )
    for hint in usage_hints:
        print(hint)