# videocut/find_duplicates.py

#!/usr/bin/env python3
"""
Find duplicate files that share the same base ID (e.g., [LETTERS]-[DIGITS])
but differ only by extra suffix characters.

Examples:
    NACR-996.mp4   vs  NACR-996J.mp4      -> same base: NACR-996
    NDWQ-008-C.mp4 vs  NDWQ-008.mp4       -> same base: NDWQ-008
"""

import os
import re
import argparse
from collections import defaultdict


def extract_base_id(filename: str) -> str | None:
    """
    Extract the base ID from a filename.
    Matches pattern: [letters]-[digits], case-insensitive.
    Returns the matched base ID (uppercased for case-insensitive comparison),
    or None if no match.
    """
    # Remove file extension first so suffixes in the stem don't confuse matching.
    # But we need to match within the full filename (without extension).
    stem, _ = os.path.splitext(filename)

    # Match: one or more letters, a hyphen, one or more digits
    # This is the core ID. We take the FIRST such match in the stem.
    match = re.search(r'[a-zA-Z]+-\d+', stem)
    if not match:
        return None

    return match.group(0).upper()


def find_duplicates(
    folder: str,
    extensions: set[str] | None = None,
    skip_extensions: set[str] | None = None,
) -> dict[str, list[str]]:
    """
    Scan `folder` for files, group by base ID, and return groups with >1 file.

    Args:
        folder: Path to the directory to scan.
        extensions: If provided, only consider files with these extensions
                    (e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive.
        skip_extensions: Extensions to always ignore (e.g., {'.srt'}).

    Returns:
        Dict mapping base_id -> list of filenames (duplicate groups).
    """
    if skip_extensions is None:
        skip_extensions = set()

    groups: dict[str, list[str]] = defaultdict(list)

    for entry in sorted(os.listdir(folder)):
        full_path = os.path.join(folder, entry)
        if not os.path.isfile(full_path):
            continue

        _, ext = os.path.splitext(entry)
        ext_lower = ext.lower()

        if ext_lower in skip_extensions:
            continue

        if extensions and ext_lower not in extensions:
            continue

        base_id = extract_base_id(entry)
        if base_id:
            groups[base_id].append(entry)

    # Only return groups with duplicates
    return {bid: files for bid, files in groups.items() if len(files) > 1}


def main():
    parser = argparse.ArgumentParser(
        description="Find duplicate files by base ID (e.g., NACR-996 / NACR-996J)"
    )
    parser.add_argument(
        "folder",
        nargs="?",
        default=".",
        help="Folder to scan (default: current directory)",
    )
    parser.add_argument(
        "-e", "--extensions",
        nargs="+",
        default=None,
        help="Only check files with these extensions (e.g., -e .mp4 .mkv)",
    )
    parser.add_argument(
        "-s", "--skip",
        nargs="+",
        default=[".srt"],
        help="Extensions to ignore (default: .srt)",
    )
    args = parser.parse_args()

    folder = os.path.abspath(args.folder)
    if not os.path.isdir(folder):
        print(f"Error: '{folder}' is not a directory.", flush=True)
        return

    exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None
    skips = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.skip}

    dupes = find_duplicates(folder, exts, skips)

    if not dupes:
        print("No duplicate files found.", flush=True)
        return

    print(f"Found {len(dupes)} duplicate group(s):\n", flush=True)
    for base_id, files in sorted(dupes.items()):
        print(f"  Base ID: {base_id}", flush=True)
        for f in files:
            print(f"    -> {f}", flush=True)
        print(flush=True)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()