#!/usr/bin/env python3 """ Find duplicate files that share the same base ID (e.g., [LETTERS]-[DIGITS]) but differ only by extra suffix characters. Examples: NACR-996.mp4 vs NACR-996J.mp4 -> same base: NACR-996 NDWQ-008-C.mp4 vs NDWQ-008.mp4 -> same base: NDWQ-008 """ import os import re import argparse from collections import defaultdict def extract_base_id(filename: str) -> str | None: """ Extract the base ID from a filename. Matches pattern: [letters]-[digits], case-insensitive. Returns the matched base ID (uppercased for case-insensitive comparison), or None if no match. """ # Remove file extension first so suffixes in the stem don't confuse matching. # But we need to match within the full filename (without extension). stem, _ = os.path.splitext(filename) # Match: one or more letters, a hyphen, one or more digits # This is the core ID. We take the FIRST such match in the stem. match = re.search(r'[a-zA-Z]+-\d+', stem) if not match: return None return match.group(0).upper() def find_duplicates( folder: str, extensions: set[str] | None = None, skip_extensions: set[str] | None = None, ) -> dict[str, list[str]]: """ Scan `folder` for files, group by base ID, and return groups with >1 file. Args: folder: Path to the directory to scan. extensions: If provided, only consider files with these extensions (e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive. skip_extensions: Extensions to always ignore (e.g., {'.srt'}). Returns: Dict mapping base_id -> list of filenames (duplicate groups). """ if skip_extensions is None: skip_extensions = set() groups: dict[str, list[str]] = defaultdict(list) for entry in sorted(os.listdir(folder)): full_path = os.path.join(folder, entry) if not os.path.isfile(full_path): continue _, ext = os.path.splitext(entry) ext_lower = ext.lower() if ext_lower in skip_extensions: continue if extensions and ext_lower not in extensions: continue base_id = extract_base_id(entry) if base_id: groups[base_id].append(entry) # Only return groups with duplicates return {bid: files for bid, files in groups.items() if len(files) > 1} def main(): parser = argparse.ArgumentParser( description="Find duplicate files by base ID (e.g., NACR-996 / NACR-996J)" ) parser.add_argument( "folder", nargs="?", default=".", help="Folder to scan (default: current directory)", ) parser.add_argument( "-e", "--extensions", nargs="+", default=None, help="Only check files with these extensions (e.g., -e .mp4 .mkv)", ) parser.add_argument( "-s", "--skip", nargs="+", default=[".srt"], help="Extensions to ignore (default: .srt)", ) args = parser.parse_args() folder = os.path.abspath(args.folder) if not os.path.isdir(folder): print(f"Error: '{folder}' is not a directory.", flush=True) return exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None skips = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.skip} dupes = find_duplicates(folder, exts, skips) if not dupes: print("No duplicate files found.", flush=True) return print(f"Found {len(dupes)} duplicate group(s):\n", flush=True) for base_id, files in sorted(dupes.items()): print(f" Base ID: {base_id}", flush=True) for f in files: print(f" -> {f}", flush=True) print(flush=True) if __name__ == "__main__": main()