From c0fd091d609a71d89eb5dfe96ea2ec334e6910b7 Mon Sep 17 00:00:00 2001 From: tigerenwork Date: Sat, 27 Jun 2026 14:43:23 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=89=BE=E9=87=8D=E5=A4=8D=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- find_duplicates.py | 109 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 find_duplicates.py diff --git a/find_duplicates.py b/find_duplicates.py new file mode 100644 index 0000000..e4c1699 --- /dev/null +++ b/find_duplicates.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +Find duplicate files that share the same base ID (e.g., [LETTERS]-[DIGITS]) +but differ only by extra suffix characters. + +Examples: + NACR-996.mp4 vs NACR-996J.mp4 -> same base: NACR-996 + NDWQ-008-C.mp4 vs NDWQ-008.mp4 -> same base: NDWQ-008 +""" + +import os +import re +import argparse +from collections import defaultdict + + +def extract_base_id(filename: str) -> str | None: + """ + Extract the base ID from a filename. + Matches pattern: [letters]-[digits], case-insensitive. + Returns the matched base ID (uppercased for case-insensitive comparison), + or None if no match. + """ + # Remove file extension first so suffixes in the stem don't confuse matching. + # But we need to match within the full filename (without extension). + stem, _ = os.path.splitext(filename) + + # Match: one or more letters, a hyphen, one or more digits + # This is the core ID. We take the FIRST such match in the stem. + match = re.search(r'[a-zA-Z]+-\d+', stem) + if not match: + return None + + return match.group(0).upper() + + +def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str, list[str]]: + """ + Scan `folder` for files, group by base ID, and return groups with >1 file. + + Args: + folder: Path to the directory to scan. + extensions: If provided, only consider files with these extensions + (e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive. + + Returns: + Dict mapping base_id -> list of filenames (duplicate groups). + """ + groups: dict[str, list[str]] = defaultdict(list) + + for entry in sorted(os.listdir(folder)): + full_path = os.path.join(folder, entry) + if not os.path.isfile(full_path): + continue + + if extensions: + _, ext = os.path.splitext(entry) + if ext.lower() not in extensions: + continue + + base_id = extract_base_id(entry) + if base_id: + groups[base_id].append(entry) + + # Only return groups with duplicates + return {bid: files for bid, files in groups.items() if len(files) > 1} + + +def main(): + parser = argparse.ArgumentParser( + description="Find duplicate files by base ID (e.g., NACR-996 / NACR-996J)" + ) + parser.add_argument( + "folder", + nargs="?", + default=".", + help="Folder to scan (default: current directory)", + ) + parser.add_argument( + "-e", "--extensions", + nargs="+", + default=None, + help="Only check files with these extensions (e.g., -e .mp4 .mkv)", + ) + args = parser.parse_args() + + folder = os.path.abspath(args.folder) + if not os.path.isdir(folder): + print(f"Error: '{folder}' is not a directory.", flush=True) + return + + exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None + + dupes = find_duplicates(folder, exts) + + if not dupes: + print("No duplicate files found.", flush=True) + return + + print(f"Found {len(dupes)} duplicate group(s):\n", flush=True) + for base_id, files in sorted(dupes.items()): + print(f" Base ID: {base_id}", flush=True) + for f in files: + print(f" -> {f}", flush=True) + print(flush=True) + + +if __name__ == "__main__": + main()