feat: 找重复文件

2026-06-27 14:43:23 +08:00 · 2026-06-27 14:43:23 +08:00 · c0fd091d60
parent 662122dbf6
commit c0fd091d60
1 changed files with 109 additions and 0 deletions
--- a/find_duplicates.py
+++ b/find_duplicates.py
@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+Find likely-duplicate files by name: files whose names contain the same base
+ID ([LETTERS]-[DIGITS], case-insensitive) but differ in the surrounding text.
+
+Examples:
+    NACR-996.mp4   vs  NACR-996J.mp4      -> same base: NACR-996
+    NDWQ-008-C.mp4 vs  NDWQ-008.mp4       -> same base: NDWQ-008
+"""
+
+import os
+import re
+import argparse
+from collections import defaultdict
+
+
+def extract_base_id(filename: str) -> str | None:
+    """
+    Return the base ID embedded in a filename.
+
+    The extension is dropped, then the stem is searched for the earliest
+    run of ASCII letters, a hyphen, and digits (e.g. "NACR-996" inside
+    "NACR-996J.mp4"). The hit is upper-cased so that IDs differing only
+    in letter case compare equal.
+
+    Returns None when the stem contains no such run.
+    """
+    # Work on the stem only, so the extension is never part of the match.
+    name_without_ext = os.path.splitext(filename)[0]
+
+    # re.search yields the leftmost match; anything after the digits
+    # (suffix letters, "-C", ...) is simply left out of the ID.
+    found = re.search(r'[a-zA-Z]+-\d+', name_without_ext)
+    return found.group(0).upper() if found else None
+
+
+def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str, list[str]]:
+    """
+    Scan `folder` (non-recursively) and group its files by base ID.
+
+    Args:
+        folder: Path to the directory to scan.
+        extensions: If provided and non-empty, only files whose extension is
+                    in this set are considered (e.g., {'.mp4', '.MKV'}).
+                    Compared case-insensitively on both sides.
+
+    Returns:
+        Dict mapping base_id -> sorted list of filenames, restricted to
+        base IDs shared by more than one file.
+    """
+    # Lower-case the caller's extensions too; lowering only the file's own
+    # extension would make an entry such as '.MP4' silently match nothing.
+    wanted = {ext.lower() for ext in extensions} if extensions else None
+
+    groups: dict[str, list[str]] = defaultdict(list)
+    for entry in sorted(os.listdir(folder)):
+        if not os.path.isfile(os.path.join(folder, entry)):
+            continue  # skip sub-directories and other non-file entries
+        if wanted is not None and os.path.splitext(entry)[1].lower() not in wanted:
+            continue
+
+        base_id = extract_base_id(entry)
+        if base_id:
+            groups[base_id].append(entry)
+
+    return {bid: files for bid, files in groups.items() if len(files) > 1}
+
+
+def main() -> None:
+    """CLI entry point: print duplicate groups; exit with status 1 on a bad folder."""
+    import sys  # local import: only the error path below needs it
+
+    parser = argparse.ArgumentParser(
+        description="Find duplicate files by base ID (e.g., NACR-996 / NACR-996J)"
+    )
+    parser.add_argument(
+        "folder", nargs="?", default=".",
+        help="Folder to scan (default: current directory)",
+    )
+    parser.add_argument(
+        "-e", "--extensions", nargs="+", default=None,
+        help="Only check files with these extensions (e.g., -e .mp4 .mkv)",
+    )
+    args = parser.parse_args()
+
+    folder = os.path.abspath(args.folder)
+    if not os.path.isdir(folder):
+        # A failed run must be visible to callers: stderr plus non-zero status.
+        print(f"Error: '{folder}' is not a directory.", file=sys.stderr, flush=True)
+        sys.exit(1)
+
+    # Accept both "mp4" and ".mp4"; None (no -e given) means "no filter".
+    exts = {"." + e.lower().removeprefix(".") for e in (args.extensions or ())} or None
+
+    dupes = find_duplicates(folder, exts)
+    if not dupes:
+        print("No duplicate files found.", flush=True)
+        return
+
+    print(f"Found {len(dupes)} duplicate group(s):\n", flush=True)
+    for base_id, files in sorted(dupes.items()):
+        print(f"  Base ID: {base_id}", flush=True)
+        for f in files:
+            print(f"    -> {f}", flush=True)
+        print(flush=True)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()