feat: 找重复文件
This commit is contained in:
parent
662122dbf6
commit
c0fd091d60
|
|
@ -0,0 +1,109 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
Find duplicate files that share the same base ID (letters, a hyphen, then
digits, e.g. NACR-996) but differ only by extra suffix characters.

Examples:
    NACR-996.mp4   vs NACR-996J.mp4 -> same base: NACR-996
    NDWQ-008-C.mp4 vs NDWQ-008.mp4  -> same base: NDWQ-008
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
def extract_base_id(filename: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Extract the base ID from a filename.
|
||||||
|
Matches pattern: [letters]-[digits], case-insensitive.
|
||||||
|
Returns the matched base ID (uppercased for case-insensitive comparison),
|
||||||
|
or None if no match.
|
||||||
|
"""
|
||||||
|
# Remove file extension first so suffixes in the stem don't confuse matching.
|
||||||
|
# But we need to match within the full filename (without extension).
|
||||||
|
stem, _ = os.path.splitext(filename)
|
||||||
|
|
||||||
|
# Match: one or more letters, a hyphen, one or more digits
|
||||||
|
# This is the core ID. We take the FIRST such match in the stem.
|
||||||
|
match = re.search(r'[a-zA-Z]+-\d+', stem)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return match.group(0).upper()
|
||||||
|
|
||||||
|
|
||||||
|
def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str, list[str]]:
    """Group the files directly inside ``folder`` by base ID and return the duplicates.

    Only regular files directly in ``folder`` are considered (no recursion).
    Files whose name yields no base ID are ignored.

    Args:
        folder: Path to the directory to scan.
        extensions: If provided and non-empty, only consider files with these
            extensions (e.g., {'.mp4', '.mkv', '.avi'}). Matching is
            case-insensitive and the leading dot is optional ('MP4' and
            '.mp4' are equivalent). An empty string selects files that have
            no extension.

    Returns:
        Dict mapping base_id -> list of file names, containing only the
        groups with more than one file. Names within a group are in sorted
        order.
    """
    # Normalise the filter once, up front: lowercase and dot-prefixed, which
    # is the form os.path.splitext() produces after .lower(). '' is kept as-is
    # because that is what splitext() returns for a file with no extension.
    wanted: set[str] | None = None
    if extensions:
        wanted = {
            ext.lower() if (not ext or ext.startswith('.')) else f'.{ext.lower()}'
            for ext in extensions
        }

    groups: dict[str, list[str]] = defaultdict(list)

    for entry in sorted(os.listdir(folder)):
        if not os.path.isfile(os.path.join(folder, entry)):
            continue

        if wanted is not None and os.path.splitext(entry)[1].lower() not in wanted:
            continue

        base_id = extract_base_id(entry)
        if base_id:
            groups[base_id].append(entry)

    # Only return groups with duplicates (as a plain dict, not a defaultdict).
    return {bid: files for bid, files in groups.items() if len(files) > 1}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: scan a folder and print duplicate groups.

    Parses the optional ``folder`` argument (default: current directory) and
    the optional ``-e/--extensions`` filter, then prints each group of files
    sharing a base ID to stdout.

    Raises:
        SystemExit: If ``folder`` is not a directory. The message is written
            to stderr and the process exits with status 1, so shell callers
            can detect the failure.
    """
    parser = argparse.ArgumentParser(
        description="Find duplicate files by base ID (e.g., NACR-996 / NACR-996J)"
    )
    parser.add_argument(
        "folder",
        nargs="?",
        default=".",
        help="Folder to scan (default: current directory)",
    )
    parser.add_argument(
        "-e", "--extensions",
        nargs="+",
        default=None,
        help="Only check files with these extensions (e.g., -e .mp4 .mkv)",
    )
    args = parser.parse_args()

    folder = os.path.abspath(args.folder)
    if not os.path.isdir(folder):
        # Raising SystemExit with a string prints it to stderr and exits
        # with status 1, instead of reporting the error on stdout and
        # returning success.
        raise SystemExit(f"Error: '{folder}' is not a directory.")

    # Accept extensions with or without the leading dot, in any case.
    exts = None
    if args.extensions:
        exts = {
            e.lower() if e.startswith('.') else f'.{e.lower()}'
            for e in args.extensions
        }

    dupes = find_duplicates(folder, exts)

    if not dupes:
        print("No duplicate files found.", flush=True)
        return

    # Build the whole report and emit it with a single write/flush rather
    # than flushing after every line.
    report = [f"Found {len(dupes)} duplicate group(s):\n"]
    for base_id, files in sorted(dupes.items()):
        report.append(f" Base ID: {base_id}")
        report.extend(f" -> {name}" for name in files)
        report.append("")  # blank line between groups
    print("\n".join(report), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue