diff --git a/find_duplicates.py b/find_duplicates.py index e4c1699..dc5fedf 100644 --- a/find_duplicates.py +++ b/find_duplicates.py @@ -34,7 +34,11 @@ def extract_base_id(filename: str) -> str | None: return match.group(0).upper() -def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str, list[str]]: +def find_duplicates( + folder: str, + extensions: set[str] | None = None, + skip_extensions: set[str] | None = None, +) -> dict[str, list[str]]: """ Scan `folder` for files, group by base ID, and return groups with >1 file. @@ -42,10 +46,14 @@ def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str folder: Path to the directory to scan. extensions: If provided, only consider files with these extensions (e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive. + skip_extensions: Extensions to always ignore (e.g., {'.srt'}). Returns: Dict mapping base_id -> list of filenames (duplicate groups). """ + if skip_extensions is None: + skip_extensions = set() + groups: dict[str, list[str]] = defaultdict(list) for entry in sorted(os.listdir(folder)): @@ -53,10 +61,14 @@ def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str if not os.path.isfile(full_path): continue - if extensions: - _, ext = os.path.splitext(entry) - if ext.lower() not in extensions: - continue + _, ext = os.path.splitext(entry) + ext_lower = ext.lower() + + if ext_lower in skip_extensions: + continue + + if extensions and ext_lower not in extensions: + continue base_id = extract_base_id(entry) if base_id: @@ -82,6 +94,12 @@ def main(): default=None, help="Only check files with these extensions (e.g., -e .mp4 .mkv)", ) + parser.add_argument( + "-s", "--skip", + nargs="+", + default=[".srt"], + help="Extensions to ignore (default: .srt)", + ) args = parser.parse_args() folder = os.path.abspath(args.folder) @@ -90,8 +108,9 @@ def main(): return exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None + skips = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.skip} - dupes = find_duplicates(folder, exts) + dupes = find_duplicates(folder, exts, skips) if not dupes: print("No duplicate files found.", flush=True)