feat: exclude .srt files from duplicate detection
This commit is contained in:
parent
c0fd091d60
commit
2dfec487dd
|
|
@@ -34,7 +34,11 @@ def extract_base_id(filename: str) -> str | None:
|
||||||
return match.group(0).upper()
|
return match.group(0).upper()
|
||||||
|
|
||||||
|
|
||||||
def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str, list[str]]:
|
def find_duplicates(
|
||||||
|
folder: str,
|
||||||
|
extensions: set[str] | None = None,
|
||||||
|
skip_extensions: set[str] | None = None,
|
||||||
|
) -> dict[str, list[str]]:
|
||||||
"""
|
"""
|
||||||
Scan `folder` for files, group by base ID, and return groups with >1 file.
|
Scan `folder` for files, group by base ID, and return groups with >1 file.
|
||||||
|
|
||||||
|
|
@@ -42,10 +46,14 @@ def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str
|
||||||
folder: Path to the directory to scan.
|
folder: Path to the directory to scan.
|
||||||
extensions: If provided, only consider files with these extensions
|
extensions: If provided, only consider files with these extensions
|
||||||
(e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive.
|
(e.g., {'.mp4', '.mkv', '.avi'}). Case-insensitive.
|
||||||
|
skip_extensions: Extensions to always ignore (e.g., {'.srt'}).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict mapping base_id -> list of filenames (duplicate groups).
|
Dict mapping base_id -> list of filenames (duplicate groups).
|
||||||
"""
|
"""
|
||||||
|
if skip_extensions is None:
|
||||||
|
skip_extensions = set()
|
||||||
|
|
||||||
groups: dict[str, list[str]] = defaultdict(list)
|
groups: dict[str, list[str]] = defaultdict(list)
|
||||||
|
|
||||||
for entry in sorted(os.listdir(folder)):
|
for entry in sorted(os.listdir(folder)):
|
||||||
|
|
@@ -53,10 +61,14 @@ def find_duplicates(folder: str, extensions: set[str] | None = None) -> dict[str
|
||||||
if not os.path.isfile(full_path):
|
if not os.path.isfile(full_path):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if extensions:
|
_, ext = os.path.splitext(entry)
|
||||||
_, ext = os.path.splitext(entry)
|
ext_lower = ext.lower()
|
||||||
if ext.lower() not in extensions:
|
|
||||||
continue
|
if ext_lower in skip_extensions:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if extensions and ext_lower not in extensions:
|
||||||
|
continue
|
||||||
|
|
||||||
base_id = extract_base_id(entry)
|
base_id = extract_base_id(entry)
|
||||||
if base_id:
|
if base_id:
|
||||||
|
|
@@ -82,6 +94,12 @@ def main():
|
||||||
default=None,
|
default=None,
|
||||||
help="Only check files with these extensions (e.g., -e .mp4 .mkv)",
|
help="Only check files with these extensions (e.g., -e .mp4 .mkv)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-s", "--skip",
|
||||||
|
nargs="+",
|
||||||
|
default=[".srt"],
|
||||||
|
help="Extensions to ignore (default: .srt)",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
folder = os.path.abspath(args.folder)
|
folder = os.path.abspath(args.folder)
|
||||||
|
|
@@ -90,8 +108,9 @@ def main():
|
||||||
return
|
return
|
||||||
|
|
||||||
exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None
|
exts = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.extensions} if args.extensions else None
|
||||||
|
skips = {e.lower() if e.startswith('.') else f'.{e.lower()}' for e in args.skip}
|
||||||
|
|
||||||
dupes = find_duplicates(folder, exts)
|
dupes = find_duplicates(folder, exts, skips)
|
||||||
|
|
||||||
if not dupes:
|
if not dupes:
|
||||||
print("No duplicate files found.", flush=True)
|
print("No duplicate files found.", flush=True)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue