import os
import re
import sys
import glob
import unicodedata


# Inclusive code-point ranges whose characters are treated as decorative
# filename prefixes.  Built once at import time instead of on every call.
_STRIPPABLE_RANGES = (
    (0x2300, 0x23FF),    # Miscellaneous Technical (⏰ ...)
    (0x2460, 0x24FF),    # Enclosed Alphanumerics
    (0x2500, 0x257F),    # Box Drawing
    (0x2580, 0x259F),    # Block Elements
    (0x25A0, 0x25FF),    # Geometric Shapes (▶ ...)
    (0x2600, 0x26FF),    # Miscellaneous Symbols (⚫ ☀ ...)
    (0x2700, 0x27BF),    # Dingbats (❤ ✅ ✨ ...)
    (0x2B00, 0x2BFF),    # Miscellaneous Symbols and Arrows
    (0x1F000, 0x1FFFF),  # Supplementary symbol and emoji blocks
)

# Unicode general categories that are stripped regardless of block:
# So/Sk/Sm are symbol categories, Cf is "format" (invisible controls).
_STRIPPABLE_CATEGORIES = ('So', 'Sk', 'Sm', 'Cf')


def _is_strippable_prefix_char(ch):
    """Return True if ch should be stripped from the start of a filename.

    A character is strippable when it is whitespace, a variation selector
    (U+FE00..U+FE0F), the zero-width joiner (U+200D), falls inside one of
    the ranges in _STRIPPABLE_RANGES, or has a Unicode general category
    listed in _STRIPPABLE_CATEGORIES.

    Note: categories Sm and Sk contain some ASCII characters (for example
    '+', '=', '~', '<', '>', '|', '^' and '`'), so those are strippable too.
    """
    if ch.isspace():
        return True
    cp = ord(ch)
    # Variation selectors are combining marks (not symbols), so they need an
    # explicit check; otherwise the U+FE0F following ❤ would stop the scan.
    if 0xFE00 <= cp <= 0xFE0F or cp == 0x200D:
        return True
    if any(lo <= cp <= hi for lo, hi in _STRIPPABLE_RANGES):
        return True
    return unicodedata.category(ch) in _STRIPPABLE_CATEGORIES


def strip_emoji_prefix(name):
    """Strip leading emoji/symbol/whitespace characters from name.

    Returns the part of name starting at the first character that is not
    strippable, or '' when every character is strippable.
    """
    for index, ch in enumerate(name):
        if not _is_strippable_prefix_char(ch):
            return name[index:]
    return ''


def strip_duplicate_suffix(filename):
    """Strip unwanted prefixes/suffixes from a filename.

    Suffixes removed: OS-generated ' (2)', ' (3)', and trailing ' - Join'.
    Prefixes removed: leading emoji / special symbols (e.g. ⚫️, ❤️, ✅).

    The extension (as split by os.path.splitext) is always preserved.  If
    removing the prefix would leave nothing before the extension, the prefix
    is kept so the result never degenerates to a bare extension.

    Examples:
        'video (2).mp4'     -> 'video.mp4'
        'video (3).mp4'     -> 'video.mp4'
        'video - Join.mp4'  -> 'video.mp4'
        '⚫️“啊~爸”.mp4'      -> '“啊~爸”.mp4'
        '❤️hello.mp4'       -> 'hello.mp4'
        '❤️.mp4'            -> '❤️.mp4'
        'video.mp4'         -> 'video.mp4'
    """
    name, ext = os.path.splitext(filename)
    cleaned = re.sub(r'\s+\(\d+\)$', '', name)
    cleaned = re.sub(r'\s+-\s+Join$', '', cleaned)
    without_prefix = strip_emoji_prefix(cleaned)
    # A stem made only of emoji/symbols would otherwise become '', turning
    # e.g. '❤️.mp4' into the hidden, nameless file '.mp4'.
    if without_prefix:
        cleaned = without_prefix
    return cleaned + ext