Strip leading emoji/special-symbol prefix from filenames

Add _is_strippable_prefix_char() to detect emojis, symbols, variation
selectors, and whitespace using Unicode ranges and categories. Add
strip_emoji_prefix() to strip matching leading chars. Update
strip_duplicate_suffix() to apply prefix stripping in addition to the
existing suffix cleanup.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
tigerenwork 2026-05-27 01:21:51 +08:00
parent 9741c726bc
commit 662122dbf6
1 changed files with 52 additions and 6 deletions

View File

@ -2,21 +2,67 @@ import os
import re
import sys
import glob
import unicodedata
def _is_strippable_prefix_char(ch):
"""Return True if ch is an emoji/symbol/whitespace character that should
be stripped when it appears at the start of a filename.
"""
if ch.isspace():
return True
cp = ord(ch)
# Variation selectors (e.g. U+FE0F after ❤) and zero-width joiner
if 0xFE00 <= cp <= 0xFE0F or cp == 0x200D:
return True
# Common emoji / pictographic / symbol blocks
emoji_ranges = (
(0x2300, 0x23FF), # Misc Technical (⏰ etc.)
(0x2460, 0x24FF), # Enclosed Alphanumerics
(0x2500, 0x257F), # Box Drawing
(0x2580, 0x259F), # Block Elements
(0x25A0, 0x25FF), # Geometric Shapes (⚫ is 0x26AB, but ▶ here)
(0x2600, 0x26FF), # Misc Symbols (⚫ ❤ ☀ ...)
(0x2700, 0x27BF), # Dingbats (✅ ✨ ...)
(0x2B00, 0x2BFF), # Misc Symbols and Arrows
(0x1F000, 0x1FFFF), # Supplementary symbols & emoji planes
)
for lo, hi in emoji_ranges:
if lo <= cp <= hi:
return True
# Unicode general categories: Symbol (So/Sk/Sm) and format chars
cat = unicodedata.category(ch)
if cat in ('So', 'Sk', 'Sm', 'Cf'):
return True
return False
def strip_emoji_prefix(name):
    """Return name without its leading run of strippable characters.

    Characters are tested one by one with _is_strippable_prefix_char();
    everything from the first non-strippable character onward is kept.
    An empty string is returned when every character is strippable.
    """
    for position, char in enumerate(name):
        if not _is_strippable_prefix_char(char):
            return name[position:]
    # Either name was empty or it consisted solely of strippable characters.
    return ""
def strip_duplicate_suffix(filename):
    """Strip unwanted prefixes/suffixes from a filename, keeping its extension.

    The extension is split off with os.path.splitext() and the remaining
    stem is cleaned in this order:

    1. Suffix: a trailing OS-generated copy counter such as ' (2)' or ' (3)'.
    2. Suffix: a trailing ' - Join'.
    3. Prefix: leading emoji / special symbols / whitespace, as decided by
       strip_emoji_prefix().

    If step 3 would remove every remaining character, the prefix is left
    in place, so a stem made only of emoji is not reduced to nothing
    (which would turn 'X.mp4' into the dotfile '.mp4').

    Examples:
        'video (2).mp4'      -> 'video.mp4'
        'video (3).mp4'      -> 'video.mp4'
        'video - Join.mp4'   -> 'video.mp4'
        '⚫️“啊~爸”.mp4'      -> '“啊~爸”.mp4'
        '⚫.mp4'             -> '⚫.mp4'
        'hello.mp4'          -> 'hello.mp4'
        'video.mp4'          -> 'video.mp4'

    Args:
        filename: File name (with or without extension) to clean.

    Returns:
        The cleaned stem followed by the original extension.
    """
    name, ext = os.path.splitext(filename)
    cleaned = re.sub(r'\s+\(\d+\)$', '', name)
    cleaned = re.sub(r'\s+-\s+Join$', '', cleaned)
    # Only accept the prefix-stripped stem when something is left of it;
    # otherwise keep the decorated stem rather than emit a bare extension.
    without_prefix = strip_emoji_prefix(cleaned)
    if without_prefix:
        cleaned = without_prefix
    return cleaned + ext