Consolidate the Unicode blocks and expand them to include dingbats and Japanese outdoor signage emojis. An extensive search showed no official language symbols in this range. Consider using UnicodeCharacterTokenizer or tf.strings.unicode_encode and tf.strings.unicode_decode.

This commit is contained in:
Coenraad 2023-04-01 14:16:36 +02:00
parent ec83d01dca
commit 0472c89b26

View File

@ -40,17 +40,8 @@ REGEX_PATTERNS = [
# Matches exactly one symbol/emoji code point. The surrounding parentheses form
# capture group 1, so the matched character is available as group(1) / \1.
#
# Only two ranges are needed: every narrower emoji range (flags, emoticons,
# transport symbols, ...) is a subset of one of them, and listing subsets
# inside a character class adds nothing to the set of matched characters.
EMOJI_PATTERN = re.compile(
    "(["
    # U+2600..U+27BF: Miscellaneous Symbols and Dingbats.
    "\U00002600-\U000027BF"
    # U+1F000..U+1FAFF: the supplementary symbol blocks from Mahjong Tiles
    # through Symbols and Pictographs Extended-A, including regional-indicator
    # flags, emoticons, and transport/map symbols.
    "\U0001F000-\U0001FAFF"
    "])"
)