From 0472c89b2651c12b8f4751fd8c5e59675a94182b Mon Sep 17 00:00:00 2001 From: Coenraad Date: Sat, 1 Apr 2023 14:16:36 +0200 Subject: [PATCH] Consolidate the Unicode blocks and expand to include dingbats and Japanese outdoor signage emojis. An extensive search showed no official language symbols in this range. Consider using UnicodeCharacterTokenizer or tf.strings.unicode_encode and tf.strings.unicode_decode. --- trust_and_safety_models/nsfw/nsfw_text.py | 13 ++------------ 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/trust_and_safety_models/nsfw/nsfw_text.py b/trust_and_safety_models/nsfw/nsfw_text.py index 980fc8fd4..4aeec332a 100644 --- a/trust_and_safety_models/nsfw/nsfw_text.py +++ b/trust_and_safety_models/nsfw/nsfw_text.py @@ -40,17 +40,8 @@ REGEX_PATTERNS = [ EMOJI_PATTERN = re.compile( "([" - "\U0001F1E0-\U0001F1FF" - "\U0001F300-\U0001F5FF" - "\U0001F600-\U0001F64F" - "\U0001F680-\U0001F6FF" - "\U0001F700-\U0001F77F" - "\U0001F780-\U0001F7FF" - "\U0001F800-\U0001F8FF" - "\U0001F900-\U0001F9FF" - "\U0001FA00-\U0001FA6F" - "\U0001FA70-\U0001FAFF" - "\U00002702-\U000027B0" + "\U00002600-\U000027BF" + "\U0001F000-\U0001FAFF" "])" )