Merge pull request #11 from LuminosoInsight/split-emoji

wordfreq now splits emoji from text
Rob Speer 2015-06-26 12:12:51 -04:00
commit 6c76942da2
5 changed files with 143 additions and 11 deletions
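
In practice, the change means emoji come back as their own tokens. A quick sketch of the new behavior (mirroring the updated tests below; the exact results assume this branch of wordfreq is installed):

from wordfreq import tokenize

print(tokenize('😂test', 'en'))   # expected: ['😂', 'test']
print(tokenize("can't", 'en'))    # apostrophes still stay inside words: ["can't"]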

MANIFEST.in

@@ -1,2 +1,3 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt

tests/test.py

@@ -1,6 +1,6 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq, iter_wordlist,
top_n_list, random_words, random_ascii_words
top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data, while the fake word "plan't" can't be found.
assert_greater(word_frequency("can't", 'en'), 0)
eq_(word_frequency("plan't", 'en'), 0)
eq_(tokenize("can't", 'en'), ["can't"])
eq_(tokenize("plan't", 'en'), ["plan't"])
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
# We do split at other punctuation, causing the word-combining rule to
# apply.
assert_greater(word_frequency("can.t", 'en'), 0)
eq_(tokenize("can.t", 'en'), ['can', 't'])
def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)
assert_less(plant, word_frequency('plan', 'en'))

wordfreq/__init__.py

@@ -1,6 +1,9 @@
from pkg_resources import resource_filename
from functools import lru_cache
import unicodedata
from ftfy import chardata
import langcodes
import itertools
import msgpack
import re
import gzip
@@ -9,14 +12,128 @@ import random
import logging
logger = logging.getLogger(__name__)
NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff０-９Ａ-Ｚａ-ｚ\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000
def _emoji_char_class():
"""
Build a regex character set (like "[a-cv-z]") matching the characters we
consider emoji: those that ftfy's chardata module assigns to class '3', at
or above U+2600, excluding U+FFFD (the Unicode replacement character).
"""
ranges = []
for i, c in enumerate(chardata.CHAR_CLASS_STRING):
if c == '3' and i >= 0x2600 and i != 0xfffd:
if ranges and i == ranges[-1][1] + 1:
ranges[-1][1] = i
else:
ranges.append([i, i])
return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
EMOJI_RANGE = _emoji_char_class()
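
The resulting EMOJI_RANGE is a plain character class, so it matches exactly one emoji-ish character at a time. A hypothetical check (it assumes a compatible ftfy version is installed, since chardata.CHAR_CLASS_STRING is ftfy's internal classification table):

import re

# One match per character that ftfy puts in class '3' at or above U+2600.
emoji_re = re.compile(EMOJI_RANGE)
print(emoji_re.findall('I 😂😂 you'))   # expected: ['😂', '😂']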
def _non_punct_class():
"""
Builds a regex that matches anything that is not one of the following
classes:
- P: punctuation
- S: symbols
- Z: separators
- C: control characters
This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first.
"""
non_punct_file = DATA_PATH / 'non_punct.txt'
try:
with non_punct_file.open() as file:
return file.read()
except FileNotFoundError:
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
with non_punct_file.open(mode='w') as file:
file.write(out)
return out
def _combining_mark_class():
"""
Builds a regex that matches anything that is a combining mark.
"""
_combining_mark_file = DATA_PATH / 'combining_mark.txt'
try:
with _combining_mark_file.open() as file:
return file.read()
except FileNotFoundError:
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
with _combining_mark_file.open(mode='w') as file:
file.write(out)
return out
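
_non_punct_class and _combining_mark_class are the same read-through cache: scanning all 0x110000 code points with unicodedata is slow, so the generated class is stored under wordfreq/data and shipped with the package (hence the new *.txt line in MANIFEST.in). A generic sketch of that pattern; the cached_char_class helper below is illustrative, not part of this diff:

def cached_char_class(filename, accept):
    # Use the precomputed character class shipped in wordfreq/data if it
    # exists; otherwise rebuild it (slow) and cache it for next time.
    path = DATA_PATH / filename
    try:
        with path.open() as file:
            return file.read()
    except FileNotFoundError:
        out = func_to_regex(accept)
        with path.open(mode='w') as file:
            file.write(out)
        return out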
def func_to_ranges(accept):
"""
Converts a function that accepts a single Unicode character into a list of
ranges. Unassigned Unicode characters are automatically accepted.
"""
ranges = []
start = None
for x in range(0x110000):
cat = unicodedata.category(chr(x))
if cat == 'Cn' or accept(chr(x)):
if start is None:
start = x
else:
if start is not None:
ranges.append((start, x-1))
start = None
if start is not None:
ranges.append((start, x))
return ranges
unassigned_ranges = None
def func_to_regex(accept):
"""
Converts a function that accepts a single Unicode character into a regex.
Unassigned Unicode characters are treated like their neighbors: they are
accepted while scanning, and ranges consisting only of unassigned
characters are then filtered out.
"""
ranges = []
start = None
for x in range(0x110000):
cat = unicodedata.category(chr(x))
if cat == 'Cn' or accept(chr(x)):
if start is None:
start = x
else:
if start is not None:
ranges.append((start, x-1))
start = None
if start is not None:
ranges.append((start, x))
global unassigned_ranges
if unassigned_ranges is None:
unassigned_ranges = set(func_to_ranges(lambda _: False))
ranges = [range for range in ranges if range not in unassigned_ranges]
return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
for start, end in ranges)
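
A small worked example of the coalescing (hypothetical interpreter session; it is slow, since it scans the full code point range twice): consecutive accepted characters collapse into start-end spans, and spans made up only of unassigned characters are filtered back out.

# 'a'-'c' and 'x'-'z' are each contiguous, so two spans survive.
print(func_to_regex(lambda c: c in 'abcxyz'))   # expected: '[a-cx-z]'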
COMBINING_MARK_RE = re.compile(_combining_mark_class())
NON_PUNCT_RANGE = _non_punct_class()
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
def simple_tokenize(text):
"""
@@ -34,21 +151,29 @@ def simple_tokenize(text):
"""
return [token.lower() for token in TOKEN_RE.findall(text)]
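
Because TOKEN_RE tries EMOJI_RANGE before the word pattern, an emoji can never glue onto a neighboring word, while the apostrophe rule is unchanged. Expected behavior, mirroring the updated tests:

print(simple_tokenize("I can't even 😂😂"))
# expected: ['i', "can't", 'even', '😂', '😂']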
mecab_tokenize = None
def tokenize(text, lang):
"""
Tokenize this text in a way that's straightforward but appropriate for
the language.
So far, this means that Japanese is handled by mecab_tokenize, and
everything else is handled by simple_tokenize.
everything else is handled by simple_tokenize. Additionally, for Arabic, the
tatweel character and combining marks are removed.
Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data.
"""
if lang == 'ja':
from wordfreq.mecab import mecab_tokenize
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
elif lang == 'ar':
tokens = simple_tokenize(text)
tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel
tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens]
return [token for token in tokens if token] # remove empty strings
else:
return simple_tokenize(text)
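
For Arabic, these extra steps mean that decorated and bare spellings of a word tokenize identically. A sketch (the vowel points below are combining marks; ـ is the cosmetic tatweel/kashida character):

print(tokenize('كِتَاب', 'ar'))   # marks stripped: expected ['كتاب']
print(tokenize('كـتاب', 'ar'))    # tatweel stripped: expected ['كتاب']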

wordfreq/data/combining_mark.txt

@@ -0,0 +1 @@
[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿]

wordfreq/data/non_punct.txt

@@ -0,0 +1 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0---zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿]