diff --git a/MANIFEST.in b/MANIFEST.in
index 4f20a26..012f4ca 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt
diff --git a/tests/test.py b/tests/test.py
index 91f990a..470d4fe 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
 )
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
-    assert_greater(word_frequency("can't", 'en'), 0)
-    eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
+
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
     # We do split at other punctuation, causing the word-combining rule to
     # apply.
-    assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])
+
+def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 87e82f4..f861c89 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -1,6 +1,9 @@
 from pkg_resources import resource_filename
 from functools import lru_cache
+import unicodedata
+from ftfy import chardata
 import langcodes
+import itertools
 import msgpack
 import re
 import gzip
@@ -9,14 +12,128 @@ import random
 import logging
 logger = logging.getLogger(__name__)
 
-
-NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]'
-NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
-TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
 
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
+def _emoji_char_class():
+    """
+    Build a regex character class (like "[a-cv-z]") matching the characters
+    we consider emoji. These are the characters that ftfy's chardata module
+    puts in class '3', excluding code points below U+2600 and the Unicode
+    replacement character U+FFFD.
+    """
+    ranges = []
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+
+EMOJI_RANGE = _emoji_char_class()
+
+def _non_punct_class():
+    """
+    Builds a regex that matches anything that is not one of the following
+    classes:
+    - P: punctuation
+    - S: symbols
+    - Z: separators
+    - C: control characters
+    This effectively treats symbols, including emoji, as punctuation; callers
+    that want to handle emoji separately should filter them out first.
+ """ + non_punct_file = DATA_PATH / 'non_punct.txt' + try: + with non_punct_file.open() as file: + return file.read() + except FileNotFoundError: + + out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') + + with non_punct_file.open(mode='w') as file: + file.write(out) + + return out + +def _combining_mark_class(): + """ + Builds a regex that matches anything that is a combining mark + """ + _combining_mark_file = DATA_PATH / 'combining_mark.txt' + try: + with _combining_mark_file.open() as file: + return file.read() + except FileNotFoundError: + + out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') + + with _combining_mark_file.open(mode='w') as file: + file.write(out) + + return out + + +def func_to_ranges(accept): + """ + Converts a function that accepts a single unicode character into a list of + ranges. Unassigned unicode are automatically accepted. + """ + ranges = [] + start = None + for x in range(0x110000): + cat = unicodedata.category(chr(x)) + if cat == 'Cn' or accept(chr(x)): + if start is None: + start = x + else: + if start is not None: + ranges.append((start, x-1)) + start = None + + if start is not None: + ranges.append((start, x)) + + return ranges + +unassigned_ranges = None + +def func_to_regex(accept): + """ + Converts a function that accepts a single unicode character into a regex. + Unassigned unicode characters are treated like their neighbors. + """ + ranges = [] + start = None + for x in range(0x110000): + cat = unicodedata.category(chr(x)) + if cat == 'Cn' or accept(chr(x)): + if start is None: + start = x + else: + if start is not None: + ranges.append((start, x-1)) + start = None + + if start is not None: + ranges.append((start, x)) + + global unassigned_ranges + if unassigned_ranges is None: + unassigned_ranges = set(func_to_ranges(lambda _: False)) + + ranges = [range for range in ranges if range not in unassigned_ranges] + + return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end)) + for start, end in ranges) + + +COMBINING_MARK_RE = re.compile(_combining_mark_class()) +NON_PUNCT_RANGE = _non_punct_class() + +TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) def simple_tokenize(text): """ @@ -34,21 +151,29 @@ def simple_tokenize(text): """ return [token.lower() for token in TOKEN_RE.findall(text)] - +mecab_tokenize = None def tokenize(text, lang): """ Tokenize this text in a way that's straightforward but appropriate for the language. So far, this means that Japanese is handled by mecab_tokenize, and - everything else is handled by simple_tokenize. + everything else is handled by simple_tokenize. Additionally, Arabic commas + and combining marks are removed. Strings that are looked up in wordfreq will be run through this function first, so that they can be expected to match the data. 
""" if lang == 'ja': - from wordfreq.mecab import mecab_tokenize + global mecab_tokenize + if mecab_tokenize is None: + from wordfreq.mecab import mecab_tokenize return mecab_tokenize(text) + elif lang == 'ar': + tokens = simple_tokenize(text) + tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel + tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens] + return [token for token in tokens if token] # remove empty strings else: return simple_tokenize(text) diff --git a/wordfreq/data/combining_mark.txt b/wordfreq/data/combining_mark.txt new file mode 100644 index 0000000..ac83af1 --- /dev/null +++ b/wordfreq/data/combining_mark.txt @@ -0,0 +1 @@ +[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿] \ No newline at end of file diff --git a/wordfreq/data/non_punct.txt b/wordfreq/data/non_punct.txt new file mode 100644 index 0000000..85af3b5 --- /dev/null +++ b/wordfreq/data/non_punct.txt @@ -0,0 +1 @@ +[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0-9A-Za-zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿] \ No newline at end of file