Merge pull request #22 from LuminosoInsight/standard-tokenizer

Use a more standard Unicode tokenizer
2024-12-23 17:31:41 +00:00 · 2015-08-27 11:56:19 -04:00 · 2015-08-27 11:56:19 -04:00 · e6d9b36203
commit e6d9b36203
parent 2b8089e2b1 b952676679
43 changed files with 218 additions and 179 deletions
--- a/README.md
+++ b/README.md
@ -2,6 +2,7 @@ Tools for working with word frequencies from various corpora.
 Author: Rob Speer
 ## Installation
 wordfreq requires Python 3 and depends on a few other Python modules
@ -21,11 +22,25 @@ install them on Ubuntu:
    sudo apt-get install mecab-ipadic-utf8 libmecab-dev
    pip3 install mecab-python3
 ## Unicode data
-The tokenizers that split non-Japanese phrases utilize regexes built using the
+## Tokenization
-`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0.  To
+
-update these regexes, run `scripts/gen_regex.py`.
+wordfreq uses the Python package `regex`, which is a more advanced
 implementation of regular expressions than the standard library, to
 separate text into tokens that can be counted consistently. `regex`
 produces tokens that follow the recommendations in [Unicode
 Annex #29, Text Segmentation][uax29].
 There are language-specific exceptions:
 - In Arabic, it additionally normalizes ligatures and removes combining marks.
 - In Japanese, instead of using the regex library, it uses the external library
  `mecab-python3`. This is an optional dependency of wordfreq, and compiling
  it requires the `libmecab-dev` system package to be installed.
 - It does not yet attempt to tokenize Chinese ideograms.
 [uax29]: http://unicode.org/reports/tr29/
 ## License
@ -56,5 +71,5 @@ sources:
 Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
-Policy. This software only gives statistics about words that are very commonly
+Policy. This software gives statistics about words that are commonly used on
-used on Twitter; it does not display or republish any Twitter content.
+Twitter; it does not display or republish any Twitter content.
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@ -1,76 +0,0 @@
 import unicodedata
 from ftfy import chardata
 import pathlib
 from pkg_resources import resource_filename
 CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 def func_to_regex(accept_func):
    """
    Build a regex character class from a predicate over codepoints.

    `accept_func` is called with every numerical codepoint covered by
    CATEGORIES, in increasing order. The result is a string of the form
    '[a-bc-d...]' listing the ranges of accepted characters.

    A rejected codepoint only closes the current range if it is assigned
    (its category is not 'Cn'), so ranges separated only by unassigned
    characters are merged for efficiency.
    """
    ranges = []
    # in_range is True while the most recent range may still be extended.
    in_range = False
    for codepoint, category in enumerate(CATEGORIES):
        accepted = accept_func(codepoint)
        if accepted and in_range:
            ranges[-1][1] = codepoint
        elif accepted:
            ranges.append([codepoint, codepoint])
            in_range = True
        elif category != 'Cn':
            in_range = False
    pieces = ['%c-%c' % (start, end) for start, end in ranges]
    return '[%s]' % ''.join(pieces)
 def cache_regex_from_func(filename, func):
    """
    Generate a regex character class from `func`, a function that accepts a
    numerical codepoint, and cache it in the data path at `filename`.

    The file is overwritten if it already exists. It is always written as
    UTF-8: the generated character class contains non-ASCII characters, so
    relying on the platform's default encoding could fail or produce a file
    that reads back differently on another machine.
    """
    with (DATA_PATH / filename).open(mode='w', encoding='utf-8') as file:
        file.write(func_to_regex(func))
 def _is_emoji_codepoint(i):
    """
    Decide whether the numerical codepoint `i` is (likely) an emoji.

    A codepoint qualifies when ftfy's chardata module puts it in class '3'
    (its future-proofed version of Unicode category 'So'), it is at or above
    U+2600 (which leaves out symbols like © and ™), and it is not the
    replacement character U+FFFD.
    """
    in_symbol_class = chardata.CHAR_CLASS_STRING[i] == '3'
    return in_symbol_class and i >= 0x2600 and i != 0xfffd
 def _is_non_punct_codepoint(i):
    """
    Decide whether the numerical codepoint `i` falls outside all of these
    major Unicode classes:

    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters

    Symbols, including emoji, are therefore treated like punctuation here;
    users that want to accept emoji should add them separately.
    """
    major_class = CATEGORIES[i][0]
    return major_class not in 'PSZC'
 def _is_combining_mark_codepoint(i):
    """
    Decide whether the numerical codepoint `i` is a combining mark, that is,
    whether its Unicode category begins with 'M'.
    """
    major_class = CATEGORIES[i][0]
    return major_class == 'M'
 if __name__ == '__main__':
    # Each output file in the data path is paired with the codepoint
    # predicate that defines its character class.
    regex_files = [
        ('emoji.txt', _is_emoji_codepoint),
        ('non_punct.txt', _is_non_punct_codepoint),
        ('combining_mark.txt', _is_combining_mark_codepoint),
    ]
    for regex_filename, codepoint_func in regex_files:
        cache_regex_from_func(regex_filename, codepoint_func)
--- a/setup.py
+++ b/setup.py
@ -26,14 +26,14 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
    dependencies.append('pathlib')
 setup(
    name="wordfreq",
-    version='1.0',
+    version='1.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
--- a/tests/test.py
+++ b/tests/test.py
@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])
    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-    # We do split at other punctuation, causing the word-combining rule to
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
    # apply.
    eq_(tokenize("can.t", 'en'), ['can', 't'])
 def test_casefolding():
@ -110,11 +114,11 @@ def test_casefolding():
 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
+    ff = word_frequency("flip-flop", 'en')
-    assert_greater(plant, 0)
+    assert_greater(ff, 0)
    assert_almost_equal(
-        1.0 / plant,
+        1.0 / ff,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )
@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
    random_ascii_words(lang='zh')
 def test_ar():
 def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
@ -152,3 +156,16 @@ def test_ar():
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )
 def test_ideographic_fallback():
    # Chinese text is not segmented: the whole run of ideographs should come
    # back as a single token.
    zh_tokens = tokenize('中国文字', 'zh')
    eq_(zh_tokens, ['中国文字'])
    # Japanese text tagged with the wrong language is only split where the
    # script changes.
    ja_text = 'ひらがなカタカナromaji'
    expected = ['ひらがな', 'カタカナ', 'romaji']
    eq_(tokenize(ja_text, 'en'), expected)
--- a/wordfreq/init.py
+++ b/wordfreq/init.py
@ -1,14 +1,13 @@
 from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
 import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
 import unicodedata
 logger = logging.getLogger(__name__)
@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 def load_range(filename):
    """
    Load a file from the data path and return its entire contents as a
    string.

    The file is decoded as UTF-8 explicitly. The data files contain
    non-ASCII characters, so reading them with the platform's default
    encoding could fail or yield different text on different machines.
    """
    with (DATA_PATH / filename).open(encoding='utf-8') as file:
        return file.read()
-EMOJI_RANGE = load_range('emoji.txt')
+# simple_tokenize is imported so that other things can import it from here.
-NON_PUNCT_RANGE = load_range('non_punct.txt')
+# Suppress the pyflakes warning.
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
+simple_tokenize = simple_tokenize
 COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 def simple_tokenize(text):
    """
    Split `text` into case-folded tokens using TOKEN_RE.

    This tokenizer can be applied to most languages. A word is a sequence
    of 'token characters', an overly inclusive range that includes letters,
    Han characters, emoji, and assorted other characters, but excludes most
    punctuation and whitespace.

    For the sake of English, an apostrophe counts as part of a token only
    when it appears inside the character sequence, not on its edge:
    "cat's" is a token, but "cats'" is not.
    """
    matches = TOKEN_RE.findall(text)
    return [match.casefold() for match in matches]
 mecab_tokenize = None
 def tokenize(text, lang):
    """
    Split `text` into tokens using the tokenizer appropriate for `lang`.

    Japanese ('ja') is handed to mecab_tokenize, which is imported from
    wordfreq.mecab the first time it is needed. Every other language goes
    through simple_tokenize; Arabic ('ar') text is first passed through
    standardize_arabic.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    global mecab_tokenize
    if lang != 'ja':
        if lang == 'ar':
            text = standardize_arabic(text)
        return simple_tokenize(text)
    # The import is deferred so that wordfreq.mecab is only loaded when
    # Japanese text is actually tokenized.
    if mecab_tokenize is None:
        from wordfreq.mecab import mecab_tokenize
    return mecab_tokenize(text)
 def standardize_arabic(text):
    """
    Standardize Arabic text: delete tatweels, delete everything matched by
    COMBINING_MARK_RE, then apply NFKC normalization to what remains.
    """
    without_tatweels = text.replace('ـ', '')
    without_marks = COMBINING_MARK_RE.sub('', without_tatweels)
    return unicodedata.normalize('NFKC', without_marks)
 def read_cBpack(filename):
--- a/wordfreq/data/combined_ar.msgpack.gz
+++ b/wordfreq/data/combined_ar.msgpack.gz
--- a/wordfreq/data/combined_de.msgpack.gz
+++ b/wordfreq/data/combined_de.msgpack.gz
--- a/wordfreq/data/combined_el.msgpack.gz
+++ b/wordfreq/data/combined_el.msgpack.gz
--- a/wordfreq/data/combined_en.msgpack.gz
+++ b/wordfreq/data/combined_en.msgpack.gz
--- a/wordfreq/data/combined_es.msgpack.gz
+++ b/wordfreq/data/combined_es.msgpack.gz
--- a/wordfreq/data/combined_fr.msgpack.gz
+++ b/wordfreq/data/combined_fr.msgpack.gz
--- a/wordfreq/data/combined_id.msgpack.gz
+++ b/wordfreq/data/combined_id.msgpack.gz
--- a/wordfreq/data/combined_it.msgpack.gz
+++ b/wordfreq/data/combined_it.msgpack.gz
--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz
--- a/wordfreq/data/combined_ko.msgpack.gz
+++ b/wordfreq/data/combined_ko.msgpack.gz
--- a/wordfreq/data/combined_ms.msgpack.gz
+++ b/wordfreq/data/combined_ms.msgpack.gz
--- a/wordfreq/data/combined_nl.msgpack.gz
+++ b/wordfreq/data/combined_nl.msgpack.gz
--- a/wordfreq/data/combined_pt.msgpack.gz
+++ b/wordfreq/data/combined_pt.msgpack.gz
--- a/wordfreq/data/combined_ru.msgpack.gz
+++ b/wordfreq/data/combined_ru.msgpack.gz
--- a/wordfreq/data/combined_zh.msgpack.gz
+++ b/wordfreq/data/combined_zh.msgpack.gz
--- a/wordfreq/data/combining_mark.txt
+++ b/wordfreq/data/combining_mark.txt
@ -1 +0,0 @@
 [̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@ -1 +0,0 @@
 [☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]
--- a/wordfreq/data/non_punct.txt
+++ b/wordfreq/data/non_punct.txt
@ -1 +0,0 @@
 [0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ０-９Ａ-Ｚａ-ｚｦ-ￜ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]
--- a/wordfreq/data/twitter_ar.msgpack.gz
+++ b/wordfreq/data/twitter_ar.msgpack.gz
--- a/wordfreq/data/twitter_de.msgpack.gz
+++ b/wordfreq/data/twitter_de.msgpack.gz
--- a/wordfreq/data/twitter_en.msgpack.gz
+++ b/wordfreq/data/twitter_en.msgpack.gz
--- a/wordfreq/data/twitter_es.msgpack.gz
+++ b/wordfreq/data/twitter_es.msgpack.gz
--- a/wordfreq/data/twitter_fr.msgpack.gz
+++ b/wordfreq/data/twitter_fr.msgpack.gz
--- a/wordfreq/data/twitter_id.msgpack.gz
+++ b/wordfreq/data/twitter_id.msgpack.gz
--- a/wordfreq/data/twitter_it.msgpack.gz
+++ b/wordfreq/data/twitter_it.msgpack.gz
--- a/wordfreq/data/twitter_ja.msgpack.gz
+++ b/wordfreq/data/twitter_ja.msgpack.gz
--- a/wordfreq/data/twitter_ko.msgpack.gz
+++ b/wordfreq/data/twitter_ko.msgpack.gz
--- a/wordfreq/data/twitter_ms.msgpack.gz
+++ b/wordfreq/data/twitter_ms.msgpack.gz
--- a/wordfreq/data/twitter_nl.msgpack.gz
+++ b/wordfreq/data/twitter_nl.msgpack.gz
--- a/wordfreq/data/twitter_pt.msgpack.gz
+++ b/wordfreq/data/twitter_pt.msgpack.gz
--- a/wordfreq/data/twitter_ru.msgpack.gz
+++ b/wordfreq/data/twitter_ru.msgpack.gz
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@ -1,4 +1,5 @@
 import MeCab
 import unicodedata
 # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@ -14,6 +15,7 @@ def mecab_tokenize(text):
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
    text = unicodedata.normalize('NFKC', text.strip())
    return [line.split('\t')[0]
-            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            for line in MECAB_ANALYZER.parse(text).split('\n')
            if line != '' and line != 'EOS']
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -0,0 +1,114 @@
 import regex
 import unicodedata
 # The pattern used by simple_tokenize to find tokens. It is compiled with the
 # third-party `regex` package because it relies on Unicode properties such as
 # \p{So} and \p{Script=Hiragana}. The '#' lines inside the pattern are part
 # of the string itself; the VERBOSE flag makes the pattern compiler ignore
 # them along with the surrounding whitespace.
 TOKEN_RE = regex.compile(r"""
    # Case 1: a special case for Chinese and Japanese
    # -----------------------------------------------
    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
    # glued together as a single token. Without this case, the standard rule
    # (case 2) would make each character a separate token. This would be the
    # correct behavior for word-wrapping, but a messy failure mode for NLP
    # tokenization.
    #
    # It is, of course, better to use a tokenizer that is designed for Chinese
    # or Japanese text. This is effectively a fallback for when the wrong
    # tokenizer is used.
    #
    # This rule is listed first so that it takes precedence.
    [\p{IsIdeo}\p{Script=Hiragana}]+ |
    # Case 2: standard Unicode segmentation
    # -------------------------------------
    # The start of the token must be 'word-like', not punctuation or whitespace
    # or various other things. However, we allow characters of category So
    # (Symbol - Other) because many of these are emoji, which can convey
    # meaning.
    [\w\p{So}]
    # The rest of the token matches characters that are not any sort of space
    # (\S) and do not cause word breaks according to the Unicode word
    # segmentation heuristic (\B).
    (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)
 # Matches a single character that is either of category Mn or an Arabic
 # tatweel; remove_arabic_marks deletes every match of this pattern.
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 def simple_tokenize(text):
    """
    Split `text` into tokens with a straightforward, Unicode-aware token
    expression (TOKEN_RE).

    The text is NFC-normalized before matching. Each match then has leading
    and trailing apostrophes stripped and is case-folded.

    The expression mostly implements the rules of Unicode Annex #29 that
    are contained in the `regex` module's word boundary matching, including
    the refinement that splits words between apostrophes and vowels in order
    to separate tokens such as the French article «l'». Our customizations
    to the expression are:

    - Sequences of Chinese or Japanese characters (specifically, Han
      ideograms and hiragana) are left relatively untokenized, instead of
      each character becoming its own token.
    - Only tokens that start with a word-like character, or with a
      miscellaneous symbol such as an emoji, are output.
    - All spaces cause a break, even the "non-breaking" ones.
    """
    normalized = unicodedata.normalize('NFC', text)
    tokens = []
    for match in TOKEN_RE.findall(normalized):
        tokens.append(match.strip("'").casefold())
    return tokens
 def remove_arabic_marks(text):
    """
    Return `text` with the decorations matched by ARABIC_MARK_RE deleted:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify a
      word.
    """
    undecorated = ARABIC_MARK_RE.sub('', text)
    return undecorated
 mecab_tokenize = None
 def tokenize(text, lang):
    """
    Split `text` into tokens in a way that is relatively simple but
    appropriate for the language `lang`.

    So far, this means:

    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
    - Japanese ('ja') is delegated to mecab_tokenize, which is imported from
      wordfreq.mecab the first time it is needed.
    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
    - All other languages are tokenized by `simple_tokenize`, whose regex
      mostly implements the Word Segmentation section of Unicode Annex #29.
      See `simple_tokenize` for details.

    Text marked as Arabic ('ar') is additionally NFKC-normalized and has
    combining marks and tatweels removed before `simple_tokenize` runs.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    global mecab_tokenize
    if lang != 'ja':
        if lang == 'ar':
            compatibility_form = unicodedata.normalize('NFKC', text)
            text = remove_arabic_marks(compatibility_form)
        return simple_tokenize(text)
    # The import is deferred so that wordfreq.mecab is only loaded when
    # Japanese text is actually tokenized.
    if mecab_tokenize is None:
        from wordfreq.mecab import mecab_tokenize
    return mecab_tokenize(text)
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@ -6,7 +6,7 @@ def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
        'it', "won't", 'fail',
    ]
    result = cld2_surface_tokenizer(text)
--- a/wordfreq_builder/tests/test_urls.py
+++ b/wordfreq_builder/tests/test_urls.py
@ -0,0 +1,20 @@
 from wordfreq_builder.word_counts import URL_RE
 from nose.tools import eq_
 def check_url(url):
    # URL_RE must match at the start of `url`, and the match must span the
    # entire string.
    found = URL_RE.match(url)
    assert found
    whole_string = (0, len(url))
    eq_(found.span(), whole_string)
 def test_url_re():
    urls_to_match = [
        # URLs like this are all over the Arabic Wikipedia. Here's one with
        # the student ID blanked out.
        'http://www.ju.edu.jo/alumnicard/0000000.aspx',
        'https://example.com/űnicode.html',
        'http://☃.net',
    ]
    for url in urls_to_match:
        yield check_url, url
    # Only http:// and https:// URLs are meant to match.
    assert not URL_RE.match('ftp://127.0.0.1')
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@ -123,7 +123,6 @@ def google_books_deps(dirname_in):
 def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
                 languages):
    lines = []
    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -1,7 +1,6 @@
-from html.entities import name2codepoint
+from wordfreq import tokenize
 from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
 from ftfy.fixes import unescape_html
-import re
+import regex
 import pycld2
 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
@ -11,19 +10,22 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
        '\x0e-\x1f',
        '\x7f-\x9f',
        '\ud800-\udfff',
-        '\ufdd0-\ufdef'
+        '\ufdd0-\ufdef',
        '\N{HANGUL FILLER}',
        '\N{HANGUL CHOSEONG FILLER}',
        '\N{HANGUL JUNGSEONG FILLER}'
    ] +
    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
-CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
+CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
-TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
+TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
-TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
 def cld2_surface_tokenizer(text):
    """
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
@ -35,7 +37,7 @@ def cld2_surface_tokenizer(text):
 def cld2_detect_language(text):
    """
-    Uses CLD2 to detect the language
+    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
@ -45,16 +47,19 @@ def cld2_detect_language(text):
    #       Language code: str
    #       Percent of text in this language: float
    #       Confidence score: float))
-    
+
    text = CLD2_BAD_CHARS_RE.sub('', text)
    return pycld2.detect(text)[2][0][1]
 def tokenize_twitter(in_filename, out_prefix, tokenizer):
    """
-    Process a file by running it through the given tokenizer, sorting the
+    Process a file by running it through the Twitter-specific tokenizer,
-    results by the language of each line, and inserting newlines
+    which uses cld2 for language detection, and removes Twitter handles
-    to mark the token boundaries.
+    and t.co URLs.
    Produces output files that are separated by language, with newlines
    between the tokens.
    """
    out_files = {}
    with open(in_filename, encoding='utf-8') as in_file:
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@ -6,6 +6,12 @@ import math
 import csv
 import msgpack
 import gzip
 import regex
 # Match common cases of URLs: the scheme http:// or https:// followed by
 # non-whitespace characters.
 URL_RE = regex.compile(r'https?://(?:\S)+')
 def count_tokens(filename):
@ -13,11 +19,13 @@ def count_tokens(filename):
    Count tokens that appear in a file, running each line through our
    simple tokenizer.
-    Unicode errors in the input data will become token boundaries.
+    URLs will be skipped, and Unicode errors will become separate tokens
    containing '<EFBFBD>'.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1
		`@ -1 +0,0 @@`
			[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
		`@ -1 +0,0 @@`
			`[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]`
		`@ -1 +0,0 @@`
			[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ０-９Ａ-Ｚａ-ｚｦ-ￜ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]