Merge pull request #22 from LuminosoInsight/standard-tokenizer

Use a more standard Unicode tokenizer
2024-12-23 17:31:41 +00:00 · 2015-08-27 11:56:19 -04:00 · 2015-08-27 11:56:19 -04:00 · e6d9b36203
commit e6d9b36203
parent 2b8089e2b1 b952676679
43 changed files with 218 additions and 179 deletions
--- a/README.md
+++ b/README.md
@ -2,6 +2,7 @@ Tools for working with word frequencies from various corpora.

 Author: Rob Speer

+
 ## Installation

 wordfreq requires Python 3 and depends on a few other Python modules
@ -21,11 +22,25 @@ install them on Ubuntu:
    sudo apt-get install mecab-ipadic-utf8 libmecab-dev
    pip3 install mecab-python3

-## Unicode data

-The tokenizers that split non-Japanese phrases utilize regexes built using the
-`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0.  To
-update these regexes, run `scripts/gen_regex.py`.
+## Tokenization
+
+wordfreq uses the Python package `regex`, which is a more advanced
+implementation of regular expressions than the standard library, to
+separate text into tokens that can be counted consistently. `regex`
+produces tokens that follow the recommendations in [Unicode
+Annex #29, Text Segmentation][uax29].
+
+There are language-specific exceptions:
+
+- In Arabic, it additionally normalizes ligatures and removes combining marks.
+- In Japanese, instead of using the regex library, it uses the external library
+  `mecab-python3`. This is an optional dependency of wordfreq, and compiling
+  it requires the `libmecab-dev` system package to be installed.
+- It does not yet attempt to tokenize Chinese ideograms.
+
+[uax29]: http://unicode.org/reports/tr29/
+

 ## License

@ -56,5 +71,5 @@ sources:

 Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
-Policy. This software only gives statistics about words that are very commonly
-used on Twitter; it does not display or republish any Twitter content.
+Policy. This software gives statistics about words that are commonly used on
+Twitter; it does not display or republish any Twitter content.
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@ -1,76 +0,0 @@
-import unicodedata
-from ftfy import chardata
-import pathlib
-from pkg_resources import resource_filename
-
-
-CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
-
-
-def func_to_regex(accept_func):
-    """
-    Given a function that returns True or False for a numerical codepoint,
-    return a regex character class accepting the characters resulting in True.
-    Ranges separated only by unassigned characters are merged for efficiency.
-    """
-    # parsing_range is True if the current codepoint might be in a range that
-    # the regex will accept
-    parsing_range = False
-    ranges = []
-
-    for codepoint, category in enumerate(CATEGORIES):
-        if accept_func(codepoint):
-            if not parsing_range:
-                ranges.append([codepoint, codepoint])
-                parsing_range = True
-            else:
-                ranges[-1][1] = codepoint
-        elif category != 'Cn':
-            parsing_range = False
-
-    return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
-
-
-def cache_regex_from_func(filename, func):
-    """
-    Generates a regex from a function that accepts a single unicode character,
-    and caches it in the data path at filename.
-    """
-    with (DATA_PATH / filename).open(mode='w') as file:
-        file.write(func_to_regex(func))
-
-
-def _is_emoji_codepoint(i):
-    """
-    Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
-    character (as future-proofed by the ftfy chardata module) but excluding
-    symbols like © and ™ below U+2600 and the replacement character U+FFFD.
-    """
-    return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd
-
-
-def _is_non_punct_codepoint(i):
-    """
-    Report whether a numerical codepoint is not one of the following classes:
-    - P: punctuation
-    - S: symbols
-    - Z: separators
-    - C: control characters
-    This will classify symbols, including emoji, as punctuation; users that
-    want to accept emoji should add them separately.
-    """
-    return CATEGORIES[i][0] not in 'PSZC'
-
-
-def _is_combining_mark_codepoint(i):
-    """
-    Report whether a numerical codepoint is a combining mark (Unicode 'M').
-    """
-    return CATEGORIES[i][0] == 'M'
-
-
-if __name__ == '__main__':
-    cache_regex_from_func('emoji.txt', _is_emoji_codepoint)
-    cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint)
-    cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint)
--- a/setup.py
+++ b/setup.py
@ -26,14 +26,14 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
    dependencies.append('pathlib')


 setup(
    name="wordfreq",
-    version='1.0',
+    version='1.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
--- a/tests/test.py
+++ b/tests/test.py
@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
+        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

+    # Certain punctuation does not inherently split a word.
+    eq_(tokenize("Anything is possible at zombo.com", 'en'),
+        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

-    # We do split at other punctuation, causing the word-combining rule to
-    # apply.
-    eq_(tokenize("can.t", 'en'), ['can', 't'])
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])


 def test_casefolding():
@ -110,11 +114,11 @@ def test_casefolding():


 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
    assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )


@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
    random_ascii_words(lang='zh')

-def test_ar():

+def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
@ -152,3 +156,16 @@ def test_ar():
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
-import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
-import unicodedata

 logger = logging.getLogger(__name__)

@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()

-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
-
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize


 def read_cBpack(filename):
--- a/wordfreq/data/combined_ar.msgpack.gz
+++ b/wordfreq/data/combined_ar.msgpack.gz
--- a/wordfreq/data/combined_de.msgpack.gz
+++ b/wordfreq/data/combined_de.msgpack.gz
--- a/wordfreq/data/combined_el.msgpack.gz
+++ b/wordfreq/data/combined_el.msgpack.gz
--- a/wordfreq/data/combined_en.msgpack.gz
+++ b/wordfreq/data/combined_en.msgpack.gz
--- a/wordfreq/data/combined_es.msgpack.gz
+++ b/wordfreq/data/combined_es.msgpack.gz
--- a/wordfreq/data/combined_fr.msgpack.gz
+++ b/wordfreq/data/combined_fr.msgpack.gz
--- a/wordfreq/data/combined_id.msgpack.gz
+++ b/wordfreq/data/combined_id.msgpack.gz
--- a/wordfreq/data/combined_it.msgpack.gz
+++ b/wordfreq/data/combined_it.msgpack.gz
--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz
--- a/wordfreq/data/combined_ko.msgpack.gz
+++ b/wordfreq/data/combined_ko.msgpack.gz
--- a/wordfreq/data/combined_ms.msgpack.gz
+++ b/wordfreq/data/combined_ms.msgpack.gz
--- a/wordfreq/data/combined_nl.msgpack.gz
+++ b/wordfreq/data/combined_nl.msgpack.gz
--- a/wordfreq/data/combined_pt.msgpack.gz
+++ b/wordfreq/data/combined_pt.msgpack.gz
--- a/wordfreq/data/combined_ru.msgpack.gz
+++ b/wordfreq/data/combined_ru.msgpack.gz
--- a/wordfreq/data/combined_zh.msgpack.gz
+++ b/wordfreq/data/combined_zh.msgpack.gz
--- a/wordfreq/data/combining_mark.txt
+++ b/wordfreq/data/combining_mark.txt
@ -1 +0,0 @@
-[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@ -1 +0,0 @@
-[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]
--- a/wordfreq/data/non_punct.txt
+++ b/wordfreq/data/non_punct.txt
@ -1 +0,0 @@
-[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ０-９Ａ-Ｚａ-ｚｦ-ￜ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]
--- a/wordfreq/data/twitter_ar.msgpack.gz
+++ b/wordfreq/data/twitter_ar.msgpack.gz
--- a/wordfreq/data/twitter_de.msgpack.gz
+++ b/wordfreq/data/twitter_de.msgpack.gz
--- a/wordfreq/data/twitter_en.msgpack.gz
+++ b/wordfreq/data/twitter_en.msgpack.gz
--- a/wordfreq/data/twitter_es.msgpack.gz
+++ b/wordfreq/data/twitter_es.msgpack.gz
--- a/wordfreq/data/twitter_fr.msgpack.gz
+++ b/wordfreq/data/twitter_fr.msgpack.gz
--- a/wordfreq/data/twitter_id.msgpack.gz
+++ b/wordfreq/data/twitter_id.msgpack.gz
--- a/wordfreq/data/twitter_it.msgpack.gz
+++ b/wordfreq/data/twitter_it.msgpack.gz
--- a/wordfreq/data/twitter_ja.msgpack.gz
+++ b/wordfreq/data/twitter_ja.msgpack.gz
--- a/wordfreq/data/twitter_ko.msgpack.gz
+++ b/wordfreq/data/twitter_ko.msgpack.gz
--- a/wordfreq/data/twitter_ms.msgpack.gz
+++ b/wordfreq/data/twitter_ms.msgpack.gz
--- a/wordfreq/data/twitter_nl.msgpack.gz
+++ b/wordfreq/data/twitter_nl.msgpack.gz
--- a/wordfreq/data/twitter_pt.msgpack.gz
+++ b/wordfreq/data/twitter_pt.msgpack.gz
--- a/wordfreq/data/twitter_ru.msgpack.gz
+++ b/wordfreq/data/twitter_ru.msgpack.gz
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@ -1,4 +1,5 @@
 import MeCab
+import unicodedata


 # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@ -14,6 +15,7 @@ def mecab_tokenize(text):
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
+    text = unicodedata.normalize('NFKC', text.strip())
    return [line.split('\t')[0]
-            for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
+            for line in MECAB_ANALYZER.parse(text).split('\n')
            if line != '' and line != 'EOS']
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -0,0 +1,114 @@
+import regex
+import unicodedata
+
+
+TOKEN_RE = regex.compile(r"""
+    # Case 1: a special case for Chinese and Japanese
+    # -----------------------------------------------
+
+    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
+    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
+    # glued together as a single token. Without this case, the standard rule
+    # (case 2) would make each character a separate token. This would be the
+    # correct behavior for word-wrapping, but a messy failure mode for NLP
+    # tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese
+    # or Japanese text. This is effectively a fallback for when the wrong
+    # tokenizer is used.
+    #
+    # This rule is listed first so that it takes precedence.
+
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+
+    # Case 2: standard Unicode segmentation
+    # -------------------------------------
+
+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+
+    [\w\p{So}]
+
+    # The rest of the token matches characters that are not any sort of space
+    # (\S) and do not cause word breaks according to the Unicode word
+    # segmentation heuristic (\B).
+
+    (?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
+ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+
+
+def simple_tokenize(text):
+    """
+    Tokenize the given text using a straightforward, Unicode-aware token
+    expression.
+
+    The expression mostly implements the rules of Unicode Annex #29 that
+    are contained in the `regex` module's word boundary matching, including
+    the refinement that splits words between apostrophes and vowels in order
+    to separate tokens such as the French article «l'». Our customizations
+    to the expression are:
+
+    - It leaves sequences of Chinese or Japanese characters (specifically, Han
+      ideograms and hiragana) relatively untokenized, instead of splitting each
+      character into its own token.
+
+    - It outputs only the tokens that start with a word-like character, or
+      miscellaneous symbols such as emoji.
+
+    - It breaks on all spaces, even the "non-breaking" ones.
+    """
+    text = unicodedata.normalize('NFC', text)
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+
+
+def remove_arabic_marks(text):
+    """
+    Remove decorations from Arabic words:
+
+    - Combining marks of class Mn, which tend to represent non-essential
+      vowel markings.
+    - Tatweels, horizontal segments that are used to extend or justify a
+      word.
+    """
+    return ARABIC_MARK_RE.sub('', text)
+
+
+mecab_tokenize = None
+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's relatively simple but appropriate for
+    the language.
+
+    So far, this means:
+
+    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Japanese will be delegated to the external mecab-python module.
+    - Chinese or Japanese texts that aren't identified as the appropriate
+      language will only split on punctuation and script boundaries, giving
+      you untokenized globs of characters that probably represent many words.
+    - All other languages will be tokenized using a regex that mostly
+      implements the Word Segmentation section of Unicode Annex #29.
+      See `simple_tokenize` for details.
+
+    Additionally, the text will be case-folded to lowercase, and text marked
+    as Arabic will be normalized more strongly and have combining marks and
+    tatweels removed.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+
+    if lang == 'ar':
+        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
+
+    return simple_tokenize(text)
+
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@ -6,7 +6,7 @@ def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
        'it', "won't", 'fail',
    ]
    result = cld2_surface_tokenizer(text)
--- a/wordfreq_builder/tests/test_urls.py
+++ b/wordfreq_builder/tests/test_urls.py
@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import URL_RE
+from nose.tools import eq_
+
+
+def check_url(url):
+    match = URL_RE.match(url)
+    assert match
+    eq_(match.span(), (0, len(url)))
+
+
+def test_url_re():
+    # URLs like this are all over the Arabic Wikipedia. Here's one with the
+    # student ID blanked out.
+    yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx'
+
+    yield check_url, 'https://example.com/űnicode.html'
+    yield check_url, 'http://☃.net'
+
+    assert not URL_RE.match('ftp://127.0.0.1')
+
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@ -123,7 +123,6 @@ def google_books_deps(dirname_in):

 def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
                 languages):
-
    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -1,7 +1,6 @@
-from html.entities import name2codepoint
-from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
+from wordfreq import tokenize
 from ftfy.fixes import unescape_html
-import re
+import regex
 import pycld2

 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
@ -11,19 +10,22 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
        '\x0e-\x1f',
        '\x7f-\x9f',
        '\ud800-\udfff',
-        '\ufdd0-\ufdef'
+        '\ufdd0-\ufdef',
+        '\N{HANGUL FILLER}',
+        '\N{HANGUL CHOSEONG FILLER}',
+        '\N{HANGUL JUNGSEONG FILLER}'
    ] +
    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
-CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
+CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)

-TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
-TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
+TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')


 def cld2_surface_tokenizer(text):
    """
-    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
+    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
@ -35,7 +37,7 @@ def cld2_surface_tokenizer(text):

 def cld2_detect_language(text):
    """
-    Uses CLD2 to detect the language
+    Uses CLD2 to detect the language.
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
@ -52,9 +54,12 @@ def cld2_detect_language(text):

 def tokenize_twitter(in_filename, out_prefix, tokenizer):
    """
-    Process a file by running it through the given tokenizer, sorting the
-    results by the language of each line, and inserting newlines
-    to mark the token boundaries.
+    Process a file by running it through the Twitter-specific tokenizer,
+    which uses cld2 for language detection, and removes Twitter handles
+    and t.co URLs.
+
+    Produces output files that are separated by language, with newlines
+    between the tokens.
    """
    out_files = {}
    with open(in_filename, encoding='utf-8') as in_file:
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@ -6,6 +6,12 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+
+# Match common cases of URLs: the scheme http:// or https:// followed by
+# non-whitespace characters.
+URL_RE = regex.compile(r'https?://(?:\S)+')


 def count_tokens(filename):
@ -13,11 +19,13 @@ def count_tokens(filename):
    Count tokens that appear in a file, running each line through our
    simple tokenizer.

-    Unicode errors in the input data will become token boundaries.
+    URLs will be skipped, and Unicode errors will become separate tokens
+    containing '�' (U+FFFD, the replacement character).
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
+            line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1
				`@ -1 +0,0 @@`
				[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
				`@ -1 +0,0 @@`
				`[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]`
				`@ -1 +0,0 @@`
				[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ０-９Ａ-Ｚａ-ｚｦ-ￜ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]