diff --git a/README.md b/README.md
index c16b7d0..28055ca 100644
--- a/README.md
+++ b/README.md
@@ -23,8 +23,8 @@ install them on Ubuntu:
 
 ## Unicode data
 
-The tokenizers used to split non-Japanese phrases use regexes built using the
-`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+The tokenizers that split non-Japanese phrases use regexes built using the
+`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
 update these regexes, run `scripts/gen_regex.py`.
 
 ## License
@@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software only gives statistics about words that are very commonly
 used on Twitter; it does not display or republish any Twitter content.
-
diff --git a/wordfreq_builder/wordfreq_builder/ninja2dot.py b/scripts/ninja2dot.py
similarity index 91%
rename from wordfreq_builder/wordfreq_builder/ninja2dot.py
rename to scripts/ninja2dot.py
index 431ac09..7a2f403 100644
--- a/wordfreq_builder/wordfreq_builder/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,3 +1,5 @@
+"""This file generates a graph of the dependencies for the ninja build."""
+
 import sys
 
 
@@ -26,4 +28,3 @@ def ninja_to_dot():
 
 if __name__ == '__main__':
     ninja_to_dot()
-
diff --git a/tests/test.py b/tests/test.py
index 59d40f8..d38fd14 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -94,7 +94,7 @@ def test_failed_cB_conversion():
 
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
-    # data, while the fake word "plan't" can't be found.
+    # data.
     eq_(tokenize("can't", 'en'), ["can't"])
 
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
         random_ascii_words(lang='zh')
 
 
 def test_ar():
+
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
+
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index a5ac0ec..cb085f7 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
+
 
 logger = logging.getLogger(__name__)
 
@@ -66,11 +68,21 @@ def tokenize(text, lang):
         return mecab_tokenize(text)
 
     if lang == 'ar':
-        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+        text = standardize_arabic(text)
 
     return simple_tokenize(text)
 
 
+def standardize_arabic(text):
+    """
+    Standardizes Arabic text by removing combining marks and tatweels.
+    """
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
+
+
 def read_cBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
@@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
     individual tokens.
+
+    Note that the current tokenizer does not support multi-word Chinese
+    phrases.
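Aside on the Arabic changes above: the sketch below re-creates `standardize_arabic` outside the package, to show why the new `test_ar` cases pass. It is illustrative only; the character class used here is a simplified stand-in for wordfreq's `COMBINING_MARK_RE`, which covers a broader range of marks.

```python
# Hypothetical, self-contained re-creation of standardize_arabic() for
# illustration; the regex below is a simplified stand-in for wordfreq's
# COMBINING_MARK_RE.
import re
import unicodedata

COMBINING_MARK_RE = re.compile('[\u064b-\u065f\u0670]')  # harakat and dagger alif
TATWEEL = '\u0640'  # the Arabic elongation character 'ـ'

def standardize_arabic(text):
    # Strip tatweels, drop combining marks, then apply NFKC so presentation
    # forms such as 'ﻻ' decompose into the two letters 'لا'.
    return unicodedata.normalize(
        'NFKC',
        COMBINING_MARK_RE.sub('', text.replace(TATWEEL, ''))
    )

print(standardize_arabic('متــــــــعب'))  # -> 'متعب'   (tatweels removed)
print(standardize_arabic('حَرَكَات'))        # -> 'حركات'  (combining marks removed)
print(standardize_arabic('إﻻ'))            # -> 'إلا'    (ligature normalized)
```

The NFKC step is what the third new test case exercises: the lam-alef ligature is folded into its two component letters.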
""" args = (word, lang, wordlist, minimum) try: diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz index db40525..1b059e2 100644 Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz index 5e5d05a..0e89465 100644 Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz index 4f1987b..167acb0 100644 Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz index 38c13a4..fa68552 100644 Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz index 60993aa..dfc7f80 100644 Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz index 370c499..fc63301 100644 Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz index 610c9b6..3989727 100644 Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz index c3c2c21..5830417 100644 Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz index 3d5797a..fbaa41f 100644 Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz index 7d44281..2c2db53 100644 Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz index e7d628a..93e251c 100644 Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz index 12ef7a2..db62dde 100644 Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz index 3c1db27..a198a81 100644 Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz index ff925e2..3a0f9c4 100644 Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz index 6d99f38..0f89563 100644 Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz index 20939f9..eb9291a 100644 Binary files a/wordfreq/data/twitter_ar.msgpack.gz and 
b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index 1329391..b943dee 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index 8017c56..956487e 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index 936ec75..56b253d 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index e41589a..49f8ef8 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index b2bc598..5ad7439 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index 5301ed7..d5c1175 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index 74f33d5..3136f18 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index 63735be..e88a6f8 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 83d2b57..0497311 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index b8d2281..0542cf2 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index 348d5a1..5b68d15 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 9082723..deec3aa 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 18241a1..a17c504 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
 
     ninja -v
 
-You can copy the results into wordfreq with this command (supposing that
-$WORDFREQ points to your wordfreq repo):
+You can copy the results into wordfreq with this command:
 
     cp data/dist/*.msgpack.gz ../wordfreq/data/
 
@@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
 
 ## Data sources
 
+### Wikipedia
+
+Wikipedia is a "free-access, free-content Internet encyclopedia".
+
+The dump files can be downloaded from the [Wikimedia dumps site][wikipedia].
+
+The original files are in `data/raw-input/wikipedia`, and they're processed
+by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
+[wiki2text][] package.
+ +[wikipedia]: https://dumps.wikimedia.org/backup-index.html +[wiki2text]: https://github.com/rspeer/wiki2text + ### Leeds Internet Corpus Also known as the "Web as Corpus" project, this is a University of Leeds @@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`. The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets collected by the `ftfy.streamtester` package in 2014. -It's not possible to distribute the text of tweets. However, this process could +We are not allowed to distribute the text of tweets. However, this process could be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for a couple of weeks. @@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their frequency comes from "don't" and "won't". Words that turned into similarly common words, however, were left alone: this list doesn't represent "can't" because the word was left as "can". - diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index d693f52..b708533 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -29,12 +29,12 @@ rule split # Wikipedia dumps obtained from dumps.wikimedia.org. The code is at # https://github.com/rspeer/wiki2text. rule wiki2text - command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out + command = bunzip2 -c $in | wiki2text > $out # To tokenize Japanese, we run it through Mecab and take the first column. # We don't have a plan for tokenizing Chinese yet. rule tokenize_japanese - command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out + command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out # Tokenizing text from Twitter requires us to language-detect and tokenize # in the same step. @@ -49,12 +49,12 @@ rule tokenize_twitter # Grep out the term "EOS", an indication that Leeds used MeCab and didn't # strip out the EOS lines. rule convert_leeds - command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out + command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out # To convert the OpenSubtitles frequency data, simply replace spaces with # commas. rule convert_opensubtitles - command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out + command = tr ' ' ',' < $in > $out # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all # the input files, keep only the single words and their counts, and only keep @@ -65,16 +65,16 @@ rule convert_opensubtitles # source data was already filtered to only show words in roles with at least # two-digit counts of occurences.) 
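Side note on the `convert_leeds` rule above: the sed/grep pipeline is easier to read as Python. This is an illustrative re-implementation, not part of the build, and it assumes each Leeds input line has the form `<number> <frequency> <word>` (which is what the sed capture groups imply):

```python
# Illustrative Python equivalent of the convert_leeds rule, assuming each
# input line looks like "<number> <frequency> <word>".
import re
import sys

LEEDS_LINE_RE = re.compile(r'([0-9]+) ([0-9.]+) (.*)')

def convert_leeds(in_file, out_file):
    for line in in_file:
        match = LEEDS_LINE_RE.match(line.strip())
        if not match:
            continue            # like `sed -n ... p`, keep only matching lines
        _num, freq, word = match.groups()
        if word == 'EOS':       # like `grep -v 'EOS,'`, drop MeCab's EOS markers
            continue
        print('{},{}'.format(word, freq), file=out_file)

if __name__ == '__main__':
    convert_leeds(sys.stdin, sys.stdout)
```

The output is the same `word,frequency` CSV shape that the other `convert_*` rules produce, which is what the `count` and `merge` steps expect downstream.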
rule convert_google_syntactic_ngrams - command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out + command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out rule count - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out + command = python -m wordfreq_builder.cli.count_tokens $in $out rule merge - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + command = python -m wordfreq_builder.cli.combine_lists -o $out $in rule freqs2cB - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out + command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index c7232cc..6f0a438 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,12 +9,5 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], - install_requires=['msgpack-python', 'pycld2'], - entry_points={ - 'console_scripts': [ - 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', - 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', - 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' - ] - } + install_requires=['msgpack-python', 'pycld2'] ) diff --git a/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py rename to wordfreq_builder/tests/test_tokenizer.py diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py index 4aeba5b..56b93cb 100644 --- a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -13,4 +13,3 @@ if __name__ == '__main__': parser.add_argument('filename_out', help='name of output file') args = parser.parse_args() handle_counts(args.filename_in, args.filename_out) - diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py index 6bf3957..9d0b1dc 100644 --- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py @@ -4,8 +4,8 @@ import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('language', help='language of the input file') parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_out', help='name of output file') args = parser.parse_args() - freqs_to_cBpack(args.filename_in, args.filename_out) - + freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language) diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py index df2cb6b..879caa4 100644 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py @@ -1,18 +1,13 @@ -from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file +from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter import argparse -def tokenize_twitter(in_filename, out_prefix): - tokenize_file(in_filename, out_prefix, - tokenizer=cld2_surface_tokenizer) - - def main(): parser = 
argparse.ArgumentParser() parser.add_argument('filename', help='filename of input file containing one tweet per line') parser.add_argument('outprefix', help='prefix of output filenames') args = parser.parse_args() - tokenize_twitter(args.filename, args.outprefix) + tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer) if __name__ == '__main__': diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ec59716..fa937cd 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it. TMPDIR = data_filename('tmp') -# Set this to True to rebuild the Twitter tokenization (which takes days) -TOKENIZE_TWITTER = True - - def add_dep(lines, rule, input, output, extra=None, params=None): if isinstance(output, list): output = ' '.join(output) @@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout): # The first dependency is to make sure the build file is up to date. add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja', extra='wordfreq_builder/ninja.py') - - if TOKENIZE_TWITTER: - lines.extend( - twitter_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), - slices=40, - languages=CONFIG['sources']['twitter'] - ) + lines.extend( + twitter_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] ) + ) lines.extend( wikipedia_deps( data_filename('raw-input/wikipedia'), @@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages): path_in = pathlib.Path(dirname_in) for language in languages: # Find the most recent file for this language - # Skip over files that do not exist - input_file = max(path_in.glob( - '{}wiki*.bz2'.format(language) - )) + input_file = max(path_in.glob('{}wiki*.bz2'.format(language))) plain_text_file = wordlist_filename('wikipedia', language, 'txt') count_file = wordlist_filename('wikipedia', language, 'counts.txt') add_dep(lines, 'wiki2text', input_file, plain_text_file) if language == 'ja': - mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt') - add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file) + mecab_token_file = wordlist_filename( + 'wikipedia', language, 'mecab-tokens.txt' + ) + add_dep( + lines, 'tokenize_japanese', plain_text_file, mecab_token_file + ) add_dep(lines, 'count', mecab_token_file, count_file) else: add_dep(lines, 'count', plain_text_file, count_file) @@ -126,17 +121,18 @@ def google_books_deps(dirname_in): return lines -def twitter_deps(input_filename, slice_prefix, - combined_prefix, slices, languages): +def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, + languages): + lines = [] - slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, + num=num) for num in range(slices)] # split the input into slices - add_dep(lines, - 'split', input_filename, slice_files, + add_dep(lines, 'split', input_filename, slice_files, params={'prefix': '{}.part'.format(slice_prefix), - 'slices': slices}) + 'slices': slices}) for slicenum in range(slices): slice_file = slice_files[slicenum] @@ 
-151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix, combined_output = wordlist_filename('twitter', language, 'tokens.txt') language_inputs = [ - '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) + '{prefix}.{lang}.txt'.format( + prefix=slice_files[slicenum], lang=language + ) for slicenum in range(slices) ] @@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix, count_file = wordlist_filename('twitter', language, 'counts.txt') if language == 'ja': - mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt') - add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file) - add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py') - else: - add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py') + mecab_token_file = wordlist_filename( + 'twitter', language, 'mecab-tokens.txt') + add_dep( + lines, 'tokenize_japanese', combined_output, mecab_token_file) + combined_output = mecab_token_file + + add_dep(lines, 'count', combined_output, count_file, + extra='wordfreq_builder/tokenizers.py') return lines @@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages): input_file = '{prefix}/{lang}.txt'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') + reformatted_file = wordlist_filename( + 'opensubtitles', language, 'counts.txt') add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines @@ -205,18 +207,22 @@ def combine_lists(languages): add_dep(lines, 'merge', input_files, output_file, extra='wordfreq_builder/word_counts.py') - output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz') + output_cBpack = wordlist_filename( + 'combined-dist', language, 'msgpack.gz') add_dep(lines, 'freqs2cB', output_file, output_cBpack, - extra='wordfreq_builder/word_counts.py') + extra='wordfreq_builder/word_counts.py', + params={'lang': language}) lines.append('default {}'.format(output_cBpack)) # Write standalone lists for Twitter frequency if language in CONFIG['sources']['twitter']: input_file = wordlist_filename('twitter', language, 'counts.txt') - output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz') + output_cBpack = wordlist_filename( + 'twitter-dist', language, 'msgpack.gz') add_dep(lines, 'freqs2cB', input_file, output_cBpack, - extra='wordfreq_builder/word_counts.py') + extra='wordfreq_builder/word_counts.py', + params={'lang': language}) lines.append('default {}'.format(output_cBpack)) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 733191d..5815292 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,63 +1,56 @@ from html.entities import name2codepoint from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE +from ftfy.fixes import unescape_html import re import pycld2 -CLD2_BAD_CHAR_RANGE = "".join([ - '[', - '\x00-\x08', - '\x0b', - '\x0e-\x1f', - '\x7f-\x9f', - '\ud800-\udfff', - '\ufdd0-\ufdef'] + - [chr(65534+65536*x+y) for x in range(17) for y in range(2)] + - [']']) +CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( + [ + '\x00-\x08', + '\x0b', + '\x0e-\x1f', + '\x7f-\x9f', + '\ud800-\udfff', + '\ufdd0-\ufdef' + ] + + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +) CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE) TWITTER_HANDLE_RE = 
re.compile('@{0}+'.format(NON_PUNCT_RANGE)) -TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE)) +TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+') def cld2_surface_tokenizer(text): """ Uses CLD2 to detect the language and wordfreq tokenizer to create tokens """ - text = remove_handles_and_urls(text) + text = unescape_html(text) + text = TWITTER_HANDLE_RE.sub('', text) + text = TCO_RE.sub('', text) lang = cld2_detect_language(text) tokens = tokenize(text, lang) return lang, tokens + def cld2_detect_language(text): """ Uses CLD2 to detect the language """ + # Format of pycld2.detect: + # (Confident in result: bool, + # Number of bytes of text: Int, + # Triples of detected languages in order of certainty: + # (Language name: str, + # Language code: str + # Percent of text in this language: float + # Confidence score: float)) + text = CLD2_BAD_CHARS_RE.sub('', text) return pycld2.detect(text)[2][0][1] -def remove_handles_and_urls(text): - text = fix_entities(text) - text = TWITTER_HANDLE_RE.sub('', text) - text = TCO_RE.sub('', text) - return text -def last_tab(line): - """ - Read lines by keeping only the last tab-separated value. - """ - return line.split('\t')[-1].strip() - -def lowercase_text_filter(token): - """ - If this looks like a token that we want to count, return it, lowercased. - If not, filter it out by returning None. - """ - if TOKEN_RE.search(token): - return token.lower() - else: - return None - -def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): +def tokenize_twitter(in_filename, out_prefix, tokenizer): """ Process a file by running it through the given tokenizer, sorting the results by the language of each line, and inserting newlines @@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_files = {} with open(in_filename, encoding='utf-8') as in_file: for line in in_file: - text = line_reader(line) + text = line.split('\t')[-1].strip() language, tokens = tokenizer(text) if language != 'un': tokenized = '\n'.join(tokens) @@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): print(tokenized, file=out_file) for out_file in out_files.values(): out_file.close() - -ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') - -def fix_entities(text): - """ - Fix the few HTML entities that Twitter uses -- even if they've - already been tokenized. - """ - def replace_entity(match): - return chr(name2codepoint[match.group(1)]) - return ENTITY_RE.sub(replace_entity, text) - -def monolingual_tokenize_file(in_filename, out_filename, language, - tokenizer, line_reader=last_tab, - sample_proportion=1): - """ - Process a file by running it through the given tokenizer, only keeping - lines of the language we're asking for, and inserting newlines - to mark the token boundaries. - - `line_reader` is applied to each line before it given to the tokenizer - - Only the first line out of every `sample_proportion` lines are run through - then tokenizer. 
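A quick usage note on the `pycld2.detect` call documented above. The tuple structure comes from the comment in the patch; the concrete values below are illustrative only:

```python
# Example of indexing pycld2.detect()'s return value the way
# cld2_detect_language does; the numbers shown are illustrative.
import pycld2

is_reliable, text_bytes, details = pycld2.detect("This is a short English sentence.")
# `details` holds (languageName, languageCode, percent, score) tuples in
# decreasing order of certainty, e.g.:
#   (('ENGLISH', 'en', 97, 1024.0), ('Unknown', 'un', 0, 0.0), ...)
print(details[0][1])   # 'en' -- the code 'un' means the language is unknown
```

This is why `tokenize_twitter` skips lines whose detected language is `'un'`: those are lines CLD2 could not classify.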
- """ - with open(in_filename, encoding='utf-8', errors='replace') as in_file: - with open(out_filename, 'w', encoding='utf-8') as out_file: - for i, line in enumerate(in_file): - if i % sample_proportion == 0: - text = line_reader(line) - tokens, line_language = tokenizer(text) - if line_language == language: - for token in tokens: - print(token, file=out_file) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 8f4099c..d7ace2d 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -1,4 +1,4 @@ -from wordfreq import simple_tokenize +from wordfreq import simple_tokenize, tokenize from collections import defaultdict from operator import itemgetter from ftfy import fix_text @@ -18,41 +18,49 @@ def count_tokens(filename): counts = defaultdict(int) with open(filename, encoding='utf-8', errors='replace') as infile: for line in infile: - for token in simple_tokenize(line.strip()): + for token in simple_tokenize(line): counts[token] += 1 + return counts -def read_freqs(filename, cutoff=0): +def read_freqs(filename, cutoff=0, lang=None): """ Read words and their frequencies from a CSV file. - Only words with a frequency greater than `cutoff` are returned. + Only words with a frequency greater than or equal to `cutoff` are returned. If `cutoff` is greater than 0, the csv file must be sorted by frequency in descending order. + + If lang is given, read_freqs will apply language specific preprocessing + operations. """ raw_counts = defaultdict(float) total = 0. with open(filename, encoding='utf-8', newline='') as infile: reader = csv.reader(infile) for key, strval in reader: + val = float(strval) if val < cutoff: break - for token in simple_tokenize(key): + + tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) + for token in tokens: token = fix_text(token) total += val # Use += so that, if we give the reader concatenated files with # duplicates, it does the right thing raw_counts[token] += val - freqs = {key: raw_count / total - for (key, raw_count) in raw_counts.items()} - return freqs + for word in raw_counts: + raw_counts[word] /= total + + return raw_counts -def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): +def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None): """ Convert a csv file of words and their frequencies to a file in the idiosyncratic 'cBpack' format. @@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): written to the new file. """ freq_cutoff = 10 ** (cutoff / 100.) - freqs = read_freqs(in_filename, freq_cutoff) + freqs = read_freqs(in_filename, freq_cutoff, lang=lang) cBpack = [] for token, freq in freqs.items(): cB = round(math.log10(freq) * 100) - if cB >= cutoff: - neg_cB = -cB - while neg_cB >= len(cBpack): - cBpack.append([]) - cBpack[neg_cB].append(token) + neg_cB = -cB + while neg_cB >= len(cBpack): + cBpack.append([]) + cBpack[neg_cB].append(token) for sublist in cBpack: sublist.sort() @@ -88,7 +95,7 @@ def merge_freqs(freq_dicts): """ vocab = set() for freq_dict in freq_dicts: - vocab |= set(freq_dict) + vocab.update(freq_dict) merged = defaultdict(float) N = len(freq_dicts)
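A closing note on the cBpack change in `freqs_to_cBpack` above: word frequencies are stored on a log scale in centibels (`round(100 * log10(freq))`), and each word is filed into a list indexed by the negation of that value. Below is a minimal sketch of just the bucketing step, assuming words below the frequency cutoff were already dropped when the frequencies were read (as `read_freqs` now does); the header, msgpack serialization, and gzip container are left out:

```python
# Minimal sketch of the centibel bucketing done by freqs_to_cBpack above.
# Assumes the frequency cutoff was already applied while reading the list.
import math

def bucket_by_centibels(freqs):
    """
    Convert {word: frequency} into a list of lists, where list i holds the
    words whose frequency rounds to -i centibels (cB = 100 * log10(freq)).
    """
    cBpack = []
    for token, freq in freqs.items():
        neg_cB = -round(math.log10(freq) * 100)
        while neg_cB >= len(cBpack):
            cBpack.append([])   # pad with empty buckets up to this cB value
        cBpack[neg_cB].append(token)
    for sublist in cBpack:
        sublist.sort()
    return cBpack

# A frequency of 0.01 is -200 cB, so 'the' ends up in bucket 200:
buckets = bucket_by_centibels({'the': 0.01, 'word': 0.001})
print(len(buckets) - 1, buckets[200])   # 300 ['the']
```

Decoding reverses this: a word found in bucket `i` is assigned an approximate frequency of `10 ** (-i / 100)`.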