Merge pull request #19 from LuminosoInsight/code-review-fixes-2015-07-17

Code review fixes 2015 07 17
This commit is contained in:
Rob Speer 2015-07-22 15:09:00 -04:00
commit 32102ba3c2
42 changed files with 151 additions and 158 deletions

View File

@ -23,8 +23,8 @@ install them on Ubuntu:
## Unicode data
The tokenizers used to split non-Japanese phrases use regexes built using the
`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
The tokenizers that split non-Japanese phrases utilize regexes built using the
`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
update these regexes, run `scripts/gen_regex.py`.
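As a hedged sketch of the idea behind regenerating those regexes (this is not the contents of `scripts/gen_regex.py`; the function name and the category set are illustrative), a character class can be derived from the running interpreter's `unicodedata` tables so it tracks that Python's Unicode version:

```python
# Hypothetical sketch, not the actual scripts/gen_regex.py.
import re
import sys
import unicodedata

def char_class(categories):
    """Return a regex character class covering every codepoint whose
    Unicode general category is in `categories`."""
    ranges = []
    start = None
    for cp in range(sys.maxunicode + 1):
        if unicodedata.category(chr(cp)) in categories:
            if start is None:
                start = cp
            end = cp
        elif start is not None:
            ranges.append((start, end))
            start = None
    if start is not None:
        ranges.append((start, end))
    return '[' + ''.join('\\U%08X-\\U%08X' % pair for pair in ranges) + ']'

if __name__ == '__main__':
    # Letters and decimal digits, a rough basis for a "word token" class
    token_class = char_class({'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nd'})
    re.compile(token_class + '+')   # sanity-check that the class compiles
    print(token_class[:60], '...')
```
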
## License
@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
streaming Twitter API, in accordance with Twitter's Developer Agreement &
Policy. This software only gives statistics about words that are very commonly
used on Twitter; it does not display or republish any Twitter content.

View File

@ -1,3 +1,5 @@
""" This file generates a graph of the dependencies for the ninja build."""
import sys
@ -26,4 +28,3 @@ def ninja_to_dot():
if __name__ == '__main__':
ninja_to_dot()

View File

@ -94,7 +94,7 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data, while the fake word "plan't" can't be found.
# data
eq_(tokenize("can't", 'en'), ["can't"])
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@ -135,12 +135,20 @@ def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
['متعب']
)
# Remove combining marks
eq_(
tokenize('حَرَكَات', 'ar'),
['حركات']
)
eq_(
tokenize('إﻻ', 'ar'),
['إلا']
)

View File

@ -8,6 +8,8 @@ import itertools
import pathlib
import random
import logging
import unicodedata
logger = logging.getLogger(__name__)
@ -66,11 +68,21 @@ def tokenize(text, lang):
return mecab_tokenize(text)
if lang == 'ar':
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
text = standardize_arabic(text)
return simple_tokenize(text)
def standardize_arabic(text):
"""
Standardizes Arabic text by removing tatweels and combining marks, then applying NFKC normalization.
"""
return unicodedata.normalize(
'NFKC',
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
)
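
For quick reference, this is what the new Arabic handling does on the strings from the tests above; the NFKC step is what folds the lam-alef presentation form back into separate letters:

```python
import unicodedata

# Tatweel stripping: 'متــــــــعب' -> 'متعب'
print('متــــــــعب'.replace('ـ', ''))

# NFKC normalization folds the ligature ﻻ (U+FEFB) into ل + ا,
# which is why tokenize('إﻻ', 'ar') now yields ['إلا'].
print(unicodedata.normalize('NFKC', 'إﻻ'))
```
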
def read_cBpack(filename):
"""
Read a file from an idiosyncratic format that we use for storing
@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
individual tokens.
Note that the current tokenizer does not support multi-word Chinese phrases.
"""
args = (word, lang, wordlist, minimum)
try:
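
The docstring above only promises an upper bound; the exact combination rule is not shown in this hunk. As a hedged sketch, one rule with that property (the helper name here is hypothetical) adds the reciprocals of the token frequencies:

```python
# Illustrative only: a combination rule whose result is never greater
# than any individual token's frequency, as the docstring requires.
def combined_frequency(token_freqs):
    if not token_freqs or 0.0 in token_freqs:
        return 0.0
    return 1.0 / sum(1.0 / f for f in token_freqs)

print(combined_frequency([0.01, 0.001]))   # ~0.00091, below both inputs
```
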

28 binary files changed; contents not shown.

View File

@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
ninja -v
You can copy the results into wordfreq with this command (supposing that
$WORDFREQ points to your wordfreq repo):
You can copy the results into wordfreq with this command:
cp data/dist/*.msgpack.gz ../wordfreq/data/
@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
## Data sources
### Wikipedia
Wikipedia is a "free-access, free-content Internet encyclopedia".
The dump files can be downloaded from the [Wikimedia dumps site][wikipedia].
The original files are in `data/raw-input/wikipedia`, and they're processed
by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
[wiki2text][] package.
[wikipedia]: https://dumps.wikimedia.org/backup-index.html
[wiki2text]: https://github.com/rspeer/wiki2text
### Leeds Internet Corpus
Also known as the "Web as Corpus" project, this is a University of Leeds
@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`.
The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
collected by the `ftfy.streamtester` package in 2014.
It's not possible to distribute the text of tweets. However, this process could
We are not allowed to distribute the text of tweets. However, this process could
be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
a couple of weeks.
@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their
frequency comes from "don't" and "won't". Words that turned into similarly
common words, however, were left alone: this list doesn't represent "can't"
because the word was left as "can".

View File

@ -29,12 +29,12 @@ rule split
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
command = bunzip2 -c $in | wiki2text > $out
# To tokenize Japanese, we run it through Mecab and take the first column.
# We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese
command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.
@ -49,12 +49,12 @@ rule tokenize_twitter
# Grep out the term "EOS", an indication that Leeds used MeCab and didn't
# strip out the EOS lines.
rule convert_leeds
command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
command = tr ' ' ',' < $in > $out
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
@ -65,16 +65,16 @@ rule convert_opensubtitles
# source data was already filtered to only show words in roles with at least
# two-digit counts of occurrences.)
rule convert_google_syntactic_ngrams
command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
rule count
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
command = python -m wordfreq_builder.cli.count_tokens $in $out
rule merge
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
command = python -m wordfreq_builder.cli.combine_lists -o $out $in
rule freqs2cB
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
rule cat
command = cat $in > $out

View File

@ -9,12 +9,5 @@ setup(
platforms=["any"],
description="Turns raw data into word frequency lists",
packages=['wordfreq_builder'],
install_requires=['msgpack-python', 'pycld2'],
entry_points={
'console_scripts': [
'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
]
}
install_requires=['msgpack-python', 'pycld2']
)

View File

@ -13,4 +13,3 @@ if __name__ == '__main__':
parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args()
handle_counts(args.filename_in, args.filename_out)

View File

@ -4,8 +4,8 @@ import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('language', help='language of the input file')
parser.add_argument('filename_in', help='name of input file containing tokens')
parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args()
freqs_to_cBpack(args.filename_in, args.filename_out)
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)

View File

@ -1,18 +1,13 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
import argparse
def tokenize_twitter(in_filename, out_prefix):
tokenize_file(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filename', help='filename of input file containing one tweet per line')
parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args()
tokenize_twitter(args.filename, args.outprefix)
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
if __name__ == '__main__':

View File

@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it.
TMPDIR = data_filename('tmp')
# Set this to True to rebuild the Twitter tokenization (which takes days)
TOKENIZE_TWITTER = True
def add_dep(lines, rule, input, output, extra=None, params=None):
if isinstance(output, list):
output = ' '.join(output)
@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
# The first dependency is to make sure the build file is up to date.
add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
extra='wordfreq_builder/ninja.py')
if TOKENIZE_TWITTER:
lines.extend(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
lines.extend(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
)
lines.extend(
wikipedia_deps(
data_filename('raw-input/wikipedia'),
@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages):
path_in = pathlib.Path(dirname_in)
for language in languages:
# Find the most recent file for this language
# Skip over files that do not exist
input_file = max(path_in.glob(
'{}wiki*.bz2'.format(language)
))
input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
plain_text_file = wordlist_filename('wikipedia', language, 'txt')
count_file = wordlist_filename('wikipedia', language, 'counts.txt')
add_dep(lines, 'wiki2text', input_file, plain_text_file)
if language == 'ja':
mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
mecab_token_file = wordlist_filename(
'wikipedia', language, 'mecab-tokens.txt'
)
add_dep(
lines, 'tokenize_japanese', plain_text_file, mecab_token_file
)
add_dep(lines, 'count', mecab_token_file, count_file)
else:
add_dep(lines, 'count', plain_text_file, count_file)
@ -126,17 +121,18 @@ def google_books_deps(dirname_in):
return lines
def twitter_deps(input_filename, slice_prefix,
combined_prefix, slices, languages):
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
languages):
lines = []
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
num=num)
for num in range(slices)]
# split the input into slices
add_dep(lines,
'split', input_filename, slice_files,
add_dep(lines, 'split', input_filename, slice_files,
params={'prefix': '{}.part'.format(slice_prefix),
'slices': slices})
'slices': slices})
for slicenum in range(slices):
slice_file = slice_files[slicenum]
@ -151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix,
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
language_inputs = [
'{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
'{prefix}.{lang}.txt'.format(
prefix=slice_files[slicenum], lang=language
)
for slicenum in range(slices)
]
@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix,
count_file = wordlist_filename('twitter', language, 'counts.txt')
if language == 'ja':
mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
else:
add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
mecab_token_file = wordlist_filename(
'twitter', language, 'mecab-tokens.txt')
add_dep(
lines, 'tokenize_japanese', combined_output, mecab_token_file)
combined_output = mecab_token_file
add_dep(lines, 'count', combined_output, count_file,
extra='wordfreq_builder/tokenizers.py')
return lines
@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages):
input_file = '{prefix}/{lang}.txt'.format(
prefix=dirname_in, lang=language
)
reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
reformatted_file = wordlist_filename(
'opensubtitles', language, 'counts.txt')
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
return lines
@ -205,18 +207,22 @@ def combine_lists(languages):
add_dep(lines, 'merge', input_files, output_file,
extra='wordfreq_builder/word_counts.py')
output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
output_cBpack = wordlist_filename(
'combined-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))
# Write standalone lists for Twitter frequency
if language in CONFIG['sources']['twitter']:
input_file = wordlist_filename('twitter', language, 'counts.txt')
output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
output_cBpack = wordlist_filename(
'twitter-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))

View File

@ -1,63 +1,56 @@
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
from ftfy.fixes import unescape_html
import re
import pycld2
CLD2_BAD_CHAR_RANGE = "".join([
'[',
'\x00-\x08',
'\x0b',
'\x0e-\x1f',
'\x7f-\x9f',
'\ud800-\udfff',
'\ufdd0-\ufdef'] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
[']'])
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
[
'\x00-\x08',
'\x0b',
'\x0e-\x1f',
'\x7f-\x9f',
'\ud800-\udfff',
'\ufdd0-\ufdef'
] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
)
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
def cld2_surface_tokenizer(text):
"""
Uses CLD2 to detect the language and the wordfreq tokenizer to create tokens.
"""
text = remove_handles_and_urls(text)
text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
lang = cld2_detect_language(text)
tokens = tokenize(text, lang)
return lang, tokens
def cld2_detect_language(text):
"""
Uses CLD2 to detect the language
"""
# Format of pycld2.detect:
#   (Whether the result is reliable: bool,
#    Number of bytes of text: int,
#    Detected languages, in order of certainty, as tuples of:
#      (Language name: str,
#       Language code: str,
#       Percent of text in this language: float,
#       Confidence score: float))
text = CLD2_BAD_CHARS_RE.sub('', text)
return pycld2.detect(text)[2][0][1]
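
A minimal usage sketch of `pycld2.detect`, matching the comment above (the sample sentence and the example numbers are illustrative):

```python
import pycld2

is_reliable, bytes_found, details = pycld2.detect(
    "Le renard brun saute par-dessus le chien paresseux."
)
# `details` holds up to three entries, e.g. ('FRENCH', 'fr', 99, 1024.0);
# cld2_detect_language takes the language code of the top entry.
print(is_reliable, details[0][1])
```
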
def remove_handles_and_urls(text):
text = fix_entities(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
return text
def last_tab(line):
"""
Read lines by keeping only the last tab-separated value.
"""
return line.split('\t')[-1].strip()
def lowercase_text_filter(token):
"""
If this looks like a token that we want to count, return it, lowercased.
If not, filter it out by returning None.
"""
if TOKEN_RE.search(token):
return token.lower()
else:
return None
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def tokenize_twitter(in_filename, out_prefix, tokenizer):
"""
Process a file by running it through the given tokenizer, sorting the
results by the language of each line, and inserting newlines
@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
out_files = {}
with open(in_filename, encoding='utf-8') as in_file:
for line in in_file:
text = line_reader(line)
text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text)
if language != 'un':
tokenized = '\n'.join(tokens)
@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
print(tokenized, file=out_file)
for out_file in out_files.values():
out_file.close()
ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
def fix_entities(text):
"""
Fix the few HTML entities that Twitter uses -- even if they've
already been tokenized.
"""
def replace_entity(match):
return chr(name2codepoint[match.group(1)])
return ENTITY_RE.sub(replace_entity, text)
def monolingual_tokenize_file(in_filename, out_filename, language,
tokenizer, line_reader=last_tab,
sample_proportion=1):
"""
Process a file by running it through the given tokenizer, only keeping
lines of the language we're asking for, and inserting newlines
to mark the token boundaries.
`line_reader` is applied to each line before it is given to the tokenizer.
Only the first line out of every `sample_proportion` lines is run through
the tokenizer.
"""
with open(in_filename, encoding='utf-8', errors='replace') as in_file:
with open(out_filename, 'w', encoding='utf-8') as out_file:
for i, line in enumerate(in_file):
if i % sample_proportion == 0:
text = line_reader(line)
tokens, line_language = tokenizer(text)
if line_language == language:
for token in tokens:
print(token, file=out_file)

View File

@ -1,4 +1,4 @@
from wordfreq import simple_tokenize
from wordfreq import simple_tokenize, tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@ -18,41 +18,49 @@ def count_tokens(filename):
counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile:
for line in infile:
for token in simple_tokenize(line.strip()):
for token in simple_tokenize(line):
counts[token] += 1
return counts
def read_freqs(filename, cutoff=0):
def read_freqs(filename, cutoff=0, lang=None):
"""
Read words and their frequencies from a CSV file.
Only words with a frequency greater than `cutoff` are returned.
Only words with a frequency greater than or equal to `cutoff` are returned.
If `cutoff` is greater than 0, the CSV file must be sorted by frequency
in descending order.
If `lang` is given, read_freqs will apply language-specific preprocessing
operations.
"""
raw_counts = defaultdict(float)
total = 0.
with open(filename, encoding='utf-8', newline='') as infile:
reader = csv.reader(infile)
for key, strval in reader:
val = float(strval)
if val < cutoff:
break
for token in simple_tokenize(key):
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens:
token = fix_text(token)
total += val
# Use += so that, if we give the reader concatenated files with
# duplicates, it does the right thing
raw_counts[token] += val
freqs = {key: raw_count / total
for (key, raw_count) in raw_counts.items()}
return freqs
for word in raw_counts:
raw_counts[word] /= total
return raw_counts
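
A usage sketch for the new `lang` parameter (the file name and its contents are made up for illustration); the input is the `word,count` CSV produced by the earlier build steps, sorted by descending count when a cutoff is used:

```python
from wordfreq_builder.word_counts import read_freqs

# Suppose 'counts.csv' (hypothetical) contains, sorted by count:
#   the,15034
#   of,9020
#   zyzzyva,1
freqs = read_freqs('counts.csv', cutoff=2, lang='en')
# 'zyzzyva' falls below the cutoff and is dropped; the surviving
# frequencies are normalized so that they sum to 1.0.
print(sum(freqs.values()))
```
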
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
"""
Convert a CSV file of words and their frequencies to a file in the
idiosyncratic 'cBpack' format.
@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
written to the new file.
"""
freq_cutoff = 10 ** (cutoff / 100.)
freqs = read_freqs(in_filename, freq_cutoff)
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
cBpack = []
for token, freq in freqs.items():
cB = round(math.log10(freq) * 100)
if cB >= cutoff:
neg_cB = -cB
while neg_cB >= len(cBpack):
cBpack.append([])
cBpack[neg_cB].append(token)
neg_cB = -cB
while neg_cB >= len(cBpack):
cBpack.append([])
cBpack[neg_cB].append(token)
for sublist in cBpack:
sublist.sort()
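
A worked example of the centibel bucketing above: a word whose relative frequency is about 10^-2.5 ends up in the sorted list at `cBpack[250]`.

```python
import math

freq = 0.00316                       # roughly 10 ** -2.5
cB = round(math.log10(freq) * 100)   # -250 centibels
neg_cB = -cB                         # bucket index 250
print(cB, neg_cB)
# freqs_to_cBpack appends empty lists until index 250 exists, then
# places the word in cBpack[250] alongside other words at that level.
```
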
@ -88,7 +95,7 @@ def merge_freqs(freq_dicts):
"""
vocab = set()
for freq_dict in freq_dicts:
vocab |= set(freq_dict)
vocab.update(freq_dict)
merged = defaultdict(float)
N = len(freq_dicts)