Merge pull request #19 from LuminosoInsight/code-review-fixes-2015-07-17

Code review fixes 2015 07 17
Rob Speer 2015-07-22 15:09:00 -04:00
commit 32102ba3c2
42 changed files with 151 additions and 158 deletions

View File

@@ -23,8 +23,8 @@ install them on Ubuntu:
 ## Unicode data
-The tokenizers used to split non-Japanese phrases use regexes built using the
-`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+The tokenizers that split non-Japanese phrases utilize regexes built using the
+`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
 update these regexes, run `scripts/gen_regex.py`.
 ## License
@@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software only gives statistics about words that are very commonly
 used on Twitter; it does not display or republish any Twitter content.

View File

@@ -1,3 +1,5 @@
+""" This file generates a graph of the dependencies for the ninja build."""
 import sys
@@ -26,4 +28,3 @@ def ninja_to_dot():
 if __name__ == '__main__':
     ninja_to_dot()

View File

@@ -94,7 +94,7 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
-    # data, while the fake word "plan't" can't be found.
+    # data
     eq_(tokenize("can't", 'en'), ["can't"])
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 def test_ar():
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )

View File

@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
 logger = logging.getLogger(__name__)
@@ -66,11 +68,21 @@ def tokenize(text, lang):
         return mecab_tokenize(text)
     if lang == 'ar':
-        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+        text = standardize_arabic(text)
     return simple_tokenize(text)
+def standardize_arabic(text):
+    """
+    Standardizes arabic text by removing combining marks and tatweels.
+    """
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
 def read_cBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
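For reference, a minimal standalone sketch of what this standardization does. The real `COMBINING_MARK_RE` is defined elsewhere in wordfreq; the character range below is only an approximation, and the example strings come from the tests in this commit:

    import re
    import unicodedata

    # Rough stand-in for wordfreq's COMBINING_MARK_RE: common Arabic
    # diacritics (fathatan through sukun, plus superscript alef).
    COMBINING_MARK_RE = re.compile('[\u064b-\u0652\u0670]')

    def standardize_arabic(text):
        # Drop tatweels (U+0640), drop combining marks, then NFKC-normalize
        # so presentation forms such as the lam-alef ligature decompose.
        return unicodedata.normalize(
            'NFKC', COMBINING_MARK_RE.sub('', text.replace('\u0640', ''))
        )

    print(standardize_arabic('متــــــــعب'))  # 'متعب'  (tatweels removed)
    print(standardize_arabic('حَرَكَات'))        # 'حركات' (diacritics removed)
    print(standardize_arabic('إﻻ'))            # 'إلا'   (ligature decomposed)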
@@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
     individual tokens.
+    It should be noted that the current tokenizer does not support
+    multi-word Chinese phrases.
     """
     args = (word, lang, wordlist, minimum)
     try:
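For context, this is how the docstring's guarantee plays out in a call; the example phrase is mine, not taken from the commit:

    from wordfreq import word_frequency

    # A phrase that tokenizes into several words gets a smoothed estimate
    # that never exceeds the frequency of any one of its tokens.
    print(word_frequency('new york', 'en'))
    print(word_frequency('york', 'en'))  # at least as large as the line above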

28 binary files not shown.

View File

@@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
     ninja -v
-You can copy the results into wordfreq with this command (supposing that
-$WORDFREQ points to your wordfreq repo):
+You can copy the results into wordfreq with this command:
     cp data/dist/*.msgpack.gz ../wordfreq/data/
@@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
 ## Data sources
+### Wikipedia
+Wikipedia is a "free-access, free-content Internet encyclopedia".
+These files can be downloaded from [wikimedia dump][wikipedia]
+The original files are in `data/raw-input/wikipedia`, and they're processed
+by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
+[wiki2text][] package.
+[wikipedia]: https://dumps.wikimedia.org/backup-index.html
+[wiki2text]: https://github.com/rspeer/wiki2text
 ### Leeds Internet Corpus
 Also known as the "Web as Corpus" project, this is a University of Leeds
@@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`.
 The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
 collected by the `ftfy.streamtester` package in 2014.
-It's not possible to distribute the text of tweets. However, this process could
+We are not allowed to distribute the text of tweets. However, this process could
 be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
 a couple of weeks.
@@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their
 frequency comes from "don't" and "won't". Words that turned into similarly
 common words, however, were left alone: this list doesn't represent "can't"
 because the word was left as "can".

View File

@@ -29,12 +29,12 @@ rule split
 # Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
 # https://github.com/rspeer/wiki2text.
 rule wiki2text
-  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
+  command = bunzip2 -c $in | wiki2text > $out
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
-  command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
+  command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
@@ -49,12 +49,12 @@ rule tokenize_twitter
 # Grep out the term "EOS", an indication that Leeds used MeCab and didn't
 # strip out the EOS lines.
 rule convert_leeds
-  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
+  command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
 # To convert the OpenSubtitles frequency data, simply replace spaces with
 # commas.
 rule convert_opensubtitles
-  command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
+  command = tr ' ' ',' < $in > $out
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
@@ -65,16 +65,16 @@ rule convert_opensubtitles
 # source data was already filtered to only show words in roles with at least
 # two-digit counts of occurences.)
 rule convert_google_syntactic_ngrams
-  command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
+  command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
 rule count
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
+  command = python -m wordfreq_builder.cli.count_tokens $in $out
 rule merge
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
+  command = python -m wordfreq_builder.cli.combine_lists -o $out $in
 rule freqs2cB
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
+  command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
 rule cat
   command = cat $in > $out
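The `$lang` variable added to the `freqs2cB` rule is supplied per build statement by the generator in `wordfreq_builder/ninja.py` (see the `params={'lang': language}` change later in this commit). A rough sketch, with made-up file paths, of the text such a parameterized build statement comes out as:

    # Not the builder's actual code: a toy illustration of a ninja build
    # statement that binds $lang for the freqs2cB rule.
    def build_statement(rule, input, output, params):
        lines = ['build {}: {} {}'.format(output, rule, input)]
        lines += ['  {} = {}'.format(key, value) for key, value in params.items()]
        return '\n'.join(lines)

    print(build_statement('freqs2cB',
                          'data/generated/combined/en.csv',
                          'data/dist/combined_en.msgpack.gz',
                          {'lang': 'en'}))
    # build data/dist/combined_en.msgpack.gz: freqs2cB data/generated/combined/en.csv
    #   lang = en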

View File

@@ -9,12 +9,5 @@ setup(
     platforms=["any"],
     description="Turns raw data into word frequency lists",
     packages=['wordfreq_builder'],
-    install_requires=['msgpack-python', 'pycld2'],
-    entry_points={
-        'console_scripts': [
-            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
-            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
-            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
-        ]
-    }
+    install_requires=['msgpack-python', 'pycld2']
 )

View File

@@ -13,4 +13,3 @@ if __name__ == '__main__':
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
     handle_counts(args.filename_in, args.filename_out)

View File

@@ -4,8 +4,8 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
+    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)

View File

@@ -1,18 +1,13 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
 import argparse
-def tokenize_twitter(in_filename, out_prefix):
-    tokenize_file(in_filename, out_prefix,
-                  tokenizer=cld2_surface_tokenizer)
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix)
+    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 if __name__ == '__main__':
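In other words, the helper that used to live in this script is now imported from `wordfreq_builder.tokenizers`, and calling it directly is equivalent to running this CLI; the file paths below are hypothetical:

    from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter

    # Sorts tweets by detected language, writing one output file per
    # language under the given prefix.
    tokenize_twitter('data/raw-input/twitter/all-2014.txt',
                     'data/slices/twitter/tweets-2014',
                     tokenizer=cld2_surface_tokenizer)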

View File

@@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it.
 TMPDIR = data_filename('tmp')
-# Set this to True to rebuild the Twitter tokenization (which takes days)
-TOKENIZE_TWITTER = True
 def add_dep(lines, rule, input, output, extra=None, params=None):
     if isinstance(output, list):
         output = ' '.join(output)
@@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
-    if TOKENIZE_TWITTER:
-        lines.extend(
-            twitter_deps(
-                data_filename('raw-input/twitter/all-2014.txt'),
-                slice_prefix=data_filename('slices/twitter/tweets-2014'),
-                combined_prefix=data_filename('generated/twitter/tweets-2014'),
-                slices=40,
-                languages=CONFIG['sources']['twitter']
-            )
-        )
+    lines.extend(
+        twitter_deps(
+            data_filename('raw-input/twitter/all-2014.txt'),
+            slice_prefix=data_filename('slices/twitter/tweets-2014'),
+            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            slices=40,
+            languages=CONFIG['sources']['twitter']
+        )
+    )
     lines.extend(
         wikipedia_deps(
             data_filename('raw-input/wikipedia'),
@@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages):
     path_in = pathlib.Path(dirname_in)
     for language in languages:
         # Find the most recent file for this language
-        # Skip over files that do not exist
-        input_file = max(path_in.glob(
-            '{}wiki*.bz2'.format(language)
-        ))
+        input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
         plain_text_file = wordlist_filename('wikipedia', language, 'txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
         add_dep(lines, 'wiki2text', input_file, plain_text_file)
         if language == 'ja':
-            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
+            mecab_token_file = wordlist_filename(
+                'wikipedia', language, 'mecab-tokens.txt'
+            )
+            add_dep(
+                lines, 'tokenize_japanese', plain_text_file, mecab_token_file
+            )
             add_dep(lines, 'count', mecab_token_file, count_file)
         else:
             add_dep(lines, 'count', plain_text_file, count_file)
@@ -126,17 +121,18 @@ def google_books_deps(dirname_in):
     return lines
-def twitter_deps(input_filename, slice_prefix,
-                 combined_prefix, slices, languages):
+def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
+                 languages):
     lines = []
-    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
+    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
+                                                    num=num)
                    for num in range(slices)]
     # split the input into slices
-    add_dep(lines,
-            'split', input_filename, slice_files,
+    add_dep(lines, 'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})
     for slicenum in range(slices):
         slice_file = slice_files[slicenum]
@@ -151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix,
         combined_output = wordlist_filename('twitter', language, 'tokens.txt')
         language_inputs = [
-            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
+            '{prefix}.{lang}.txt'.format(
+                prefix=slice_files[slicenum], lang=language
+            )
             for slicenum in range(slices)
         ]
@@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix,
         count_file = wordlist_filename('twitter', language, 'counts.txt')
         if language == 'ja':
-            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-        else:
-            add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
+            mecab_token_file = wordlist_filename(
+                'twitter', language, 'mecab-tokens.txt')
+            add_dep(
+                lines, 'tokenize_japanese', combined_output, mecab_token_file)
+            combined_output = mecab_token_file
+        add_dep(lines, 'count', combined_output, count_file,
+                extra='wordfreq_builder/tokenizers.py')
     return lines
@@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages):
         input_file = '{prefix}/{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
+        reformatted_file = wordlist_filename(
+            'opensubtitles', language, 'counts.txt')
         add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
     return lines
@@ -205,18 +207,22 @@ def combine_lists(languages):
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py')
-        output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
+        output_cBpack = wordlist_filename(
+            'combined-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
         lines.append('default {}'.format(output_cBpack))
         # Write standalone lists for Twitter frequency
         if language in CONFIG['sources']['twitter']:
            input_file = wordlist_filename('twitter', language, 'counts.txt')
-            output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
+            output_cBpack = wordlist_filename(
+                'twitter-dist', language, 'msgpack.gz')
            add_dep(lines, 'freqs2cB', input_file, output_cBpack,
-                    extra='wordfreq_builder/word_counts.py')
+                    extra='wordfreq_builder/word_counts.py',
+                    params={'lang': language})
            lines.append('default {}'.format(output_cBpack))

View File

@@ -1,63 +1,56 @@
 from html.entities import name2codepoint
 from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
+from ftfy.fixes import unescape_html
 import re
 import pycld2
-CLD2_BAD_CHAR_RANGE = "".join([
-    '[',
+CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
+    [
     '\x00-\x08',
     '\x0b',
     '\x0e-\x1f',
     '\x7f-\x9f',
     '\ud800-\udfff',
-    '\ufdd0-\ufdef'] +
-    [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
-    [']'])
+    '\ufdd0-\ufdef'
+    ] +
+    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
+)
 CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
 TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
-TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE))
+TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
 def cld2_surface_tokenizer(text):
     """
     Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
     """
-    text = remove_handles_and_urls(text)
+    text = unescape_html(text)
+    text = TWITTER_HANDLE_RE.sub('', text)
+    text = TCO_RE.sub('', text)
     lang = cld2_detect_language(text)
     tokens = tokenize(text, lang)
     return lang, tokens
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language
     """
+    # Format of pycld2.detect:
+    #    (Confident in result: bool,
+    #    Number of bytes of text: Int,
+    #    Triples of detected languages in order of certainty:
+    #        (Language name: str,
+    #        Language code: str
+    #        Percent of text in this language: float
+    #        Confidence score: float))
     text = CLD2_BAD_CHARS_RE.sub('', text)
     return pycld2.detect(text)[2][0][1]
-def remove_handles_and_urls(text):
-    text = fix_entities(text)
-    text = TWITTER_HANDLE_RE.sub('', text)
-    text = TCO_RE.sub('', text)
-    return text
-def last_tab(line):
-    """
-    Read lines by keeping only the last tab-separated value.
-    """
-    return line.split('\t')[-1].strip()
-def lowercase_text_filter(token):
-    """
-    If this looks like a token that we want to count, return it, lowercased.
-    If not, filter it out by returning None.
-    """
-    if TOKEN_RE.search(token):
-        return token.lower()
-    else:
-        return None
-def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
+def tokenize_twitter(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through the given tokenizer, sorting the
     results by the language of each line, and inserting newlines
@@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
     out_files = {}
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
-            text = line_reader(line)
+            text = line.split('\t')[-1].strip()
             language, tokens = tokenizer(text)
             if language != 'un':
                 tokenized = '\n'.join(tokens)
@@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
                 print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
-ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
-def fix_entities(text):
-    """
-    Fix the few HTML entities that Twitter uses -- even if they've
-    already been tokenized.
-    """
-    def replace_entity(match):
-        return chr(name2codepoint[match.group(1)])
-    return ENTITY_RE.sub(replace_entity, text)
-def monolingual_tokenize_file(in_filename, out_filename, language,
-                              tokenizer, line_reader=last_tab,
-                              sample_proportion=1):
-    """
-    Process a file by running it through the given tokenizer, only keeping
-    lines of the language we're asking for, and inserting newlines
-    to mark the token boundaries.
-    `line_reader` is applied to each line before it given to the tokenizer
-    Only the first line out of every `sample_proportion` lines are run through
-    then tokenizer.
-    """
-    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
-        with open(out_filename, 'w', encoding='utf-8') as out_file:
-            for i, line in enumerate(in_file):
-                if i % sample_proportion == 0:
-                    text = line_reader(line)
-                    tokens, line_language = tokenizer(text)
-                    if line_language == language:
-                        for token in tokens:
-                            print(token, file=out_file)
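For context, a small hedged example of how the reworked tokenizer entry point behaves end to end; the tweet text is invented, and the exact token output depends on wordfreq's tokenizer:

    from wordfreq_builder.tokenizers import cld2_surface_tokenizer

    # HTML entities are unescaped, @handles and t.co links are stripped,
    # then the language is detected with CLD2 and the text is tokenized
    # by wordfreq.
    lang, tokens = cld2_surface_tokenizer(
        "@somebody I can&#39;t believe it&amp;s out http://t.co/abcde"
    )
    print(lang)    # a CLD2 language code such as 'en'
    print(tokens)  # wordfreq tokens with the handle and URL removed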

View File

@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize
+from wordfreq import simple_tokenize, tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -18,41 +18,49 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
-            for token in simple_tokenize(line.strip()):
+            for token in simple_tokenize(line):
                 counts[token] += 1
     return counts
-def read_freqs(filename, cutoff=0):
+def read_freqs(filename, cutoff=0, lang=None):
     """
     Read words and their frequencies from a CSV file.
-    Only words with a frequency greater than `cutoff` are returned.
+    Only words with a frequency greater than or equal to `cutoff` are returned.
     If `cutoff` is greater than 0, the csv file must be sorted by frequency
     in descending order.
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
     """
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         reader = csv.reader(infile)
         for key, strval in reader:
             val = float(strval)
             if val < cutoff:
                 break
-            for token in simple_tokenize(key):
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
                 raw_counts[token] += val
-    freqs = {key: raw_count / total
-             for (key, raw_count) in raw_counts.items()}
-    return freqs
+    for word in raw_counts:
+        raw_counts[word] /= total
+    return raw_counts
-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
     written to the new file.
     """
     freq_cutoff = 10 ** (cutoff / 100.)
-    freqs = read_freqs(in_filename, freq_cutoff)
+    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)
-        if cB >= cutoff:
-            neg_cB = -cB
-            while neg_cB >= len(cBpack):
-                cBpack.append([])
-            cBpack[neg_cB].append(token)
+        neg_cB = -cB
+        while neg_cB >= len(cBpack):
+            cBpack.append([])
+        cBpack[neg_cB].append(token)
     for sublist in cBpack:
         sublist.sort()
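For reference, a minimal sketch (not the package's own `read_cBpack`) of how this list-of-lists maps back to frequencies; the example token is made up:

    # Bucket i of a cBpack holds every token whose frequency rounds to
    # -i centibels, i.e. roughly 10 ** (-i / 100).
    def frequency_at(cBpack, token):
        for neg_cB, bucket in enumerate(cBpack):
            if token in bucket:
                return 10 ** (-neg_cB / 100)
        return 0.0

    example = [[] for _ in range(301)]
    example[300].append('example')           # a token stored at -300 cB
    print(frequency_at(example, 'example'))  # 0.001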
@@ -88,7 +95,7 @@ def merge_freqs(freq_dicts):
     """
     vocab = set()
     for freq_dict in freq_dicts:
-        vocab |= set(freq_dict)
+        vocab.update(freq_dict)
     merged = defaultdict(float)
     N = len(freq_dicts)