From 815d393b74918cb6f700b05f2c508078754bcfbd Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 15:22:04 -0400 Subject: [PATCH 01/16] move commands into cli/ directory --- wordfreq_builder/wordfreq_builder/cli/__init__.py | 0 .../{cmd_combine_lists.py => cli/combine_lists.py} | 0 .../{cmd_count_twitter.py => cli/count_twitter.py} | 0 .../{cmd_count_wikipedia.py => cli/count_wikipedia.py} | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/__init__.py rename wordfreq_builder/wordfreq_builder/{cmd_combine_lists.py => cli/combine_lists.py} (100%) rename wordfreq_builder/wordfreq_builder/{cmd_count_twitter.py => cli/count_twitter.py} (100%) rename wordfreq_builder/wordfreq_builder/{cmd_count_wikipedia.py => cli/count_wikipedia.py} (100%) diff --git a/wordfreq_builder/wordfreq_builder/cli/__init__.py b/wordfreq_builder/wordfreq_builder/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_combine_lists.py rename to wordfreq_builder/wordfreq_builder/cli/combine_lists.py diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py b/wordfreq_builder/wordfreq_builder/cli/count_twitter.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_count_twitter.py rename to wordfreq_builder/wordfreq_builder/cli/count_twitter.py diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py rename to wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py From 14e445a937915ab801544f7542b1fbf7b7985649 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 15:59:06 -0400 Subject: [PATCH 02/16] WIP on Ninja build automation --- wordfreq_builder/rules.ninja | 28 ++++++++++++++++++++++++++++ wordfreq_builder/wordfreq.cfg | 4 ++++ 2 files changed, 32 insertions(+) create mode 100644 wordfreq_builder/rules.ninja create mode 100644 wordfreq_builder/wordfreq.cfg diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja new file mode 100644 index 0000000..ac8f386 --- /dev/null +++ b/wordfreq_builder/rules.ninja @@ -0,0 +1,28 @@ +# This defines the rules on how to build parts of the wordfreq lists, using the +# Ninja build system: +# +# http://martine.github.io/ninja/manual.html +# +# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with +# better parallelism and the ability for build steps to produce multiple +# outputs. The tradeoff is that its rule syntax isn't full of magic for +# expanding wildcards and finding dependencies, so in general you have to +# write the dependencies using a script. +# +# This file will become the header of the larger build.ninja file, which also +# contains the programatically-defined dependency graph. + +# Variables +DATA = ./data + +# Splits the single file $in into $slices parts, whose names will be +# $prefix plus a two-digit numeric suffix. +rule split + command = split -d -n $slices $in $prefix + +# wiki2text is a tool I wrote using the development version of Nim, which +# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. +# The code is at https://github.com/rspeer/wiki2text, but right now it'll +# take a bit of setup to get it to run. 
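+# As an illustration only (hypothetical paths; the real statements are
+# generated programmatically into build.ninja, as described above), a
+# dependency that uses this rule could look like:
+#
+#   build data/generated/wikipedia/wikipedia_en.txt: wiki2text data/raw-input/wikipedia/enwiki-20150403-pages-articles.xml.bz2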
+rule wiki2text + command = bunzip2 -c $in | wiki2text > $out diff --git a/wordfreq_builder/wordfreq.cfg b/wordfreq_builder/wordfreq.cfg new file mode 100644 index 0000000..3146062 --- /dev/null +++ b/wordfreq_builder/wordfreq.cfg @@ -0,0 +1,4 @@ +[wordfreq] +version = 0.8 +data_dir = ./data +languages = en, es, fr, de, pt, nl, ru, it, ko, ja, zh-TW, zh-CN, ar, ms From 4dae2f8caff82dbb50f924e817a612704841a8a9 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 17:13:58 -0400 Subject: [PATCH 03/16] define some ninja rules --- wordfreq_builder/rules.ninja | 12 +++- wordfreq_builder/wordfreq_builder/config.py | 14 ++++ wordfreq_builder/wordfreq_builder/ninja.py | 69 +++++++++++++++++++ .../wordfreq_builder/tokenizers.py | 52 ++++++++++---- 4 files changed, 129 insertions(+), 18 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/config.py create mode 100644 wordfreq_builder/wordfreq_builder/ninja.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index ac8f386..2810f1e 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -2,7 +2,7 @@ # Ninja build system: # # http://martine.github.io/ninja/manual.html -# +# # Ninja is available in the 'ninja-build' Ubuntu package. It's like make with # better parallelism and the ability for build steps to produce multiple # outputs. The tradeoff is that its rule syntax isn't full of magic for @@ -18,11 +18,17 @@ DATA = ./data # Splits the single file $in into $slices parts, whose names will be # $prefix plus a two-digit numeric suffix. rule split - command = split -d -n $slices $in $prefix + command = mkdir -p $$(dirname $prefix) && split -d -n $slices $in $prefix # wiki2text is a tool I wrote using the development version of Nim, which # extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. # The code is at https://github.com/rspeer/wiki2text, but right now it'll # take a bit of setup to get it to run. rule wiki2text - command = bunzip2 -c $in | wiki2text > $out + command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out + +rule tokenize_twitter + command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix + +rule cat + command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py new file mode 100644 index 0000000..829e07a --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -0,0 +1,14 @@ +import os + +CONFIG = { + 'version': '0.8', + 'data_dir': './data', + 'languages': [ + 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', + 'ja', 'ko', 'zh-TW', 'zh-CN', + ] +} + + +def data_filename(filename): + return os.path.join(CONFIG['data_dir'], filename) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py new file mode 100644 index 0000000..21ff6c7 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -0,0 +1,69 @@ +from wordfreq_builder.config import CONFIG, data_filename +import sys + +HEADER = """# This file is automatically generated. Do not edit it. +# You can regenerate it using the 'wordfreq-build-deps' command. +""" + + +def make_ninja_deps(rules_filename, out=sys.stdout): + """ + Output a complete Ninja file describing how to build the wordfreq data. 
+ """ + print(HEADER, file=out) + # Copy in the rules section + with open(rules_filename, encoding='utf-8') as rulesfile: + print(rulesfile.read(), file=out) + + language_detect_and_tokenize_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix='slices/twitter/tweets-2014', + combined_prefix='generated/twitter/tweets-2014', + out=out, slices=10 + ) + + +def language_detect_and_tokenize_deps(input_filename, slice_prefix, + combined_prefix, out, slices=10): + lines = [] + # split the input into slices + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + for num in range(slices)] + build_rule = "build {outs}: split {ins}".format( + outs=' '.join(slice_files), ins=input_filename + ) + lines.append(build_rule) + lines.append(" prefix = {}".format(slice_prefix)) + lines.append(" slices = {}".format(slices)) + lines.append("") + + for slicenum in range(slices): + slice_file = slice_files[slicenum] + language_outputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) + for language in CONFIG['languages'] + ] + build_rule = "build {outs}: tokenize_twitter {ins}".format( + outs=' '.join(language_outputs), ins=slice_file + ) + lines.append(build_rule) + lines.append(" prefix = {}".format(slice_file)) + lines.append("") + + for language in CONFIG['languages']: + combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) + language_inputs = [ + '{prefix}.{lang}'.format(prefix=slice_files[slicenum], lang=language) + for slicenum in range(slices) + ] + build_rule = "build {outs}: cat {ins}".format( + outs=combined_output, + ins=' '.join(language_inputs) + ) + lines.append(build_rule) + + print('\n'.join(lines), file=out) + + +if __name__ == '__main__': + make_ninja_deps('rules.ninja') diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index e20c96f..4a1117f 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -5,29 +5,47 @@ import re ROSETTE = RosetteReader() -def rosette_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) - # I'm aware this doesn't do the right things with multi-word stems. - # Wordfreq doesn't either. And wordfreq isn't designed to look up - # multiple words anyway. - tokens = [] - for (stem, pos, span) in analysis: - for subtoken in stem.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens +# Rosette labels the orthographies of Chinese incorrectly +ROSETTE_LANG_MAP = { + 'zh_sc': 'zh-CN', + 'zh_tc': 'zh-TW' +} + + +def last_tab(line): + """ + Read lines by keeping only the last tab-separated value. 
+ """ + return line.split('\t')[-1].strip() + + +def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + out_files = {} + for line in open(in_filename, encoding='utf-8'): + text = line_reader(line) + tokenized, language = tokenizer(text) + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) + for out_file in out_files.values(): + out_file.close() def rosette_surface_tokenizer(text): analysis, lang = ROSETTE.rosette.analyze(text) + language = ROSETTE_LANG_MAP.get(lang, lang) tokens = [] for (stem, pos, span) in analysis: surface_text = text[span[0]:span[1]] - for subtoken in surface_text.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens + tokens.append(surface_text) + return ' '.join(tokens), language -def treebank_surface_tokenizer(text): +def treebank_surface_tokenizer(text, language='en'): """ This is a simplified version of the Treebank tokenizer in NLTK. @@ -45,6 +63,10 @@ def treebank_surface_tokenizer(text): as a result -- for example, it splits "wanna" into "wan" and "na", which are supposed to be considered unusual surface forms of "want" and "to". We just leave it as the word "wanna". + + The language will just be returned, as this function isn't doing any + language detection. It defaults to 'en', as English is the language that + Treebank tokenization is designed for. """ #starting quotes text = re.sub(r'^\"', r'``', text) @@ -80,4 +102,4 @@ def treebank_surface_tokenizer(text): text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", text) - return text.split() + return text.split(), language From 2a1b16b55c66b3f2693c6db3e5c53a0768602098 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 13:02:58 -0400 Subject: [PATCH 04/16] use script codes for Chinese --- wordfreq_builder/wordfreq_builder/config.py | 2 +- wordfreq_builder/wordfreq_builder/tokenizers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 829e07a..527c4e9 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -5,7 +5,7 @@ CONFIG = { 'data_dir': './data', 'languages': [ 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko', 'zh-TW', 'zh-CN', + 'ja', 'ko', 'zh-Hans', 'zh-Hant', ] } diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 4a1117f..9140cf4 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -7,8 +7,8 @@ ROSETTE = RosetteReader() # Rosette labels the orthographies of Chinese incorrectly ROSETTE_LANG_MAP = { - 'zh_sc': 'zh-CN', - 'zh_tc': 'zh-TW' + 'zh_sc': 'zh-Hans', + 'zh_tc': 'zh-Hant' } From 5437bb4e8598ce3a28c81a871052ebe657921133 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 16:24:28 -0400 Subject: [PATCH 05/16] WIP on new build system --- wordfreq_builder/.gitignore | 4 +++ wordfreq_builder/Makefile | 12 ++++++++ wordfreq_builder/rules.ninja | 2 +- wordfreq_builder/setup.py | 6 ++++ .../wordfreq_builder/cli/build_deps.py | 15 ++++++++++ .../wordfreq_builder/cli/tokenize_twitter.py | 19 +++++++++++++ wordfreq_builder/wordfreq_builder/config.py | 8 ++++-- wordfreq_builder/wordfreq_builder/ninja.py 
| 20 +++++++------ .../wordfreq_builder/tokenizers.py | 28 ++++++++++++------- 9 files changed, 92 insertions(+), 22 deletions(-) create mode 100644 wordfreq_builder/Makefile create mode 100644 wordfreq_builder/wordfreq_builder/cli/build_deps.py create mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore index 46c58ff..a1da2e9 100644 --- a/wordfreq_builder/.gitignore +++ b/wordfreq_builder/.gitignore @@ -6,3 +6,7 @@ dist *.egg-info build _build +build.ninja +data +.ninja_deps +.ninja_log diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile new file mode 100644 index 0000000..0820c5c --- /dev/null +++ b/wordfreq_builder/Makefile @@ -0,0 +1,12 @@ +PYTHON = python + +all: build.ninja + +# make sure this package is in 'develop' mode and up to date +wordfreq_builder.egg-info/PKG-INFO: setup.py + $(PYTHON) setup.py develop + +# build the Ninja file that will take over the build process +build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO + wordfreq-build-deps rules.ninja > build.ninja + diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 2810f1e..0355a0f 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -18,7 +18,7 @@ DATA = ./data # Splits the single file $in into $slices parts, whose names will be # $prefix plus a two-digit numeric suffix. rule split - command = mkdir -p $$(dirname $prefix) && split -d -n $slices $in $prefix + command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix # wiki2text is a tool I wrote using the development version of Nim, which # extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. 
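A note on the change to the 'split' rule above: with GNU coreutils, '-n N'
splits the input into N chunks by byte count and can cut a line (here, a
tweet) in half, while '-n r/N' deals out whole lines round-robin, so no line
is ever split across slices. A minimal sketch of the two modes, with
hypothetical filenames:

    split -d -n 3   all-2014.txt tweets.part    # byte-sized chunks; may split a line
    split -d -n r/3 all-2014.txt tweets.part    # whole lines, distributed round-robin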
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index e57c58e..1de97b5 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,4 +9,10 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], + entry_points={ + 'console_scripts': [ + 'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main', + 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' + ] + } ) diff --git a/wordfreq_builder/wordfreq_builder/cli/build_deps.py b/wordfreq_builder/wordfreq_builder/cli/build_deps.py new file mode 100644 index 0000000..3fd74ad --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/build_deps.py @@ -0,0 +1,15 @@ +from wordfreq_builder.ninja import make_ninja_deps +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of rules file') + args = parser.parse_args() + + # Make the complete ninja file and write it to standard out + make_ninja_deps(args.in_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py new file mode 100644 index 0000000..aa1e061 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py @@ -0,0 +1,19 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file +import argparse + + +def tokenize_twitter(in_filename, out_prefix): + tokenize_file(in_filename, out_prefix, + tokenizer=rosette_surface_tokenizer) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='filename of input file containing one tweet per line') + parser.add_argument('outprefix', help='prefix of output filenames') + args = parser.parse_args() + tokenize_twitter(args.filename, args.outprefix) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 527c4e9..c3e5cff 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -1,12 +1,14 @@ import os CONFIG = { - 'version': '0.8', - 'data_dir': './data', + 'version': '0.9.0', + # data_dir is a relative or absolute path to where the wordlist data + # is stored + 'data_dir': 'data', 'languages': [ 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', 'ja', 'ko', 'zh-Hans', 'zh-Hant', - ] + ], } diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 21ff6c7..19d39d2 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -17,14 +17,14 @@ def make_ninja_deps(rules_filename, out=sys.stdout): language_detect_and_tokenize_deps( data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix='slices/twitter/tweets-2014', - combined_prefix='generated/twitter/tweets-2014', - out=out, slices=10 + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + out=out, slices=40 ) def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, out, slices=10): + combined_prefix, out, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) @@ -33,7 +33,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, outs=' '.join(slice_files), ins=input_filename ) 
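    # For illustration, with the defaults used in make_ninja_deps this builds a
    # line roughly like (paths abbreviated):
    #   build data/slices/twitter/tweets-2014.part00 ... .part39: split data/raw-input/twitter/all-2014.txt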
lines.append(build_rule) - lines.append(" prefix = {}".format(slice_prefix)) + lines.append(" prefix = {}.part".format(slice_prefix)) lines.append(" slices = {}".format(slices)) lines.append("") @@ -43,7 +43,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in CONFIG['languages'] ] - build_rule = "build {outs}: tokenize_twitter {ins}".format( + build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format( outs=' '.join(language_outputs), ins=slice_file ) lines.append(build_rule) @@ -53,7 +53,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, for language in CONFIG['languages']: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) language_inputs = [ - '{prefix}.{lang}'.format(prefix=slice_files[slicenum], lang=language) + '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) ] build_rule = "build {outs}: cat {ins}".format( @@ -65,5 +65,9 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, print('\n'.join(lines), file=out) -if __name__ == '__main__': +def main(): make_ninja_deps('rules.ninja') + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 9140cf4..d8df0f7 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -5,10 +5,13 @@ import re ROSETTE = RosetteReader() -# Rosette labels the orthographies of Chinese incorrectly +# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should +# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette +# really means is "Simplified Chinese", whose code is 'zh-Hans'. ROSETTE_LANG_MAP = { 'zh_sc': 'zh-Hans', - 'zh_tc': 'zh-Hant' + 'zh_tc': 'zh-Hant', + 'en_uc': 'en', } @@ -24,19 +27,24 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): for line in open(in_filename, encoding='utf-8'): text = line_reader(line) tokenized, language = tokenizer(text) - out_filename = '%s.%s.txt' % (out_prefix, language) - if out_filename in out_files: - out_file = out_files[out_filename] - else: - out_file = open(out_filename, 'w', encoding='utf-8') - out_files[out_filename] = out_file - print(tokenized, file=out_file) + if language is not None: + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) for out_file in out_files.values(): out_file.close() def rosette_surface_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) + try: + analysis, lang = ROSETTE.rosette.analyze(text) + except (RuntimeError, UnicodeError) as e: + # Our Rosette interface throws errors given arbitrary data. 
:( + return text, None language = ROSETTE_LANG_MAP.get(lang, lang) tokens = [] for (stem, pos, span) in analysis: From 34400de35a57e5a7b73d99f3fde0aa4021f1a03d Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 16:25:42 -0400 Subject: [PATCH 06/16] not using wordfreq.cfg anymore --- wordfreq_builder/wordfreq.cfg | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 wordfreq_builder/wordfreq.cfg diff --git a/wordfreq_builder/wordfreq.cfg b/wordfreq_builder/wordfreq.cfg deleted file mode 100644 index 3146062..0000000 --- a/wordfreq_builder/wordfreq.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[wordfreq] -version = 0.8 -data_dir = ./data -languages = en, es, fr, de, pt, nl, ru, it, ko, ja, zh-TW, zh-CN, ar, ms From 61b9440e3d0960db04edd591ec977de331bb6f26 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Mon, 4 May 2015 13:25:01 -0400 Subject: [PATCH 07/16] add wiki-parsing process --- wordfreq_builder/wordfreq_builder/config.py | 5 +++ wordfreq_builder/wordfreq_builder/ninja.py | 40 +++++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index c3e5cff..bb19bdf 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -9,6 +9,11 @@ CONFIG = { 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', 'ja', 'ko', 'zh-Hans', 'zh-Hant', ], + # Skip the Chinese Wikipedia until we know what to do with it + 'wp_languages': [ + 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', + 'ja', 'ko' + ] } diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 19d39d2..f3abb9c 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,5 +1,6 @@ from wordfreq_builder.config import CONFIG, data_filename import sys +import pathlib HEADER = """# This file is automatically generated. Do not edit it. # You can regenerate it using the 'wordfreq-build-deps' command. 
@@ -15,16 +16,41 @@ def make_ninja_deps(rules_filename, out=sys.stdout): with open(rules_filename, encoding='utf-8') as rulesfile: print(rulesfile.read(), file=out) - language_detect_and_tokenize_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), - out=out, slices=40 + lines = ( + language_detect_and_tokenize_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + slices=40 + ) + + wiki_parse_deps( + data_filename('raw-input/wikipedia'), + data_filename('generated/wikipedia'), + CONFIG['wp_languages'] + ) ) + print('\n'.join(lines), file=out) + + +def wiki_parse_deps(dirname_in, dirname_out, languages): + lines = [] + path_in = pathlib.Path(dirname_in) + path_out = pathlib.Path(dirname_out) + for language in languages: + # Find the most recent file for this language + input_file = max(path_in.glob( + '{}wiki*.bz2'.format(language) + )) + output_file = path_out / 'wikipedia_{}.txt'.format(language) + build_rule = "build {outs}: wiki2text {ins}".format( + outs=output_file, ins=input_file + ) + lines.append(build_rule) + return lines def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, out, slices): + combined_prefix, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) @@ -62,7 +88,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, ) lines.append(build_rule) - print('\n'.join(lines), file=out) + return lines def main(): From 5787b6bb73cda3458d270e06c0813a2e15c415eb Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 13:59:21 -0400 Subject: [PATCH 08/16] add and adjust some build steps - more build steps for Wikipedia - rename 'tokenize_twitter' to 'pretokenize_twitter' to indicate that the results are preliminary --- wordfreq_builder/rules.ninja | 23 ++++++++-- wordfreq_builder/setup.py | 3 +- ...nize_twitter.py => pretokenize_twitter.py} | 10 ++--- .../cli/tokenize_wikipedia.py | 30 +++++++++++++ wordfreq_builder/wordfreq_builder/ninja.py | 7 +++- .../wordfreq_builder/tokenizers.py | 42 +++++++++++++++++-- 6 files changed, 101 insertions(+), 14 deletions(-) rename wordfreq_builder/wordfreq_builder/cli/{tokenize_twitter.py => pretokenize_twitter.py} (62%) create mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 0355a0f..9be5bd1 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -20,15 +20,30 @@ DATA = ./data rule split command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix -# wiki2text is a tool I wrote using the development version of Nim, which -# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. -# The code is at https://github.com/rspeer/wiki2text, but right now it'll -# take a bit of setup to get it to run. +# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from +# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at +# https://github.com/rspeer/wiki2text. 
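+#
+# (The 'wiki2tokens' rule added below passes '-t' to wiki2text; judging from
+# the 'count' rule further down, which expects one token per line, the '-t'
+# option asks wiki2text for tokenized output instead of running text.)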
rule wiki2text command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out +rule wiki2tokens + command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out + rule tokenize_twitter command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix +# This rule uses command-line tools to take in a file with one token per line, +# and output a comma-separated file with the token counts: +# +# * 'sort $in | uniq -c' does the actual counting. +# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field +# (the count). +# * The 'sed' command rearranges the lines to be comma-separated values with +# the count coming second, instead of the count being a right-justified +# number at the start of the line. +# +rule count + command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out + rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 1de97b5..3f3b902 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -11,7 +11,8 @@ setup( packages=['wordfreq_builder'], entry_points={ 'console_scripts': [ - 'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main', + 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main', 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' ] } diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py similarity index 62% rename from wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py rename to wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py index aa1e061..c179988 100644 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py +++ b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py @@ -1,10 +1,10 @@ -from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file import argparse -def tokenize_twitter(in_filename, out_prefix): - tokenize_file(in_filename, out_prefix, - tokenizer=rosette_surface_tokenizer) +def pretokenize_twitter(in_filename, out_prefix): + pretokenize_file(in_filename, out_prefix, + tokenizer=rosette_surface_tokenizer) def main(): @@ -12,7 +12,7 @@ def main(): parser.add_argument('filename', help='filename of input file containing one tweet per line') parser.add_argument('outprefix', help='prefix of output filenames') args = parser.parse_args() - tokenize_twitter(args.filename, args.outprefix) + pretokenize_twitter(args.filename, args.outprefix) if __name__ == '__main__': diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py new file mode 100644 index 0000000..fa97543 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py @@ -0,0 +1,30 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file +import argparse + + +def tokenize_wikipedia(in_filename, out_filename, language, proportion): + monolingual_tokenize_file( + in_filename, out_filename, + language=language, + tokenizer=rosette_surface_tokenizer, + line_reader=strip_headings, + sample_proportion=proportion + ) + + +def strip_headings(text): + return text.strip().strip('=') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', 
help='filename of input file') + parser.add_argument('out_filename', help='filename of output file') + parser.add_argument('language', help='the language code of the text') + parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100) + args = parser.parse_args() + tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index f3abb9c..ab74ec8 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -46,6 +46,11 @@ def wiki_parse_deps(dirname_in, dirname_out, languages): outs=output_file, ins=input_file ) lines.append(build_rule) + output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language) + build_rule = "build {outs}: wiki2tokens {ins}".format( + outs=output_file, ins=input_file + ) + lines.append(build_rule) return lines @@ -69,7 +74,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in CONFIG['languages'] ] - build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format( + build_rule = "build {outs}: tokenize_twitter {ins}".format( outs=' '.join(language_outputs), ins=slice_file ) lines.append(build_rule) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index d8df0f7..29be251 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,5 +1,6 @@ from lumi_science.text_readers.rosette_readers import RosetteReader import re +import unicodedata ROSETTE = RosetteReader() @@ -15,6 +16,9 @@ ROSETTE_LANG_MAP = { } +NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]') + + def last_tab(line): """ Read lines by keeping only the last tab-separated value. @@ -22,11 +26,26 @@ def last_tab(line): return line.split('\t')[-1].strip() -def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): +def non_punct_filter(token): + if NON_PUNCT_RE.search(token): + return token.lower() + else: + return None + + +def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + """ + Process a file by running it through the given tokenizer, sorting the + results by the language of each line, and inserting spaces into lines + to mark the token boundaries. This computes the 'hard part' of + tokenization and allows the results to be saved, so that we can change + the finer details of the output without re-running everything. 
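+
+ The output for each detected language goes to a separate file named
+ '<out_prefix>.<language>.txt', as constructed for out_filename below.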
+ """ out_files = {} for line in open(in_filename, encoding='utf-8'): text = line_reader(line) - tokenized, language = tokenizer(text) + tokens, language = tokenizer(text) + tokenized = ' '.join(tokens) if language is not None: out_filename = '%s.%s.txt' % (out_prefix, language) if out_filename in out_files: @@ -39,6 +58,23 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_file.close() +def monolingual_tokenize_file(in_filename, out_filename, language, + tokenizer, line_reader=last_tab, + token_filter=non_punct_filter, + sample_proportion=100): + with open(in_filename, encoding='utf-8', errors='replace') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for i, line in enumerate(in_file): + if i % sample_proportion == 0: + text = line_reader(line) + tokens, line_language = tokenizer(text) + if line_language == language: + filtered = [token_filter(t) for t in tokens] + filtered = [t for t in filtered if t is not None] + for token in filtered: + print(token, file=out_file) + + def rosette_surface_tokenizer(text): try: analysis, lang = ROSETTE.rosette.analyze(text) @@ -50,7 +86,7 @@ def rosette_surface_tokenizer(text): for (stem, pos, span) in analysis: surface_text = text[span[0]:span[1]] tokens.append(surface_text) - return ' '.join(tokens), language + return tokens, language def treebank_surface_tokenizer(text, language='en'): From bd579e231915b5b9834272aec4ce4982eb6f7466 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 14:06:13 -0400 Subject: [PATCH 09/16] fix the 'count' ninja rule --- wordfreq_builder/rules.ninja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 9be5bd1..032d80a 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -43,7 +43,7 @@ rule tokenize_twitter # number at the start of the line. # rule count - command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out + command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out rule cat command = cat $in > $out From 16928ed182c8dd7196ac83223abe9d1aacd1792e Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 15:21:24 -0400 Subject: [PATCH 10/16] add rules to count wikipedia tokens --- wordfreq_builder/rules.ninja | 8 ++++++-- wordfreq_builder/wordfreq_builder/ninja.py | 21 +++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 032d80a..a1dc1c7 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -35,7 +35,11 @@ rule tokenize_twitter # This rule uses command-line tools to take in a file with one token per line, # and output a comma-separated file with the token counts: # -# * 'sort $in | uniq -c' does the actual counting. +# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte +# order, which is fine because we only need this order so we can run +# 'uniq'. +# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly +# large amount of temporary output goes in $tmp. # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field # (the count). # * The 'sed' command rearranges the lines to be comma-separated values with @@ -43,7 +47,7 @@ rule tokenize_twitter # number at the start of the line. 
# rule count - command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out + command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ab74ec8..1c58154 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -5,6 +5,7 @@ import pathlib HEADER = """# This file is automatically generated. Do not edit it. # You can regenerate it using the 'wordfreq-build-deps' command. """ +TMPDIR = data_filename('tmp') def make_ninja_deps(rules_filename, out=sys.stdout): @@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout): print(rulesfile.read(), file=out) lines = ( - language_detect_and_tokenize_deps( + twitter_deps( data_filename('raw-input/twitter/all-2014.txt'), slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), + combined_prefix=data_filename('intermediate/twitter/tweets-2014'), slices=40 ) + - wiki_parse_deps( + wikipedia_deps( data_filename('raw-input/wikipedia'), data_filename('generated/wikipedia'), CONFIG['wp_languages'] @@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout): print('\n'.join(lines), file=out) -def wiki_parse_deps(dirname_in, dirname_out, languages): +def wikipedia_deps(dirname_in, dirname_out, languages): lines = [] path_in = pathlib.Path(dirname_in) path_out = pathlib.Path(dirname_out) @@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages): outs=output_file, ins=input_file ) lines.append(build_rule) + + token_file = output_file + output_file = path_out / 'wikipedia_{}.counts.txt'.format(language) + build_rule = "build {outs}: count {ins}".format( + outs=output_file, ins=token_file + ) + lines.append(build_rule) + lines.append(" tmp = {}".format(TMPDIR)) return lines -def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, slices): +def twitter_deps(input_filename, slice_prefix, + combined_prefix, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) From d2f9c607764c4ebdede2a1ef81ed62d9000367d7 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 16:49:53 -0400 Subject: [PATCH 11/16] WIP on more build steps --- wordfreq_builder/Makefile | 2 +- wordfreq_builder/rules.ninja | 38 +++-- wordfreq_builder/setup.py | 1 + .../wordfreq_builder/cli/combine_lists.py | 16 +- .../wordfreq_builder/cli/count_tokens.py | 16 ++ .../wordfreq_builder/cli/count_twitter.py | 28 ---- .../wordfreq_builder/cli/count_wikipedia.py | 24 --- .../wordfreq_builder/cli/format_twitter.py | 14 ++ wordfreq_builder/wordfreq_builder/config.py | 53 +++++-- wordfreq_builder/wordfreq_builder/ninja.py | 74 ++++++--- .../wordfreq_builder/tokenizers.py | 66 +++++++- .../wordfreq_builder/word_counts.py | 146 ++++++------------ 12 files changed, 268 insertions(+), 210 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/count_tokens.py delete mode 100644 wordfreq_builder/wordfreq_builder/cli/count_twitter.py delete mode 100644 wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py create mode 100644 wordfreq_builder/wordfreq_builder/cli/format_twitter.py diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile index 0820c5c..206c432 100644 --- 
a/wordfreq_builder/Makefile +++ b/wordfreq_builder/Makefile @@ -8,5 +8,5 @@ wordfreq_builder.egg-info/PKG-INFO: setup.py # build the Ninja file that will take over the build process build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO - wordfreq-build-deps rules.ninja > build.ninja + $(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index a1dc1c7..6b6c018 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -29,25 +29,29 @@ rule wiki2text rule wiki2tokens command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out -rule tokenize_twitter - command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix +rule tokenize_japanese + command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" + +rule tokenize_twitter + command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix + +rule format_twitter + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out + +# To convert the Leeds corpus, look for space-separated lines that start with +# an integer and a decimal. The integer is the rank, which we discard. The +# decimal is the frequency, and the remaining text is the term. Use sed -n +# with /p to output only lines where the match was successful. +rule convert_leeds + command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out + +# To convert the OpenSubtitles frequency data, simply replace spaces with +# commas. +rule convert_opensubtitles + command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out -# This rule uses command-line tools to take in a file with one token per line, -# and output a comma-separated file with the token counts: -# -# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte -# order, which is fine because we only need this order so we can run -# 'uniq'. -# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly -# large amount of temporary output goes in $tmp. -# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field -# (the count). -# * The 'sed' command rearranges the lines to be comma-separated values with -# the count coming second, instead of the count being a right-justified -# number at the start of the line. 
-# rule count - command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 3f3b902..88b6d49 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -12,6 +12,7 @@ setup( entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', 'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main', 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' ] diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py index 7b67375..61d9674 100644 --- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -1,21 +1,19 @@ -from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts -from pathlib import Path +from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist import argparse -def merge_lists(input_names, output_name, balance=False): - count_dicts = [] +def merge_lists(input_names, output_name): + freq_dicts = [] for input_name in input_names: - count_dicts.append(read_counts(Path(input_name))) - merged = merge_counts(count_dicts, balance=balance) - write_counts(merged, Path(output_name)) + freq_dicts.append(read_freqs(input_name)) + merged = merge_freqs(freq_dicts) + write_wordlist(merged, output_name) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') - parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies') parser.add_argument('inputs', help='names of input files to merge', nargs='+') args = parser.parse_args() - merge_lists(args.inputs, args.output, balance=args.balance) + merge_lists(args.inputs, args.output) diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py new file mode 100644 index 0000000..4aeba5b --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -0,0 +1,16 @@ +from wordfreq_builder.word_counts import count_tokens, write_wordlist +import argparse + + +def handle_counts(filename_in, filename_out): + counts = count_tokens(filename_in) + write_wordlist(counts, filename_out) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + handle_counts(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/cli/count_twitter.py b/wordfreq_builder/wordfreq_builder/cli/count_twitter.py deleted file mode 100644 index 7613d2d..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_twitter.py +++ /dev/null @@ -1,28 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_twitter(pathname, offset=0, nsplit=1, surface=False): - path 
= Path(pathname) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer) - save_filename = 'twitter-counts-%d.csv' % offset - save_pathname = path.parent / save_filename - builder.count_twitter(path, offset, nsplit) - builder.save_wordlist(save_pathname) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='filename of input file containing one tweet per line') - parser.add_argument('offset', type=int) - parser.add_argument('nsplit', type=int) - parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems') - args = parser.parse_args() - count_twitter(args.filename, args.offset, args.nsplit, surface=args.surface) - diff --git a/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py deleted file mode 100644 index da51519..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py +++ /dev/null @@ -1,24 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_wikipedia(filename, surface=False): - path = Path(filename) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False) - builder.count_wikipedia(path) - builder.save_wordlist(path.parent / 'counts.csv') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='flat text file containing extracted Wikipedia text') - parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems') - args = parser.parse_args() - count_wikipedia(args.filename, surface=args.surface) - diff --git a/wordfreq_builder/wordfreq_builder/cli/format_twitter.py b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py new file mode 100644 index 0000000..224c5a1 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py @@ -0,0 +1,14 @@ +from wordfreq_builder.tokenizers import retokenize_file +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of input file containing one tweet per line') + parser.add_argument('out_filename', help='filename of output file') + args = parser.parse_args() + retokenize_file(args.in_filename, args.out_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index bb19bdf..ec62634 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -5,17 +5,52 @@ CONFIG = { # data_dir is a relative or absolute path to where the wordlist data # is stored 'data_dir': 'data', - 'languages': [ - 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko', 'zh-Hans', 'zh-Hant', - ], - # Skip the Chinese Wikipedia until we know what to do with it - 'wp_languages': [ - 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko' - ] + 'sources': { + # A list of language codes (possibly un-standardized) that we'll + # look up in filenames for these various data sources. 
+ 'twitter': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru' + ], + 'wikipedia': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru' + ], + 'opensubtitles': [ + # All languages where the most common word in OpenSubtitles + # appears at least 5000 times + 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', + 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv', + 'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr', + 'sv', 'tr', 'uk', 'zh' + ], + 'leeds': [ + 'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh' + ] + }, + 'wordlist_paths': { + 'twitter': 'generated/twitter/tweets-2014.{lang}{ext}.txt', + 'wikipedia': 'generated/wikipedia/wikipedia_{lang}{ext}.txt', + 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}{ext}.txt', + 'leeds': 'generated/leeds/leeds_internet_{lang}{ext}.txt' + } } def data_filename(filename): return os.path.join(CONFIG['data_dir'], filename) + + +def wordlist_filename(source, language, extension=''): + path = CONFIG['wordlist_paths'][source].format( + lang=language, ext=extension + ) + return data_filename(path) + + +def source_names(language): + """ + Get the names of data sources that supply data for the given language. + """ + return sorted([key for key in CONFIG['sources'] + if language in CONFIG['sources'][key]]) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 1c58154..3770c41 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,4 +1,4 @@ -from wordfreq_builder.config import CONFIG, data_filename +from wordfreq_builder.config import CONFIG, data_filename, wordlist_filename import sys import pathlib @@ -8,6 +8,10 @@ HEADER = """# This file is automatically generated. Do not edit it. TMPDIR = data_filename('tmp') +# Set this to True to rebuild the Twitter tokenization (which takes days) +PRETOKENIZE_TWITTER = False + + def make_ninja_deps(rules_filename, out=sys.stdout): """ Output a complete Ninja file describing how to build the wordfreq data. 
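# (Illustration of the wordlist_filename() helper added to config.py in this
# patch: wordlist_filename('wikipedia', 'en', '.counts') fills in the template
# 'generated/wikipedia/wikipedia_{lang}{ext}.txt' and joins it with data_dir,
# giving 'data/generated/wikipedia/wikipedia_en.counts.txt'.)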
@@ -17,44 +21,53 @@ def make_ninja_deps(rules_filename, out=sys.stdout): with open(rules_filename, encoding='utf-8') as rulesfile: print(rulesfile.read(), file=out) - lines = ( + lines = [] + if PRETOKENIZE_TWITTER: + lines.extend( + twitter_preprocess_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('intermediate/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( twitter_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('intermediate/twitter/tweets-2014'), - slices=40 - ) + + data_filename('intermediate/twitter/tweets-2014'), + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( wikipedia_deps( data_filename('raw-input/wikipedia'), - data_filename('generated/wikipedia'), - CONFIG['wp_languages'] + CONFIG['sources']['wikipedia'] ) ) print('\n'.join(lines), file=out) -def wikipedia_deps(dirname_in, dirname_out, languages): +def wikipedia_deps(dirname_in, languages): lines = [] path_in = pathlib.Path(dirname_in) - path_out = pathlib.Path(dirname_out) for language in languages: # Find the most recent file for this language input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - output_file = path_out / 'wikipedia_{}.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '') build_rule = "build {outs}: wiki2text {ins}".format( outs=output_file, ins=input_file ) lines.append(build_rule) - output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '.tokens') build_rule = "build {outs}: wiki2tokens {ins}".format( outs=output_file, ins=input_file ) lines.append(build_rule) token_file = output_file - output_file = path_out / 'wikipedia_{}.counts.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '.counts') build_rule = "build {outs}: count {ins}".format( outs=output_file, ins=token_file ) @@ -63,12 +76,13 @@ def wikipedia_deps(dirname_in, dirname_out, languages): return lines -def twitter_deps(input_filename, slice_prefix, - combined_prefix, slices): +def twitter_preprocess_deps(input_filename, slice_prefix, + combined_prefix, slices, languages): lines = [] - # split the input into slices + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) for num in range(slices)] + # split the input into slices build_rule = "build {outs}: split {ins}".format( outs=' '.join(slice_files), ins=input_filename ) @@ -81,7 +95,7 @@ def twitter_deps(input_filename, slice_prefix, slice_file = slice_files[slicenum] language_outputs = [ '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) - for language in CONFIG['languages'] + for language in languages ] build_rule = "build {outs}: tokenize_twitter {ins}".format( outs=' '.join(language_outputs), ins=slice_file @@ -90,8 +104,9 @@ def twitter_deps(input_filename, slice_prefix, lines.append(" prefix = {}".format(slice_file)) lines.append("") - for language in CONFIG['languages']: + for language in languages: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) + language_inputs = [ '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) @@ -102,6 +117,27 @@ def twitter_deps(input_filename, slice_prefix, ) lines.append(build_rule) + +def twitter_deps(prefix_in, languages): + 
lines = [] + for language in languages: + input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) + output_file = wordlist_filename('twitter', language, '.tokens') + build_rule = "build {outs}: format_twitter {ins} | {deps}".format( + outs=output_file, + ins=input_file, + deps='wordfreq_builder/tokenizers.py' + ) + lines.append(build_rule) + + token_file = output_file + output_file = wordlist_filename('twitter', language, '.counts') + build_rule = "build {outs}: count {ins}".format( + outs=output_file, ins=token_file + ) + lines.append(build_rule) + lines.append(" tmp = {}".format(TMPDIR)) + return lines diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 29be251..e4ea914 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,6 +1,6 @@ from lumi_science.text_readers.rosette_readers import RosetteReader +from html.entities import name2codepoint import re -import unicodedata ROSETTE = RosetteReader() @@ -18,6 +18,9 @@ ROSETTE_LANG_MAP = { NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]') +EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF' +RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE)) + def last_tab(line): """ @@ -26,13 +29,17 @@ def last_tab(line): return line.split('\t')[-1].strip() -def non_punct_filter(token): +def lowercase_text_filter(token): if NON_PUNCT_RE.search(token): return token.lower() else: return None +def is_url(token): + return token.startswith('http:') or token.startswith('https:') + + def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): """ Process a file by running it through the given tokenizer, sorting the @@ -58,9 +65,60 @@ def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_file.close() +ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') + + +def fix_entities(text): + """ + Fix the few HTML entities that Twitter uses -- even if they've + already been tokenized. + """ + def replace_entity(match): + return chr(name2codepoint[match.group(1)]) + return ENTITY_RE.sub(replace_entity, text) + + +def retokenize(text): + text = fix_entities(text) + tokens = RETOKENIZE_RE.findall(text) + skip_next = False + for token in tokens: + if token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif skip_next: + skip_next = False + else: + if not is_url(token): + filtered = lowercase_text_filter(token) + if filtered: + yield filtered + + +def retokenize_file(in_filename, out_filename): + """ + Process a file that has been tokenized (by inserting spaces) in a + language-specific way by Rosette. 
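+
+ The output has one token per line: tokens are lowercased, URLs are dropped,
+ and tokens that follow '/' or '@' (pieces of URLs, and usernames) are
+ skipped, as implemented in retokenize() above.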
+ """ + with open(in_filename, encoding='utf-8') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for line in in_file: + skip_next = False + for token in retokenize(line.strip()): + if skip_next: + skip_next = False + elif token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif lowercase_text_filter(token): + print(token, file=out_file) + + def monolingual_tokenize_file(in_filename, out_filename, language, tokenizer, line_reader=last_tab, - token_filter=non_punct_filter, + token_filter=lowercase_text_filter, sample_proportion=100): with open(in_filename, encoding='utf-8', errors='replace') as in_file: with open(out_filename, 'w', encoding='utf-8') as out_file: @@ -78,7 +136,7 @@ def monolingual_tokenize_file(in_filename, out_filename, language, def rosette_surface_tokenizer(text): try: analysis, lang = ROSETTE.rosette.analyze(text) - except (RuntimeError, UnicodeError) as e: + except (RuntimeError, UnicodeError): # Our Rosette interface throws errors given arbitrary data. :( return text, None language = ROSETTE_LANG_MAP.get(lang, lang) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index a379e8e..b150ddd 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -1,116 +1,64 @@ -from wordfreq_builder.tokenizers import treebank_surface_tokenizer +from wordfreq_builder.tokenizers import retokenize from collections import defaultdict from operator import itemgetter -from pathlib import Path -from unicodedata import normalize +from ftfy import fix_text import csv -import sys -def read_counts(path): +def count_tokens(filename): counts = defaultdict(int) - with path.open(encoding='utf-8', newline='') as infile: - reader = csv.reader(infile) - for key, strval in reader: - val = float(strval) - # Use += so that, if we give the reader concatenated files with - # duplicates, it does the right thing - counts[key] += val + with open(filename, encoding='utf-8') as infile: + for line in infile: + for token in retokenize(line.strip()): + counts[token] += 1 return counts -def count_languages(counts): - langcounts = defaultdict(int) - for key, strval in counts.items(): - val = int(strval) - text, lang = key.rsplit('|', 1) - langcounts[lang] += val - return langcounts +def read_freqs(filename, cutoff=2): + raw_counts = defaultdict(float) + total = 0. + with open(filename, encoding='utf-8', newline='') as infile: + reader = csv.reader(infile) + for key, strval in reader: + val = float(strval) + if val < cutoff: + break + for token in retokenize(key): + token = fix_text(token) + total += val + # Use += so that, if we give the reader concatenated files with + # duplicates, it does the right thing + raw_counts[token] += val + + freqs = {key: raw_count / total + for (key, raw_count) in raw_counts.items()} + return freqs -def merge_counts(count_dicts, balance=False): +def merge_freqs(freq_dicts): + vocab = set() + for freq_dict in freq_dicts: + vocab |= set(freq_dict) + merged = defaultdict(float) - maxweight = None - for counts in count_dicts: - if balance: - if maxweight is None: - maxweight = max(counts.values()) - weight = maxweight / max(counts.values()) / len(count_dicts) - else: - weight = 1. - for key, val in counts.items(): - merged[key] += val * weight + N = len(freq_dicts) + for term in vocab: + term_total = 0. + for freq_dict in freq_dicts: + term_total += freq_dict.get(term, 0.) 
+ merged[term] = term_total / N + return merged -def write_counts(counts, path, cutoff=2): - print("Writing to %s" % path) - with path.open('w', encoding='utf-8', newline='') as outfile: +def write_wordlist(freqs, filename): + """ + Write a dictionary of either raw counts or frequencies to a file of + comma-separated values. + """ + with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: writer = csv.writer(outfile) - items = sorted(counts.items(), key=itemgetter(1), reverse=True) - for word, count in items: - if count < cutoff: - # Don't write all the terms that appeared too infrequently - break + items = sorted(freqs.items(), key=itemgetter(1), reverse=True) + for word, freq in items: if not ('"' in word or ',' in word): - writer.writerow([word, str(int(count))]) - - -class WordCountBuilder: - def __init__(self, unique_docs=True, tokenizer=None): - self.counts = defaultdict(int) - self.unique_docs = unique_docs - if tokenizer is None: - self.tokenizer = treebank_surface_tokenizer - else: - self.tokenizer = tokenizer - - def add_text(self, text): - text = normalize('NFKC', text).lower() - try: - tokens = self.tokenizer(text) - # print(' '.join(tokens)) - except Exception as e: - print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr) - return - if self.unique_docs: - tokens = set(tokens) - for tok in tokens: - self.counts[tok] += 1 - - def count_wikipedia(self, path): - """ - Read a directory of extracted Wikipedia articles. The articles can be - grouped together into files, in which case they should be separated by - lines beginning with ##. - """ - with path.open(encoding='utf-8') as file: - article_lines = [] - for line in file: - line = line.strip() - if line.startswith('= ') and line.endswith(' ='): - # Fake level-1 headings indicate boundaries between articles - print(line) - self.try_wiki_article(' '.join(article_lines)) - article_lines.clear() - else: - # Skip other headings, so that "external" doesn't look - # ridiculously common, for example - if not (line.startswith('==') and line.endswith('==')): - article_lines.append(line) - self.try_wiki_article(' '.join(article_lines)) - - def try_wiki_article(self, text): - if len(text) > 1000: - self.add_text(text) - - def count_twitter(self, path, offset, nsplit): - with path.open(encoding='utf-8') as file: - for i, line in enumerate(file): - if i % nsplit == offset: - line = line.strip() - text = line.split('\t')[-1] - self.add_text(text) - - def save_wordlist(self, path): - write_counts(self.counts, path) + writer.writerow([word, str(freq)]) From 7e238cf547391fed9d532cb79ca70041681bac17 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 16:59:28 -0400 Subject: [PATCH 12/16] abstract how we define build rules a bit --- wordfreq_builder/wordfreq_builder/ninja.py | 89 ++++++++++------------ 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 3770c41..4099049 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -12,6 +12,27 @@ TMPDIR = data_filename('tmp') PRETOKENIZE_TWITTER = False +def add_dep(lines, rule, input, output, extra=None, params=None): + if isinstance(output, list): + output = ' '.join(output) + if isinstance(input, list): + input = ' '.join(input) + if extra: + if isinstance(extra, list): + extra = ' '.join(extra) + extrastr = ' | ' + extra + else: + extrastr = '' + build_rule = "build {output}: {rule} {input}{extra}".format( + 
output=output, rule=rule, input=input, extra=extrastr + ) + lines.append(build_rule) + if params: + for key, val in params.items(): + lines.append(" {key} = {val}".format(locals())) + lines.append("") + + def make_ninja_deps(rules_filename, out=sys.stdout): """ Output a complete Ninja file describing how to build the wordfreq data. @@ -55,24 +76,13 @@ def wikipedia_deps(dirname_in, languages): input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - output_file = wordlist_filename('wikipedia', language, '') - build_rule = "build {outs}: wiki2text {ins}".format( - outs=output_file, ins=input_file - ) - lines.append(build_rule) - output_file = wordlist_filename('wikipedia', language, '.tokens') - build_rule = "build {outs}: wiki2tokens {ins}".format( - outs=output_file, ins=input_file - ) - lines.append(build_rule) + raw_file = wordlist_filename('wikipedia', language, '') + token_file = wordlist_filename('wikipedia', language, '.tokens') + count_file = wordlist_filename('wikipedia', language, '.counts') - token_file = output_file - output_file = wordlist_filename('wikipedia', language, '.counts') - build_rule = "build {outs}: count {ins}".format( - outs=output_file, ins=token_file - ) - lines.append(build_rule) - lines.append(" tmp = {}".format(TMPDIR)) + add_dep(lines, 'wiki2text', input_file, raw_file) + add_dep(lines, 'wiki2tokens', input_file, token_file) + add_dep(lines, 'count', token_file, count_file) return lines @@ -83,13 +93,10 @@ def twitter_preprocess_deps(input_filename, slice_prefix, slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) for num in range(slices)] # split the input into slices - build_rule = "build {outs}: split {ins}".format( - outs=' '.join(slice_files), ins=input_filename - ) - lines.append(build_rule) - lines.append(" prefix = {}.part".format(slice_prefix)) - lines.append(" slices = {}".format(slices)) - lines.append("") + add_dep(lines, + 'split', input_filename, slice_files, + {'prefix': '{}.part'.format(slice_prefix), + 'slices': slices}) for slicenum in range(slices): slice_file = slice_files[slicenum] @@ -97,12 +104,8 @@ def twitter_preprocess_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in languages ] - build_rule = "build {outs}: tokenize_twitter {ins}".format( - outs=' '.join(language_outputs), ins=slice_file - ) - lines.append(build_rule) - lines.append(" prefix = {}".format(slice_file)) - lines.append("") + add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, + {'prefix': slice_file}) for language in languages: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) @@ -111,32 +114,20 @@ def twitter_preprocess_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) ] - build_rule = "build {outs}: cat {ins}".format( - outs=combined_output, - ins=' '.join(language_inputs) - ) - lines.append(build_rule) + add_dep(lines, 'cat', language_inputs, combined_output) def twitter_deps(prefix_in, languages): lines = [] for language in languages: input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) - output_file = wordlist_filename('twitter', language, '.tokens') - build_rule = "build {outs}: format_twitter {ins} | {deps}".format( - outs=output_file, - ins=input_file, - deps='wordfreq_builder/tokenizers.py' - ) - lines.append(build_rule) + token_file = wordlist_filename('twitter', language, '.tokens') + add_dep(lines, + 
'format_twitter', input_file, token_file, + extra='wordfreq_builder/tokenizers.py') - token_file = output_file - output_file = wordlist_filename('twitter', language, '.counts') - build_rule = "build {outs}: count {ins}".format( - outs=output_file, ins=token_file - ) - lines.append(build_rule) - lines.append(" tmp = {}".format(TMPDIR)) + count_file = wordlist_filename('twitter', language, '.counts') + add_dep(lines, 'count', token_file, count_file) return lines From 02d8b321195e0cbf00b5da00cdacb35a5cf043d3 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 17:07:33 -0400 Subject: [PATCH 13/16] process leeds and opensubtitles --- wordfreq_builder/wordfreq_builder/ninja.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 4099049..c1e0560 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -65,6 +65,19 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['wikipedia'] ) ) + lines.extend( + leeds_deps( + data_filename('source-lists/leeds'), + CONFIG['sources']['leeds'] + ) + ) + lines.extend( + opensubtitles_deps( + data_filename('source-lists/opensubtitles'), + CONFIG['sources']['opensubtitles'] + ) + ) + print('\n'.join(lines), file=out) @@ -115,6 +128,7 @@ def twitter_preprocess_deps(input_filename, slice_prefix, for slicenum in range(slices) ] add_dep(lines, 'cat', language_inputs, combined_output) + return lines def twitter_deps(prefix_in, languages): @@ -132,6 +146,30 @@ def twitter_deps(prefix_in, languages): return lines +def leeds_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/internet-{lang}-forms.num'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('leeds', language, '.counts') + add_dep(lines, 'convert_leeds', input_file, reformatted_file) + + return lines + + +def opensubtitles_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('opensubtitles', language, '.counts') + add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) + + return lines + + def main(): make_ninja_deps('rules.ninja') From abb0e059c83f0842a0549314560ce5b7f2f8986d Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 19:38:33 -0400 Subject: [PATCH 14/16] a reasonably complete build process --- wordfreq_builder/rules.ninja | 6 +++ wordfreq_builder/setup.py | 1 + .../wordfreq_builder/cli/combine_lists.py | 2 +- .../wordfreq_builder/cli/freqs_to_dB.py | 11 ++++++ wordfreq_builder/wordfreq_builder/config.py | 19 +++++++--- wordfreq_builder/wordfreq_builder/ninja.py | 37 +++++++++++++++---- .../wordfreq_builder/word_counts.py | 25 ++++++++++++- 7 files changed, 85 insertions(+), 16 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 6b6c018..12c0360 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -53,5 +53,11 @@ rule convert_opensubtitles rule count command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out +rule merge + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + +rule freqs2dB + command = mkdir -p $$(dirname $out) && python -m 
wordfreq_builder.cli.freqs_to_dB $in $out + rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 88b6d49..1998708 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,6 +9,7 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], + install_requires=['msgpack'], entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py index 61d9674..61d4b1d 100644 --- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -5,7 +5,7 @@ import argparse def merge_lists(input_names, output_name): freq_dicts = [] for input_name in input_names: - freq_dicts.append(read_freqs(input_name)) + freq_dicts.append(read_freqs(input_name, cutoff=2)) merged = merge_freqs(freq_dicts) write_wordlist(merged, output_name) diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py new file mode 100644 index 0000000..81a4dde --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py @@ -0,0 +1,11 @@ +from wordfreq_builder.word_counts import freqs_to_dBpack +import argparse + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + freqs_to_dBpack(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index ec62634..dafd1c0 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -29,10 +29,11 @@ CONFIG = { ] }, 'wordlist_paths': { - 'twitter': 'generated/twitter/tweets-2014.{lang}{ext}.txt', - 'wikipedia': 'generated/wikipedia/wikipedia_{lang}{ext}.txt', - 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}{ext}.txt', - 'leeds': 'generated/leeds/leeds_internet_{lang}{ext}.txt' + 'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}', + 'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}', + 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', + 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', + 'combined': 'generated/combined/combined_{lang}.{ext}' } } @@ -41,7 +42,7 @@ def data_filename(filename): return os.path.join(CONFIG['data_dir'], filename) -def wordlist_filename(source, language, extension=''): +def wordlist_filename(source, language, extension='txt'): path = CONFIG['wordlist_paths'][source].format( lang=language, ext=extension ) @@ -54,3 +55,11 @@ def source_names(language): """ return sorted([key for key in CONFIG['sources'] if language in CONFIG['sources'][key]]) + + +def all_languages(): + languages = set() + for langlist in CONFIG['sources'].values(): + languages |= set(langlist) + return sorted(languages) + diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index c1e0560..1059ba3 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,4 +1,6 @@ -from wordfreq_builder.config import CONFIG, data_filename, wordlist_filename +from wordfreq_builder.config import ( + CONFIG, data_filename, 
wordlist_filename, all_languages, source_names +) import sys import pathlib @@ -77,6 +79,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['opensubtitles'] ) ) + lines.extend(combine_lists(all_languages())) print('\n'.join(lines), file=out) @@ -89,9 +92,9 @@ def wikipedia_deps(dirname_in, languages): input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - raw_file = wordlist_filename('wikipedia', language, '') - token_file = wordlist_filename('wikipedia', language, '.tokens') - count_file = wordlist_filename('wikipedia', language, '.counts') + raw_file = wordlist_filename('wikipedia', language, 'txt') + token_file = wordlist_filename('wikipedia', language, 'tokens.txt') + count_file = wordlist_filename('wikipedia', language, 'counts.txt') add_dep(lines, 'wiki2text', input_file, raw_file) add_dep(lines, 'wiki2tokens', input_file, token_file) @@ -135,12 +138,12 @@ def twitter_deps(prefix_in, languages): lines = [] for language in languages: input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) - token_file = wordlist_filename('twitter', language, '.tokens') + token_file = wordlist_filename('twitter', language, 'tokens.txt') add_dep(lines, 'format_twitter', input_file, token_file, extra='wordfreq_builder/tokenizers.py') - count_file = wordlist_filename('twitter', language, '.counts') + count_file = wordlist_filename('twitter', language, 'counts.txt') add_dep(lines, 'count', token_file, count_file) return lines @@ -152,7 +155,7 @@ def leeds_deps(dirname_in, languages): input_file = '{prefix}/internet-{lang}-forms.num'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('leeds', language, '.counts') + reformatted_file = wordlist_filename('leeds', language, 'counts.txt') add_dep(lines, 'convert_leeds', input_file, reformatted_file) return lines @@ -164,12 +167,30 @@ def opensubtitles_deps(dirname_in, languages): input_file = '{prefix}/{lang}.txt'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('opensubtitles', language, '.counts') + reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines +def combine_lists(languages): + lines = [] + for language in languages: + sources = source_names(language) + input_files = [ + wordlist_filename(source, language, 'counts.txt') + for source in sources + ] + output_file = wordlist_filename('combined', language) + add_dep(lines, 'merge', input_files, output_file, + extra='wordfreq_builder/word_counts.py') + + output_dBpack = wordlist_filename('combined', language, 'msgpack.gz') + add_dep(lines, 'freqs2dB', output_file, output_dBpack, + extra='wordfreq_builder/word_counts.py') + return lines + + def main(): make_ninja_deps('rules.ninja') diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index b150ddd..be49288 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -2,7 +2,10 @@ from wordfreq_builder.tokenizers import retokenize from collections import defaultdict from operator import itemgetter from ftfy import fix_text +import math import csv +import msgpack +import gzip def count_tokens(filename): @@ -14,7 +17,7 @@ def count_tokens(filename): return counts -def read_freqs(filename, cutoff=2): +def read_freqs(filename, cutoff=0): raw_counts = defaultdict(float) total = 0. 
with open(filename, encoding='utf-8', newline='') as infile: @@ -35,6 +38,22 @@ def read_freqs(filename, cutoff=2): return freqs +def freqs_to_dBpack(in_filename, out_filename, cutoff=-60): + freq_cutoff = 10 ** (cutoff / 10.) + freqs = read_freqs(in_filename, freq_cutoff) + dBpack = [] + for token, freq in freqs.items(): + dB = round(math.log10(freq) * 10) + if dB >= cutoff: + neg_dB = -dB + while neg_dB >= len(dBpack): + dBpack.append([]) + dBpack[neg_dB].append(token) + + with gzip.open(out_filename, 'wb') as outfile: + msgpack.dump(dBpack, outfile) + + def merge_freqs(freq_dicts): vocab = set() for freq_dict in freq_dicts: @@ -51,7 +70,7 @@ def merge_freqs(freq_dicts): return merged -def write_wordlist(freqs, filename): +def write_wordlist(freqs, filename, cutoff=1e-8): """ Write a dictionary of either raw counts or frequencies to a file of comma-separated values. @@ -60,5 +79,7 @@ def write_wordlist(freqs, filename): writer = csv.writer(outfile) items = sorted(freqs.items(), key=itemgetter(1), reverse=True) for word, freq in items: + if freq < cutoff: + break if not ('"' in word or ',' in word): writer.writerow([word, str(freq)]) From 1b7a2b9d0bcff1c79bd1f8a534528aaf3407fdaa Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 23:55:57 -0400 Subject: [PATCH 15/16] fix dependency --- wordfreq_builder/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 1998708..1466d35 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,7 +9,7 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], - install_requires=['msgpack'], + install_requires=['msgpack-python'], entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', From 2f14417bcf21e74dfacf0cf637ff699b09fe83f5 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 23:59:04 -0400 Subject: [PATCH 16/16] limit final builds to languages with >= 2 sources --- wordfreq_builder/wordfreq_builder/config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index dafd1c0..b6af74d 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -10,7 +10,8 @@ CONFIG = { # look up in filenames for these various data sources. 'twitter': [ 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', - 'pt', 'ru' + 'pt', 'ru', + # can be added later: 'th', 'tr' ], 'wikipedia': [ 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', @@ -34,7 +35,8 @@ CONFIG = { 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', 'combined': 'generated/combined/combined_{lang}.{ext}' - } + }, + 'min_sources': 2 } @@ -61,5 +63,7 @@ def all_languages(): languages = set() for langlist in CONFIG['sources'].values(): languages |= set(langlist) - return sorted(languages) + return [lang for lang in sorted(languages) + if len(source_names(lang)) + >= CONFIG['min_sources']]
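
Note on the output format written by these patches: freqs_to_dBpack (PATCH 14)
stores each combined wordlist as a gzipped msgpack list indexed by -dB, so the
tokens at index i all have a frequency of roughly 10 ** (-i / 10). The sketch
below shows how such a file could be read back; it is illustrative only and is
not part of the patches above. The load_dBpack/word_frequency names and the
combined_en path are assumptions, and the keyword for decoding strings depends
on the msgpack release (raw=False on newer versions, encoding='utf-8' on the
older msgpack-python pinned in setup.py).

    import gzip
    import msgpack


    def load_dBpack(filename):
        # Illustrative reader for the gzipped msgpack structure produced by
        # freqs_to_dBpack(): a list whose index is the negated dB value.
        with gzip.open(filename, 'rb') as infile:
            # raw=False decodes tokens as UTF-8 strings (msgpack >= 0.5);
            # older releases take encoding='utf-8' instead.
            return msgpack.unpackb(infile.read(), raw=False)


    def word_frequency(dBpack, word):
        # A token stored at index neg_dB was rounded to -neg_dB dB, i.e. a
        # frequency of about 10 ** (-neg_dB / 10).
        for neg_dB, tokens in enumerate(dBpack):
            if word in tokens:
                return 10 ** (-neg_dB / 10)
        return 0.


    if __name__ == '__main__':
        # Path follows the 'combined' entry in CONFIG['wordlist_paths'].
        dBpack = load_dBpack('./data/generated/combined/combined_en.msgpack.gz')
        print(word_frequency(dBpack, 'the'))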