From deed2f767c3fcbd4eae9648e29f14b799ef69b77 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 30 Jun 2015 15:28:01 -0400
Subject: [PATCH 1/2] remove wiki2tokens and tokenize_wikipedia

These components are no longer necessary. Wikipedia output can and should
be tokenized with the standard tokenizer, instead of the almost-equivalent
one in the Nim code.
---
 wordfreq_builder/rules.ninja                         | 10 -------
 wordfreq_builder/setup.py                            |  1 -
 .../cli/tokenize_wikipedia.py                        | 30 -------------------
 wordfreq_builder/wordfreq_builder/ninja.py           |  8 ++---
 4 files changed, 4 insertions(+), 45 deletions(-)
 delete mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 4be9f25..d693f52 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -31,16 +31,6 @@ rule split
 rule wiki2text
   command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
 
-# The wiki2tokens rule is the same as the wiki2text rule, but uses the -t
-# flag to tell the Nim code to output one token per line (according to its
-# language-agnostic tokenizer, which splits on punctuation and whitespace in
-# basically the same way as wordfreq).
-#
-# The fact that this uses a language-agnostic tokenizer means it should not
-# be applied to Chinese or Japanese.
-rule wiki2tokens
-  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
-
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py
index a6436ad..c7232cc 100755
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@@ -14,7 +14,6 @@ setup(
         'console_scripts': [
             'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
             'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
-            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
             'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
         ]
     }
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
deleted file mode 100644
index 95166a7..0000000
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, monolingual_tokenize_file
-import argparse
-
-
-def tokenize_wikipedia(in_filename, out_filename, language, proportion):
-    monolingual_tokenize_file(
-        in_filename, out_filename,
-        language=language,
-        tokenizer=cld2_surface_tokenizer,
-        line_reader=strip_headings,
-        sample_proportion=proportion
-    )
-
-
-def strip_headings(text):
-    return text.strip().strip('=')
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('in_filename', help='filename of input file')
-    parser.add_argument('out_filename', help='filename of output file')
-    parser.add_argument('language', help='the language code of the text')
-    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
-    args = parser.parse_args()
-    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 04d3df3..90130b3 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -96,16 +96,16 @@ def wikipedia_deps(dirname_in, languages):
         input_file = max(path_in.glob(
             '{}wiki*.bz2'.format(language)
         ))
-        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
+        plain_text_file = wordlist_filename('wikipedia', language, 'txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
 
-        add_dep(lines, 'wiki2tokens', input_file, token_file)
+        add_dep(lines, 'wiki2text', input_file, plain_text_file)
         if language == 'ja':
             mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
             add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'count', token_file, count_file)
+            add_dep(lines, 'count', plain_text_file, count_file)
 
     return lines
 

From 58c8bda21b222227ec7f0c908cb0a7c6b8cefe60 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 30 Jun 2015 17:05:40 -0400
Subject: [PATCH 2/2] cope with occasional Unicode errors in the input

---
 wordfreq_builder/wordfreq_builder/word_counts.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index cc4c3a5..8f4099c 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -12,9 +12,11 @@ def count_tokens(filename):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
+
+    Unicode errors in the input data will become token boundaries.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8') as infile:
+    with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
             for token in simple_tokenize(line.strip()):
                 counts[token] += 1
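
Side note on the second patch (not part of the patch itself): a minimal sketch of why reading with errors='replace' makes Unicode errors act as token boundaries. The regex split below is only a stand-in for wordfreq's simple_tokenize, which is not shown here, and the temporary file exists only for the demonstration.

import re
import tempfile

# Create a file containing a byte (0xff) that is not valid UTF-8.
with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as f:
    f.write(b'hello\xffworld\n')
    path = f.name

# With errors='replace', the undecodable byte becomes U+FFFD instead of
# raising UnicodeDecodeError partway through the file.
with open(path, encoding='utf-8', errors='replace') as infile:
    line = infile.read()
print(repr(line))   # 'hello\ufffdworld\n'

# Stand-in tokenizer: split on runs of non-word characters. U+FFFD is not
# a word character, so the replacement character separates the two tokens,
# i.e. the Unicode error becomes a token boundary.
tokens = [t for t in re.split(r'\W+', line.strip()) if t]
print(tokens)       # ['hello', 'world']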