remove wiki2tokens and tokenize_wikipedia

These components are no longer necessary. Wikipedia output can and should be tokenized with the standard tokenizer, instead of the almost-equivalent one in the Nim code.
2024-12-25 10:15:23 +00:00 · 2015-06-30 15:28:01 -04:00 · 2015-06-30 15:28:01 -04:00 · 4771c12814
commit 4771c12814
parent 9a2855394d
4 changed files with 4 additions and 45 deletions
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -31,16 +31,6 @@ rule split
 rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
 # The wiki2tokens rule is the same as the wiki2text rule, but uses the -t
 # flag to tell the Nim code to output one token per line (according to its
 # language-agnostic tokenizer, which splits on punctuation and whitespace in
 # basically the same way as wordfreq).
 #
 # The fact that this uses a language-agnostic tokenizer means it should not
 # be applied to Chinese or Japanese.
 rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@@ -14,7 +14,6 @@ setup(
        'console_scripts': [
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
@@ -1,30 +0,0 @@
 from wordfreq_builder.tokenizers import cld2_surface_tokenizer, monolingual_tokenize_file
 import argparse
 def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    """
    Tokenize one file of Wikipedia text by delegating to
    `monolingual_tokenize_file`.

    `in_filename` and `out_filename` are passed through positionally.
    `language` is forwarded as `language`, and `proportion` is forwarded
    as `sample_proportion`. The tokenizer is always
    `cld2_surface_tokenizer`, and each line is read through
    `strip_headings`.
    """
    # Settings that do not depend on the caller are gathered in one place.
    options = dict(
        language=language,
        tokenizer=cld2_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion,
    )
    monolingual_tokenize_file(in_filename, out_filename, **options)
 def strip_headings(text):
    """
    Return `text` without surrounding whitespace and without any leading
    or trailing '=' characters.

    Whitespace is removed first and the '=' characters second, so spaces
    that sat between the '=' markers and the heading text are kept:
    '== Title ==' becomes ' Title '.
    """
    trimmed = text.strip()
    return trimmed.strip('=')
 def main():
    """
    Command-line entry point: parse the arguments and pass them on to
    `tokenize_wikipedia`.

    Takes three positional arguments (input filename, output filename,
    language code) and an optional integer `-p` / `--proportion`, which
    defaults to 100.
    """
    cli = argparse.ArgumentParser()
    positional_args = (
        ('in_filename', 'filename of input file'),
        ('out_filename', 'filename of output file'),
        ('language', 'the language code of the text'),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)
    cli.add_argument(
        '-p', '--proportion',
        type=int, default=100,
        help='process 1/n of the lines (default 100)'
    )
    options = cli.parse_args()
    tokenize_wikipedia(
        options.in_filename, options.out_filename,
        options.language, options.proportion
    )
 # Run the command-line interface only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == '__main__':
    main()
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -96,16 +96,16 @@ def wikipedia_deps(dirname_in, languages):
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
-        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
+        plain_text_file = wordlist_filename('wikipedia', language, 'txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')
-        add_dep(lines, 'wiki2tokens', input_file, token_file)
+        add_dep(lines, 'wiki2text', input_file, plain_text_file)
        if language == 'ja':
            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file)
        else:
-            add_dep(lines, 'count', token_file, count_file)
+            add_dep(lines, 'count', plain_text_file, count_file)
    return lines