From deed2f767c3fcbd4eae9648e29f14b799ef69b77 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 30 Jun 2015 15:28:01 -0400
Subject: [PATCH 1/2] remove wiki2tokens and tokenize_wikipedia

These components are no longer necessary. Wikipedia output can and should
be tokenized with the standard tokenizer, instead of the almost-equivalent
one in the Nim code.
---
 wordfreq_builder/rules.ninja                         | 10 -------
 wordfreq_builder/setup.py                            |  1 -
 .../cli/tokenize_wikipedia.py                        | 30 -------------------
 wordfreq_builder/wordfreq_builder/ninja.py           |  8 ++---
 4 files changed, 4 insertions(+), 45 deletions(-)
 delete mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 4be9f25..d693f52 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -31,16 +31,6 @@ rule split
 rule wiki2text
   command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
 
-# The wiki2tokens rule is the same as the wiki2text rule, but uses the -t
-# flag to tell the Nim code to output one token per line (according to its
-# language-agnostic tokenizer, which splits on punctuation and whitespace in
-# basically the same way as wordfreq).
-#
-# The fact that this uses a language-agnostic tokenizer means it should not
-# be applied to Chinese or Japanese.
-rule wiki2tokens
-  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
-
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py
index a6436ad..c7232cc 100755
--- a/wordfreq_builder/setup.py
+++ b/wordfreq_builder/setup.py
@@ -14,7 +14,6 @@ setup(
         'console_scripts': [
             'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
             'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
-            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
             'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
         ]
     }
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
deleted file mode 100644
index 95166a7..0000000
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, monolingual_tokenize_file
-import argparse
-
-
-def tokenize_wikipedia(in_filename, out_filename, language, proportion):
-    monolingual_tokenize_file(
-        in_filename, out_filename,
-        language=language,
-        tokenizer=cld2_surface_tokenizer,
-        line_reader=strip_headings,
-        sample_proportion=proportion
-    )
-
-
-def strip_headings(text):
-    return text.strip().strip('=')
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('in_filename', help='filename of input file')
-    parser.add_argument('out_filename', help='filename of output file')
-    parser.add_argument('language', help='the language code of the text')
-    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
-    args = parser.parse_args()
-    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 04d3df3..90130b3 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -96,16 +96,16 @@ def wikipedia_deps(dirname_in, languages):
         input_file = max(path_in.glob(
             '{}wiki*.bz2'.format(language)
         ))
-        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
+        plain_text_file = wordlist_filename('wikipedia', language, 'txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
 
-        add_dep(lines, 'wiki2tokens', input_file, token_file)
+        add_dep(lines, 'wiki2text', input_file, plain_text_file)
         if language == 'ja':
             mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
             add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'count', token_file, count_file)
+            add_dep(lines, 'count', plain_text_file, count_file)
 
     return lines
 

From 58c8bda21b222227ec7f0c908cb0a7c6b8cefe60 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 30 Jun 2015 17:05:40 -0400
Subject: [PATCH 2/2] cope with occasional Unicode errors in the input

---
 wordfreq_builder/wordfreq_builder/word_counts.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index cc4c3a5..8f4099c 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -12,9 +12,11 @@ def count_tokens(filename):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
+
+    Unicode errors in the input data will become token boundaries.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8') as infile:
+    with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
             for token in simple_tokenize(line.strip()):
                 counts[token] += 1
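
Side note on the second patch (not part of the patch itself): a minimal sketch of why reading with errors='replace' makes Unicode errors act as token boundaries. The regex split below is only a stand-in for wordfreq's simple_tokenize, which is not shown here, and the temporary file exists only for the demonstration.

import re
import tempfile

# Create a file containing a byte (0xff) that is not valid UTF-8.
with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as f:
    f.write(b'hello\xffworld\n')
    path = f.name

# With errors='replace', the undecodable byte becomes U+FFFD instead of
# raising UnicodeDecodeError partway through the file.
with open(path, encoding='utf-8', errors='replace') as infile:
    line = infile.read()
print(repr(line))   # 'hello\ufffdworld\n'

# Stand-in tokenizer: split on runs of non-word characters. U+FFFD is not
# a word character, so the replacement character separates the two tokens,
# i.e. the Unicode error becomes a token boundary.
tokens = [t for t in re.split(r'\W+', line.strip()) if t]
print(tokens)       # ['hello', 'world']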