From cbe3513e08445fc1a848a733867ded34c86aff83 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 27 May 2015 17:44:58 -0400
Subject: [PATCH] Tokenize Japanese consistently with MeCab

---
 wordfreq_builder/wordfreq_builder/ninja.py       | 18 +++++++++++-------
 .../wordfreq_builder/tokenizers.py               |  2 +-
 .../wordfreq_builder/word_counts.py              |  6 +++---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index e5c5e70..fabcb95 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -48,7 +48,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
-    
+
     if PRETOKENIZE_TWITTER:
         lines.extend(
             twitter_preprocess_deps(
@@ -101,16 +101,15 @@ def wikipedia_deps(dirname_in, languages):
         input_file = max(path_in.glob(
             '{}wiki*.bz2'.format(language)
         ))
-        raw_file = wordlist_filename('wikipedia', language, 'txt')
         token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
 
-        add_dep(lines, 'wiki2text', input_file, raw_file)
+        add_dep(lines, 'wiki2tokens', input_file, token_file)
         if language == 'ja':
-            add_dep(lines, 'tokenize_japanese', raw_file, token_file)
-            add_dep(lines, 'count', token_file, count_file)
+            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'wiki2tokens', input_file, token_file)
             add_dep(lines, 'count', token_file, count_file)
 
     return lines
@@ -174,7 +173,12 @@ def twitter_deps(prefix_in, languages):
                 extra='wordfreq_builder/tokenizers.py')
 
         count_file = wordlist_filename('twitter', language, 'counts.txt')
-        add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        if language == 'ja':
+            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        else:
+            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
 
     return lines
 
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index e52234c..25bb8ea 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -1,6 +1,6 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
 from html.entities import name2codepoint
-from wordfreq import tokenize, TOKEN_RE
+from wordfreq import TOKEN_RE
 import re
 
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 243d353..2e54d82 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -1,4 +1,4 @@
-from wordfreq import tokenize
+from wordfreq import simple_tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -16,7 +16,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8') as infile:
         for line in infile:
-            for token in tokenize(line.strip()):
+            for token in simple_tokenize(line.strip()):
                 counts[token] += 1
     return counts
 
@@ -33,7 +33,7 @@ def read_freqs(filename, cutoff=0):
             val = float(strval)
             if val < cutoff:
                 break
-            for token in tokenize(key):
+            for token in simple_tokenize(key):
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
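
Note on the new 'tokenize_japanese' step: the rule itself lives in
rules.ninja, which this patch does not touch, so the sketch below is an
assumption about what the step does rather than the repository's actual
command. Conceptually, it re-segments each generic tokens.txt file with
MeCab so that Japanese counts use MeCab's word boundaries, producing the
mecab-tokens.txt file that the 'count' step now consumes:

    # Illustrative sketch only: re-tokenize a tokens.txt file with MeCab.
    # Assumes the `mecab` binary is installed; the real build step is a
    # rules.ninja command, not this function.
    import subprocess

    def mecab_retokenize(input_path, output_path):
        with open(input_path, encoding='utf-8') as infile:
            # '-O wakati' makes MeCab emit space-separated tokens,
            # one output line per input line.
            result = subprocess.run(
                ['mecab', '-O', 'wakati'],
                input=infile.read(), stdout=subprocess.PIPE,
                universal_newlines=True, check=True
            )
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(result.stdout)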
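Note on the tokenize -> simple_tokenize switch in word_counts.py: with
MeCab handling Japanese segmentation once in the build graph, the
counting stage presumably needs only a plain regex split, applied
uniformly to every language. A minimal sketch of what such a tokenizer
looks like, assuming wordfreq's exported TOKEN_RE; wordfreq's actual
simple_tokenize may normalize text differently:

    # Hypothetical minimal stand-in for wordfreq.simple_tokenize:
    # find TOKEN_RE matches and case-fold them, with no
    # language-specific segmentation.
    from wordfreq import TOKEN_RE

    def simple_tokenize_sketch(text):
        return [token.casefold() for token in TOKEN_RE.findall(text)]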