Tokenize Japanese consistently with MeCab

Author: Rob Speer
Date:   2015-05-27 17:44:58 -04:00
parent 536c15fbdb
commit cbe3513e08
3 changed files with 15 additions and 11 deletions


@@ -48,7 +48,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
     if PRETOKENIZE_TWITTER:
         lines.extend(
             twitter_preprocess_deps(
@@ -101,16 +101,15 @@ def wikipedia_deps(dirname_in, languages):
         input_file = max(path_in.glob(
             '{}wiki*.bz2'.format(language)
         ))
-        raw_file = wordlist_filename('wikipedia', language, 'txt')
         token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
-        add_dep(lines, 'wiki2text', input_file, raw_file)
+        add_dep(lines, 'wiki2tokens', input_file, token_file)
         if language == 'ja':
-            add_dep(lines, 'tokenize_japanese', raw_file, token_file)
-            add_dep(lines, 'count', token_file, count_file)
+            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'wiki2tokens', input_file, token_file)
             add_dep(lines, 'count', token_file, count_file)
     return lines
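
The 'tokenize_japanese' rule body is not shown in this diff. A minimal sketch of the re-tokenization step it presumably performs, using the MeCab Python binding's wakati (space-separated) output; the function name and sample text are illustrative only:

    # Sketch only: re-split regex-tokenized Japanese text with MeCab.
    # The real build rule is defined elsewhere in the repo.
    import MeCab

    tagger = MeCab.Tagger('-Owakati')  # wakati-gaki: emit space-separated tokens

    def retokenize_japanese(line):
        # MeCab returns one space-separated string; split it into tokens
        return tagger.parse(line.strip()).split()

    print(retokenize_japanese('日本語のテキストを分かち書きする'))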
@@ -174,7 +173,12 @@ def twitter_deps(prefix_in, languages):
                 extra='wordfreq_builder/tokenizers.py')
         count_file = wordlist_filename('twitter', language, 'counts.txt')
-        add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        if language == 'ja':
+            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        else:
+            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
     return lines
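
The Wikipedia and Twitter hunks now branch the same way for Japanese. A small illustrative sketch of the shared pattern; chain() and the file paths are made up for this example, while the real code emits ninja dependencies through add_dep() and wordlist_filename():

    # Illustrative only: the per-language dependency chain built by the two
    # hunks above. Japanese gets an extra MeCab pass before counting.

    def chain(corpus, language):
        token_file = '{}/{}.tokens.txt'.format(corpus, language)
        count_file = '{}/{}.counts.txt'.format(corpus, language)
        steps = []
        if language == 'ja':
            mecab_token_file = '{}/{}.mecab-tokens.txt'.format(corpus, language)
            steps.append(('tokenize_japanese', token_file, mecab_token_file))
            steps.append(('count', mecab_token_file, count_file))
        else:
            steps.append(('count', token_file, count_file))
        return steps

    print(chain('twitter', 'ja'))
    print(chain('wikipedia', 'en'))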


@@ -1,6 +1,6 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
 from html.entities import name2codepoint
-from wordfreq import tokenize, TOKEN_RE
+from wordfreq import TOKEN_RE
 import re


@@ -1,4 +1,4 @@
-from wordfreq import tokenize
+from wordfreq import simple_tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -16,7 +16,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8') as infile:
         for line in infile:
-            for token in tokenize(line.strip()):
+            for token in simple_tokenize(line.strip()):
                 counts[token] += 1
     return counts
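
count_tokens now runs wordfreq's simple_tokenize on each line. A self-contained check of the counting pattern, with an in-memory list standing in for the file; the sample text is made up, and the exact normalization simple_tokenize applies depends on the wordfreq version in use:

    # Standalone check of the counting loop above, no file needed.
    from collections import defaultdict
    from wordfreq import simple_tokenize

    sample_lines = ['This is a test', 'this is only a test']  # invented sample
    counts = defaultdict(int)
    for line in sample_lines:
        for token in simple_tokenize(line.strip()):
            counts[token] += 1
    print(dict(counts))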
@@ -33,7 +33,7 @@ def read_freqs(filename, cutoff=0):
             val = float(strval)
             if val < cutoff:
                 break
-            for token in tokenize(key):
+            for token in simple_tokenize(key):
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
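
The truncated comment presumably refers to summing values when the same token appears more than once, for example in concatenated count files. A tiny illustration of why += is used instead of plain assignment; the rows and the lower() normalization are stand-ins, not the builder's real logic:

    # If the same token appears twice, '+=' adds the contributions together
    # instead of overwriting the earlier value.
    from collections import defaultdict

    rows = [('test', 3.0), ('Test', 2.0)]  # invented; both normalize to 'test'
    raw_counts = defaultdict(float)
    for key, val in rows:
        token = key.lower()                # stand-in for the real normalization
        raw_counts[token] += val
    print(raw_counts['test'])              # 5.0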