Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-25 10:15:23 +00:00)
Tokenize Japanese consistently with MeCab
parent 536c15fbdb
commit cbe3513e08
@@ -101,16 +101,15 @@ def wikipedia_deps(dirname_in, languages):
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        raw_file = wordlist_filename('wikipedia', language, 'txt')
        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, raw_file)
        if language == 'ja':
            add_dep(lines, 'tokenize_japanese', raw_file, token_file)
            add_dep(lines, 'count', token_file, count_file)
        else:
            add_dep(lines, 'wiki2tokens', input_file, token_file)
        if language == 'ja':
            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file)
        else:
            add_dep(lines, 'count', token_file, count_file)

    return lines
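For readers unfamiliar with the pipeline: the 'tokenize_japanese' rule wired up above re-tokenizes the Japanese token file with MeCab before counting, writing the result to mecab-tokens.txt. The project's actual rule is not shown in this diff; the following is only a minimal sketch of line-by-line MeCab segmentation using the mecab-python3 binding, with a stdin/stdout filter interface assumed rather than taken from the repository.

    # Hedged sketch only: NOT the project's tokenize_japanese implementation.
    import sys
    import MeCab

    def tokenize_japanese_line(tagger, line):
        # '-Owakati' makes MeCab emit surface forms separated by single spaces.
        return tagger.parse(line).strip().split()

    if __name__ == '__main__':
        tagger = MeCab.Tagger('-Owakati')
        for line in sys.stdin:
            print(' '.join(tokenize_japanese_line(tagger, line.strip())))

If the real rule is a simple filter like this, it would be run as something like "python tokenize_japanese.py < tokens.txt > mecab-tokens.txt" by the generated build file.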
@@ -174,6 +173,11 @@ def twitter_deps(prefix_in, languages):
                extra='wordfreq_builder/tokenizers.py')

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        if language == 'ja':
            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
        else:
            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')

    return lines
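A note on the helper called throughout these hunks: add_dep is not shown in this diff, but each call appears to append one Ninja build edge that connects an input file to an output file through a named rule, with extra= contributing an additional dependency. The sketch below is hypothetical and only mirrors the call pattern add_dep(lines, rule, input, output, extra=...) visible above; the real implementation in wordfreq_builder's ninja-generation code may differ.

    def add_dep(lines, rule, input_file, output_file, extra=None):
        # Emit one Ninja build edge: "build <output>: <rule> <input> [| <implicit deps>]".
        build_line = 'build {}: {} {}'.format(output_file, rule, input_file)
        if extra:
            # In Ninja syntax, files after '|' are implicit dependencies:
            # changing them triggers a rebuild, but they are not passed to
            # the rule's command as inputs.
            build_line += ' | {}'.format(extra)
        lines.append(build_line)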
@@ -1,6 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE
from wordfreq import TOKEN_RE
import re
@@ -1,4 +1,4 @@
from wordfreq import tokenize
from wordfreq import simple_tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@@ -16,7 +16,7 @@ def count_tokens(filename):
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize(line.strip()):
            for token in simple_tokenize(line.strip()):
                counts[token] += 1
    return counts
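With the import switched to simple_tokenize, count_tokens counts surface tokens without any language-specific analysis; Japanese segmentation now happens earlier in the build, in the MeCab step. For reference, this is the function as it reads after the change, with the two imports from the hunk above added so it is self-contained:

    from collections import defaultdict
    from wordfreq import simple_tokenize

    def count_tokens(filename):
        # Count how many times each token appears in a file of one text per line.
        counts = defaultdict(int)
        with open(filename, encoding='utf-8') as infile:
            for line in infile:
                for token in simple_tokenize(line.strip()):
                    counts[token] += 1
        return counts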
@@ -33,7 +33,7 @@ def read_freqs(filename, cutoff=0):
            val = float(strval)
            if val < cutoff:
                break
            for token in tokenize(key):
            for token in simple_tokenize(key):
                token = fix_text(token)
                total += val
                # Use += so that, if we give the reader concatenated files with
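read_freqs makes the same switch when re-tokenizing wordlist keys: values below the cutoff stop the read, each key is split with simple_tokenize, and fix_text normalizes each token before its value is accumulated. The hunk shows only a fragment, so the following is a hedged sketch of that loop; the file format (tab-separated key/value is assumed here), the return value, and any normalization the real function performs are not taken from this diff.

    from ftfy import fix_text
    from wordfreq import simple_tokenize

    def read_freqs_sketch(filename, cutoff=0):
        # Hedged sketch of the loop shown above, not the project's read_freqs.
        raw_counts = {}
        total = 0.0
        with open(filename, encoding='utf-8') as infile:
            for line in infile:
                key, strval = line.rstrip().rsplit('\t', 1)  # assumed format
                val = float(strval)
                if val < cutoff:
                    break  # assumes values are sorted in descending order
                for token in simple_tokenize(key):
                    token = fix_text(token)
                    total += val
                    # Use += so concatenated input files accumulate their counts.
                    raw_counts[token] = raw_counts.get(token, 0) + val
        return raw_counts, total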