Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-25 10:15:23 +00:00)
Tokenize Japanese consistently with MeCab
commit cbe3513e08 (parent 536c15fbdb)
@@ -48,7 +48,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
 
     if PRETOKENIZE_TWITTER:
         lines.extend(
             twitter_preprocess_deps(
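
Throughout these hunks, dependencies are declared with calls shaped like add_dep(lines, rule, input, output, extra=...). The helper itself is not shown in the diff; as a rough sketch only (not the project's actual implementation in wordfreq_builder/ninja.py), a function with that call shape could append one ninja "build" statement per dependency:

    def add_dep(lines, rule, input, output, extra=None):
        # Hypothetical sketch -- the real helper may take more parameters.
        # The "extra" file goes after "|" as an implicit dependency, so editing
        # it (e.g. the builder's own source file) also triggers a rebuild.
        implicit = ' | {}'.format(extra) if extra else ''
        lines.append('build {}: {} {}{}'.format(output, rule, input, implicit))
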
@@ -101,16 +101,15 @@ def wikipedia_deps(dirname_in, languages):
         input_file = max(path_in.glob(
             '{}wiki*.bz2'.format(language)
         ))
-        raw_file = wordlist_filename('wikipedia', language, 'txt')
         token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
 
-        add_dep(lines, 'wiki2text', input_file, raw_file)
+        add_dep(lines, 'wiki2tokens', input_file, token_file)
         if language == 'ja':
-            add_dep(lines, 'tokenize_japanese', raw_file, token_file)
-            add_dep(lines, 'count', token_file, count_file)
+            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'wiki2tokens', input_file, token_file)
             add_dep(lines, 'count', token_file, count_file)
 
     return lines
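
For Wikipedia, Japanese now goes through the same wiki2tokens step as every other language, and the resulting tokens.txt is then re-segmented into mecab-tokens.txt before counting. The commit only rewires the build dependencies; the command behind the 'tokenize_japanese' rule is not shown in this diff. As a hedged sketch of the kind of step it represents (not the actual rules.ninja command), MeCab's wakati mode re-emits each line with word boundaries marked by spaces:

    import subprocess

    def tokenize_japanese_file(in_path, out_path):
        # Illustrative sketch only: pipe the pre-tokenized text through MeCab
        # in "wakati" (word-splitting) mode, writing one space-separated line
        # of tokens per input line.
        with open(in_path, encoding='utf-8') as infile, \
             open(out_path, 'w', encoding='utf-8') as outfile:
            subprocess.run(['mecab', '-O', 'wakati'],
                           stdin=infile, stdout=outfile, check=True)
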
@@ -174,7 +173,12 @@ def twitter_deps(prefix_in, languages):
                 extra='wordfreq_builder/tokenizers.py')
 
         count_file = wordlist_filename('twitter', language, 'counts.txt')
-        add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        if language == 'ja':
+            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
+            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
+            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
+        else:
+            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
 
     return lines
 
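
The Twitter pipeline gains the same branch: Japanese token files are run through MeCab into a mecab-tokens.txt file, and counts are taken from that output instead. Since wikipedia_deps and twitter_deps now duplicate this logic, a natural follow-up (not part of this commit) would be a shared helper along these lines, reusing the module's own wordlist_filename and add_dep:

    def add_count_deps(lines, source, language, token_file, count_file, extra=None):
        # Sketch of a possible shared helper: for Japanese, insert a MeCab
        # re-tokenization step and count its output; otherwise count directly.
        if language == 'ja':
            mecab_token_file = wordlist_filename(source, language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
            token_file = mecab_token_file
        add_dep(lines, 'count', token_file, count_file, extra=extra)
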
@@ -1,6 +1,6 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
 from html.entities import name2codepoint
-from wordfreq import tokenize, TOKEN_RE
+from wordfreq import TOKEN_RE
 import re
 
 
@@ -1,4 +1,4 @@
-from wordfreq import tokenize
+from wordfreq import simple_tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -16,7 +16,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8') as infile:
         for line in infile:
-            for token in tokenize(line.strip()):
+            for token in simple_tokenize(line.strip()):
                 counts[token] += 1
     return counts
 
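
count_tokens switches from tokenize to simple_tokenize. The builder can afford the simpler function because language-specific segmentation now happens earlier in the pipeline: by the time a Japanese token file reaches count_tokens, MeCab has already inserted the word boundaries. As a rough approximation for illustration (the real simple_tokenize in wordfreq is built on TOKEN_RE and is more careful than this), a plain regex tokenizer behaves like:

    import re

    APPROX_TOKEN_RE = re.compile(r"\w+")

    def approx_simple_tokenize(text):
        # Approximation only: split on runs of word characters and casefold.
        # An unsegmented Japanese sentence would come out as one long "token",
        # which is exactly why MeCab has to run first.
        return [token.casefold() for token in APPROX_TOKEN_RE.findall(text)]
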
@@ -33,7 +33,7 @@ def read_freqs(filename, cutoff=0):
             val = float(strval)
             if val < cutoff:
                 break
-            for token in tokenize(key):
+            for token in simple_tokenize(key):
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
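
read_freqs makes the same substitution. The surrounding code (the cutoff break and the trailing comment) implies the input is sorted by descending value and that several frequency files may be concatenated, so repeated tokens are merged by summing. A minimal sketch of that accumulation pattern, assuming a token-plus-value line format that the hunk itself does not show:

    from collections import defaultdict

    freqs = defaultdict(float)
    for key, val in [('の', 0.05), ('に', 0.03), ('の', 0.01)]:
        # += merges duplicates, so concatenated files simply add their values.
        freqs[key] += val
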