Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-25 10:15:23 +00:00)
Tokenize Japanese consistently with MeCab
parent 536c15fbdb
commit cbe3513e08
@@ -101,16 +101,15 @@ def wikipedia_deps(dirname_in, languages):
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        raw_file = wordlist_filename('wikipedia', language, 'txt')
        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, raw_file)
        if language == 'ja':
            add_dep(lines, 'tokenize_japanese', raw_file, token_file)
            add_dep(lines, 'count', token_file, count_file)
        else:
            add_dep(lines, 'wiki2tokens', input_file, token_file)
        if language == 'ja':
            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file)
        else:
            add_dep(lines, 'count', token_file, count_file)

    return lines
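For readers unfamiliar with the pipeline: the 'tokenize_japanese' rule wired up above re-tokenizes the Japanese token file with MeCab before counting, writing the result to mecab-tokens.txt. The project's actual rule is not shown in this diff; the following is only a minimal sketch of line-by-line MeCab segmentation using the mecab-python3 binding, with a stdin/stdout filter interface assumed rather than taken from the repository.

    # Hedged sketch only: NOT the project's tokenize_japanese implementation.
    import sys
    import MeCab

    def tokenize_japanese_line(tagger, line):
        # '-Owakati' makes MeCab emit surface forms separated by single spaces.
        return tagger.parse(line).strip().split()

    if __name__ == '__main__':
        tagger = MeCab.Tagger('-Owakati')
        for line in sys.stdin:
            print(' '.join(tokenize_japanese_line(tagger, line.strip())))

If the real rule is a simple filter like this, it would be run as something like "python tokenize_japanese.py < tokens.txt > mecab-tokens.txt" by the generated build file.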
@@ -174,6 +173,11 @@ def twitter_deps(prefix_in, languages):
                extra='wordfreq_builder/tokenizers.py')

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        if language == 'ja':
            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
        else:
            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')

    return lines
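A note on the helper called throughout these hunks: add_dep is not shown in this diff, but each call appears to append one Ninja build edge that connects an input file to an output file through a named rule, with extra= contributing an additional dependency. The sketch below is hypothetical and only mirrors the call pattern add_dep(lines, rule, input, output, extra=...) visible above; the real implementation in wordfreq_builder's ninja-generation code may differ.

    def add_dep(lines, rule, input_file, output_file, extra=None):
        # Emit one Ninja build edge: "build <output>: <rule> <input> [| <implicit deps>]".
        build_line = 'build {}: {} {}'.format(output_file, rule, input_file)
        if extra:
            # In Ninja syntax, files after '|' are implicit dependencies:
            # changing them triggers a rebuild, but they are not passed to
            # the rule's command as inputs.
            build_line += ' | {}'.format(extra)
        lines.append(build_line)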
@@ -1,6 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE
from wordfreq import TOKEN_RE
import re
@@ -1,4 +1,4 @@
from wordfreq import tokenize
from wordfreq import simple_tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@@ -16,7 +16,7 @@ def count_tokens(filename):
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize(line.strip()):
            for token in simple_tokenize(line.strip()):
                counts[token] += 1
    return counts
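With the import switched to simple_tokenize, count_tokens counts surface tokens without any language-specific analysis; Japanese segmentation now happens earlier in the build, in the MeCab step. For reference, this is the function as it reads after the change, with the two imports from the hunk above added so it is self-contained:

    from collections import defaultdict
    from wordfreq import simple_tokenize

    def count_tokens(filename):
        # Count how many times each token appears in a file of one text per line.
        counts = defaultdict(int)
        with open(filename, encoding='utf-8') as infile:
            for line in infile:
                for token in simple_tokenize(line.strip()):
                    counts[token] += 1
        return counts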
@@ -33,7 +33,7 @@ def read_freqs(filename, cutoff=0):
            val = float(strval)
            if val < cutoff:
                break
            for token in tokenize(key):
            for token in simple_tokenize(key):
                token = fix_text(token)
                total += val
                # Use += so that, if we give the reader concatenated files with
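read_freqs makes the same switch when re-tokenizing wordlist keys: values below the cutoff stop the read, each key is split with simple_tokenize, and fix_text normalizes each token before its value is accumulated. The hunk shows only a fragment, so the following is a hedged sketch of that loop; the file format (tab-separated key/value is assumed here), the return value, and any normalization the real function performs are not taken from this diff.

    from ftfy import fix_text
    from wordfreq import simple_tokenize

    def read_freqs_sketch(filename, cutoff=0):
        # Hedged sketch of the loop shown above, not the project's read_freqs.
        raw_counts = {}
        total = 0.0
        with open(filename, encoding='utf-8') as infile:
            for line in infile:
                key, strval = line.rstrip().rsplit('\t', 1)  # assumed format
                val = float(strval)
                if val < cutoff:
                    break  # assumes values are sorted in descending order
                for token in simple_tokenize(key):
                    token = fix_text(token)
                    total += val
                    # Use += so concatenated input files accumulate their counts.
                    raw_counts[token] = raw_counts.get(token, 0) + val
        return raw_counts, total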