tokenize Chinese using jieba and our own frequencies

Rob Speer 2015-09-05 03:16:56 -04:00
parent 7906a671ea
commit 2327f2e4d6
12 changed files with 32088 additions and 40 deletions
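The user-visible effect of this change is that tokenize() and word_frequency() now handle Chinese ('zh') by running jieba with a frequency dictionary built from wordfreq's own data. A minimal sketch of the intended behavior, with expected results taken from the new tests added below (requires the optional jieba dependency):

from wordfreq import tokenize, word_frequency

# Chinese text is segmented by jieba instead of being left as one chunk.
print(tokenize('加勒特·霍巴特', 'zh'))
# expected, per tests/test_chinese.py: ['加', '勒', '特', '霍', '巴特']

# Chinese word frequencies are now available as well.
print(word_frequency('谢谢', 'zh') > 0)   # True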


@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.1',
version='1.2',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@@ -50,8 +50,11 @@ setup(
# turn, it depends on libmecab-dev being installed on the system. It's not
# listed under 'install_requires' because wordfreq should be usable in
# other languages without it.
#
# Similarly, jieba is required for Chinese word frequencies.
extras_require={
'mecab': 'mecab-python3'
'mecab': 'mecab-python3',
'jieba': 'jieba'
},
tests_require=['mecab-python3'],
tests_require=['mecab-python3', 'jieba'],
)
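Because jieba is listed under extras_require rather than install_requires, Chinese support stays opt-in: a plain install of wordfreq works without it, and users who want Chinese tokenization can pull it in with the standard extras syntax, pip install wordfreq[jieba].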


@@ -162,8 +162,8 @@ def test_ar():
def test_ideographic_fallback():
# Try tokenizing Chinese text -- it should remain stuck together.
eq_(tokenize('中国文字', 'zh'), ['中国文字'])
# Try tokenizing Chinese text as English -- it should remain stuck together.
eq_(tokenize('中国文字', 'en'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.

tests/test_chinese.py (new file, 48 lines)

@@ -0,0 +1,48 @@
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq import tokenize, word_frequency
def test_tokens():
# Let's test on some Chinese text that has unusual combinations of
# syllables, because it is about an American vice-president.
#
# (He was the Chinese Wikipedia's featured article of the day when I
# wrote this test.)
hobart = '加勒特·霍巴特' # Garret Hobart, or "jiā lè tè huò bā tè".
# He was the sixth American vice president to die in office.
fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'
# His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token.
eq_(
tokenize(hobart, 'zh'),
['加', '勒', '特', '霍', '巴特']
)
eq_(
tokenize(fact_simplified, 'zh'),
[
# he / is / in history / #6 / counter for people
'他', '是', '历史上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)
# You get the same tokens if you look up the Traditional Chinese version.
eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
assert_greater(word_frequency(fact_traditional, 'zh'), 0)
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert_almost_equal(
word_frequency('谢谢谢谢', 'zh'),
xiexie_freq / 2
)
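The factor of 2 in this test follows from how wordfreq combines frequencies for multi-token input: as I understand the combination rule in this version, the frequency of a phrase is the reciprocal of the sum of the reciprocals of its token frequencies, so a phrase made of the same token twice comes out at exactly half that token's frequency. A hedged illustration of that rule (the frequency value is made up):

# Assumed combination rule; not a literal copy of wordfreq's implementation.
def combined_freq(token_freqs):
    return 1.0 / sum(1.0 / f for f in token_freqs)

xiexie = 5.0e-4                          # made-up frequency for '谢谢'
print(combined_freq([xiexie, xiexie]))   # 0.00025, i.e. xiexie / 2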


@@ -3,17 +3,17 @@ from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba
jieba_initialized = False
jieba_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
def simplify_chinese(text):
return text.translate(SIMPLIFIED_MAP).casefold()
def chinese_tokenize(text):
global jieba_initialized
if not jieba_initialized:
jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
jieba_initialized = True
return list(jieba.cut(simplify_chinese(text)))
def jieba_tokenize(text):
global jieba_tokenizer
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
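The new jieba_tokenize keeps a module-level Tokenizer so the dictionary (wordfreq's own jieba_zh.txt, generated by the build steps below) is loaded only on first use, and HMM=False stops jieba from guessing words that aren't in that dictionary. A rough usage sketch, with output based on the tokens the new tests expect:

from wordfreq.chinese import jieba_tokenize

# Traditional characters are mapped to Simplified before segmentation.
print(jieba_tokenize('他是美國副總統'))
# roughly: ['他', '是', '美国', '副', '总统']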

wordfreq/data/jieba_zh.txt (new file, 31915 lines)
File diff suppressed because it is too large.


@@ -118,13 +118,16 @@ def tokenize(text, lang):
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
return mecab_tokenize(text)
tokens = mecab_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'zh':
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
return jieba_tokenize(text)
tokens = jieba_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'tr':
return turkish_tokenize(text)
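Both CJK branches now post-process the external tokenizer's output the same way: casefold each token and keep only tokens that TOKEN_RE recognizes as word-like, which is presumably why the '·' separator in the Hobart test never shows up as a token. A small sketch of that filtering step, using a simplified stand-in pattern rather than the real TOKEN_RE:

import regex

# Hypothetical stand-in; the real TOKEN_RE in wordfreq/__init__.py is more involved.
WORDISH_RE = regex.compile(r'\p{L}+')

raw_tokens = ['加', '勒', '特', '·', '霍', '巴特']
tokens = [t.casefold() for t in raw_tokens if WORDISH_RE.match(t)]
print(tokens)   # ['加', '勒', '特', '霍', '巴特']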

Binary file not shown. Size: 1.9 MiB before, 1.9 MiB after.


@@ -67,6 +67,13 @@ rule convert_opensubtitles
rule convert_subtlex
command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
rule convert_jieba
command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out
rule counts_to_jieba
command = python -m wordfreq_builder.cli.counts_to_jieba $in $out
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
# lines with counts of 100 or more.
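The new convert_jieba rule reshapes jieba's source dictionary, whose lines look like 'word count part-of-speech', into the 'word,count' form the rest of the builder expects, skipping any line that contains a comma or a double quote. Roughly the same transformation in Python, with illustrative file names:

# Rough equivalent of:  cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ','
with open('dict.txt.big', encoding='utf-8') as infile, \
        open('jieba_zh.counts.txt', 'w', encoding='utf-8') as outfile:
    for line in infile:
        first_two = ' '.join(line.rstrip('\n').split(' ')[:2])   # cut -d ' ' -f 1,2
        if ',' in first_two or '"' in first_two:                 # grep -v '[,"]'
            continue
        outfile.write(first_two.replace(' ', ',') + '\n')        # tr ' ' ','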


@@ -0,0 +1,15 @@
from wordfreq_builder.word_counts import read_values, write_jieba
import argparse
def handle_counts(filename_in, filename_out):
freqs, total = read_values(filename_in, cutoff=1e-6)
write_jieba(freqs, filename_out)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('filename_in', help='name of input wordlist')
parser.add_argument('filename_out', help='name of output Jieba-compatible wordlist')
args = parser.parse_args()
handle_counts(args.filename_in, args.filename_out)
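This is the script the counts_to_jieba ninja rule above invokes, effectively python -m wordfreq_builder.cli.counts_to_jieba <combined zh wordlist> <jieba-format output>; the cutoff of 1e-6 presumably trims the exported dictionary to words with a frequency of at least one in a million.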


@@ -41,6 +41,7 @@ CONFIG = {
],
'subtlex-en': ['en'],
'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh']
},
# Subtlex languages that need to be pre-processed
'wordlist_paths': {
@@ -51,9 +52,11 @@ CONFIG = {
'google-books': 'generated/google-books/google_books_{lang}.{ext}',
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
'combined': 'generated/combined/combined_{lang}.{ext}',
'combined-dist': 'dist/combined_{lang}.{ext}',
'twitter-dist': 'dist/twitter_{lang}.{ext}'
'twitter-dist': 'dist/twitter_{lang}.{ext}',
'jieba-dist': 'dist/jieba_{lang}.{ext}'
},
'min_sources': 2
}
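These entries follow the existing convention in wordlist_paths: each value is a template filled in with the language code and a file extension. A quick illustration of how the new jieba templates would expand, assuming straightforward str.format substitution:

# Illustrative only; the extensions are whatever the build steps request.
print('generated/jieba/jieba_{lang}.{ext}'.format(lang='zh', ext='counts.txt'))
# generated/jieba/jieba_zh.counts.txt
print('dist/jieba_{lang}.{ext}'.format(lang='zh', ext='txt'))
# dist/jieba_zh.txt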


@@ -3,6 +3,7 @@ from wordfreq_builder.config import (
)
import sys
import pathlib
import itertools
HEADER = """# This file is automatically generated. Do not edit it.
# You can change its behavior by editing wordfreq_builder/ninja.py,
@@ -45,51 +46,43 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
# The first dependency is to make sure the build file is up to date.
add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
extra='wordfreq_builder/ninja.py')
lines.extend(
lines.extend(itertools.chain(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
)
lines.extend(
),
wikipedia_deps(
data_filename('raw-input/wikipedia'),
CONFIG['sources']['wikipedia']
)
)
lines.extend(
),
google_books_deps(
data_filename('raw-input/google-books')
)
)
lines.extend(
),
leeds_deps(
data_filename('source-lists/leeds'),
CONFIG['sources']['leeds']
)
)
lines.extend(
),
opensubtitles_deps(
data_filename('source-lists/opensubtitles'),
CONFIG['sources']['opensubtitles']
)
)
lines.extend(
),
subtlex_en_deps(
data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-en']
)
)
lines.extend(
),
subtlex_other_deps(
data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-other']
)
)
lines.extend(combine_lists(all_languages()))
),
jieba_deps(
data_filename('source-lists/jieba'),
CONFIG['sources']['jieba']
),
combine_lists(all_languages())
))
print('\n'.join(lines), file=out)
@@ -189,8 +182,14 @@ def leeds_deps(dirname_in, languages):
input_file = '{prefix}/internet-{lang}-forms.num'.format(
prefix=dirname_in, lang=language
)
if language == 'zh':
step2_file = wordlist_filename('leeds', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
add_dep(lines, 'convert_leeds', input_file, reformatted_file)
add_dep(lines, 'convert_leeds', step2_file, reformatted_file)
return lines
@@ -201,14 +200,37 @@ def opensubtitles_deps(dirname_in, languages):
input_file = '{prefix}/{lang}.txt'.format(
prefix=dirname_in, lang=language
)
if language == 'zh':
step2_file = wordlist_filename('opensubtitles', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
reformatted_file = wordlist_filename(
'opensubtitles', language, 'counts.txt'
)
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
add_dep(lines, 'convert_opensubtitles', step2_file, reformatted_file)
return lines
def jieba_deps(dirname_in, languages):
lines = []
# Either jieba is turned off, or it's just in Chinese
if not languages:
return lines
assert languages == ['zh']
input_file = '{prefix}/dict.txt.big'.format(prefix=dirname_in)
transformed_file = wordlist_filename(
'jieba', 'zh-Hans', 'converted.txt'
)
reformatted_file = wordlist_filename(
'jieba', 'zh', 'counts.txt'
)
add_dep(lines, 'simplify_chinese', input_file, transformed_file)
add_dep(lines, 'convert_jieba', transformed_file, reformatted_file)
return lines
# Which columns of the SUBTLEX data files do the word and its frequency appear
# in?
SUBTLEX_COLUMN_MAP = {
@@ -222,6 +244,9 @@ SUBTLEX_COLUMN_MAP = {
def subtlex_en_deps(dirname_in, languages):
lines = []
# Either subtlex_en is turned off, or it's just in English
if not languages:
return lines
assert languages == ['en']
regions = ['en-US', 'en-GB']
processed_files = []
@@ -259,8 +284,14 @@ def subtlex_other_deps(dirname_in, languages):
else:
startrow = 2
if language == 'zh':
step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
add_dep(
lines, 'convert_subtlex', input_file, processed_file,
lines, 'convert_subtlex', step2_file, processed_file,
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
)
add_dep(
@@ -301,6 +332,12 @@ def combine_lists(languages):
lines.append('default {}'.format(output_cBpack))
# Write a Jieba-compatible frequency file for Chinese tokenization
chinese_combined = wordlist_filename('combined', 'zh')
jieba_output = wordlist_filename('jieba-dist', 'zh')
add_dep(lines, 'counts_to_jieba', chinese_combined, jieba_output,
extra=['wordfreq_builder/word_counts.py', 'wordfreq_builder/cli/counts_to_jieba.py'])
lines.append('default {}'.format(jieba_output))
return lines
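Taken together, the Chinese additions form two build chains: source-lists/jieba/dict.txt.big → simplify_chinese → convert_jieba → jieba counts, which feed into the combined zh wordlist alongside the other sources; and combined zh wordlist → counts_to_jieba → dist/jieba_zh.txt, which presumably becomes the wordfreq/data/jieba_zh.txt dictionary that wordfreq/chinese.py loads at runtime.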


@@ -12,6 +12,7 @@ import regex
# Match common cases of URLs: the scheme http:// or https:// followed by
# non-whitespace characters.
URL_RE = regex.compile(r'https?://(?:\S)+')
HAN_RE = regex.compile(r'[\p{Script=Han}]+')
def count_tokens(filename):
@@ -162,3 +163,19 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
break
if not ('"' in word or ',' in word):
writer.writerow([word, str(freq)])
def write_jieba(freqs, filename):
"""
Write a dictionary of frequencies in a format that can be used for Jieba
tokenization of Chinese.
"""
with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
for word, freq in items:
if HAN_RE.search(word):
# Only store this word as a token if it contains at least one
# Han character.
fake_count = round(freq * 1e9)
print('%s %d' % (word, fake_count), file=outfile)
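write_jieba produces jieba's plain 'word pseudo-count' dictionary format, scaling the frequencies up to integers and dropping anything without a Han character. A small example of what it would write (the frequencies are made up):

# Made-up frequencies; 'ok' is skipped because it contains no Han character.
write_jieba({'谢谢': 5e-4, '美国': 2e-4, 'ok': 3e-4}, 'jieba_example.txt')
# jieba_example.txt then contains, most frequent first:
#   谢谢 500000
#   美国 200000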