commit a4554fb87c (parent 7d1c2e72e4)
tokenize Chinese using jieba and our own frequencies

Former-commit-id: 2327f2e4d6
Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
setup.py (9 changed lines)
@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):

 setup(
     name="wordfreq",
-    version='1.1',
+    version='1.2',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -50,8 +50,11 @@ setup(
     # turn, it depends on libmecab-dev being installed on the system. It's not
     # listed under 'install_requires' because wordfreq should be usable in
     # other languages without it.
+    #
+    # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
-        'mecab': 'mecab-python3'
+        'mecab': 'mecab-python3',
+        'jieba': 'jieba'
     },
-    tests_require=['mecab-python3'],
+    tests_require=['mecab-python3', 'jieba'],
 )
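With this change, Chinese support follows the same opt-in pattern as Japanese: jieba is declared as an extra rather than a hard dependency, so `pip install wordfreq[jieba]` (or `wordfreq[mecab,jieba]`, using standard setuptools extras syntax) pulls it in, while installations for other languages stay unchanged.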
@@ -162,8 +162,8 @@ def test_ar():


 def test_ideographic_fallback():
-    # Try tokenizing Chinese text -- it should remain stuck together.
-    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+    # Try tokenizing Chinese text as English -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'en'), ['中国文字'])

     # When Japanese is tagged with the wrong language, it will be split
     # at script boundaries.
tests/test_chinese.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+from nose.tools import eq_, assert_almost_equal, assert_greater
+from wordfreq import tokenize, word_frequency
+
+
+def test_tokens():
+    # Let's test on some Chinese text that has unusual combinations of
+    # syllables, because it is about an American vice-president.
+    #
+    # (He was the Chinese Wikipedia's featured article of the day when I
+    # wrote this test.)
+
+    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".
+
+    # He was the sixth American vice president to die in office.
+    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
+    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'
+
+    # His name breaks into five pieces, with the only piece staying together
+    # being the one that means 'Bart'. The dot is not included as a token.
+    eq_(
+        tokenize(hobart, 'zh'),
+        ['加', '勒', '特', '霍', '巴特']
+    )
+
+    eq_(
+        tokenize(fact_simplified, 'zh'),
+        [
+            # he / is / in history / #6 / counter for people
+            '他', '是', '历史上', '第六', '位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
+
+    # You match the same tokens if you look it up in Traditional Chinese.
+    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
+    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+
+
+def test_combination():
+    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
+    assert_almost_equal(
+        word_frequency('谢谢谢谢', 'zh'),
+        xiexie_freq / 2
+    )
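The `test_combination` expectation makes sense if '谢谢谢谢' tokenizes into two copies of '谢谢' and wordfreq combines token frequencies by summing their reciprocals; that combination rule is my reading of wordfreq's behaviour, not something shown in this diff. A minimal sketch under that assumption:

    # Assumed combination rule: f(phrase) = 1 / (1/f(t1) + ... + 1/f(tn)).
    def combined_frequency(token_freqs):
        return 1.0 / sum(1.0 / f for f in token_freqs)

    f_xiexie = 0.004   # hypothetical frequency of '谢谢'
    # Two equal tokens give half the single-token frequency: 1 / (2/f) = f/2.
    assert abs(combined_frequency([f_xiexie, f_xiexie]) - f_xiexie / 2) < 1e-12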
@@ -3,17 +3,17 @@ from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba


-jieba_initialized = False
+jieba_tokenizer = None
+DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')


 def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def chinese_tokenize(text):
-    global jieba_initialized
-    if not jieba_initialized:
-        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
-        jieba_initialized = True
-    return list(jieba.cut(simplify_chinese(text)))
+def jieba_tokenize(text):
+    global jieba_tokenizer
+    if jieba_tokenizer is None:
+        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
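For context, `jieba_tokenize` lazily builds a module-level `jieba.Tokenizer` backed by the packaged `data/jieba_zh.txt` wordlist, and `HMM=False` tells jieba not to guess at words outside that dictionary (my reading of the jieba option, not stated in the diff). A rough usage sketch; the example strings and the exact token split are illustrative, not taken from the tests:

    from wordfreq.chinese import simplify_chinese, jieba_tokenize

    simplify_chinese('歷史')    # -> '历史': Traditional characters mapped to Simplified, then casefolded
    jieba_tokenize('我谢谢你')  # e.g. ['我', '谢谢', '你'], split against the packaged wordlist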
wordfreq/data/jieba_zh.txt (new file, 31915 lines; diff suppressed because the file is too large)
@@ -118,13 +118,16 @@ def tokenize(text, lang):
         global mecab_tokenize
         if mecab_tokenize is None:
             from wordfreq.japanese import mecab_tokenize
-        return mecab_tokenize(text)
+        tokens = mecab_tokenize(text)
+        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

     if lang == 'zh':
         global jieba_tokenize
         if jieba_tokenize is None:
             from wordfreq.chinese import jieba_tokenize
-        return jieba_tokenize(text)
+        tokens = jieba_tokenize(text)
+        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
+

     if lang == 'tr':
         return turkish_tokenize(text)
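The caller-visible effect, per the new tests in this commit: text tagged as 'zh' now goes through jieba with the packaged wordlist, and both the jieba and MeCab output are casefolded and filtered through TOKEN_RE, which is presumably why punctuation such as the middle dot in the Hobart test does not come back as a token. Expected values below are taken from tests/test_chinese.py and the ideographic-fallback test above:

    from wordfreq import tokenize

    tokenize('加勒特·霍巴特', 'zh')  # -> ['加', '勒', '特', '霍', '巴特']
    tokenize('中国文字', 'en')       # -> ['中国文字']: non-Chinese languages still keep Han text stuck together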
(Binary file changed, not shown; 1.9 MiB before and after.)
@@ -67,6 +67,13 @@ rule convert_opensubtitles
 rule convert_subtlex
   command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

+rule convert_jieba
+  command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out
+
+rule counts_to_jieba
+  command = python -m wordfreq_builder.cli.counts_to_jieba $in $out
+
+
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
 # lines with counts of 100 or more.
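For reference, jieba's dict.txt.big is, as far as I know, a space-separated list of `word count part-of-speech` lines; `convert_jieba` keeps the first two fields, drops any line containing a comma or double quote, and rewrites the separator as a comma so the result looks like the other counts CSVs in the build. A hedged Python equivalent of that shell pipeline, for illustration only (the build runs the cut/grep/tr command above):

    def convert_jieba(in_path, out_path):
        with open(in_path, encoding='utf-8') as infile, \
             open(out_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                fields = line.rstrip('\n').split(' ')
                kept = ' '.join(fields[:2])             # cut -d ' ' -f 1,2
                if ',' in kept or '"' in kept:          # grep -v '[,"]'
                    continue
                outfile.write(kept.replace(' ', ',') + '\n')   # tr ' ' ','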
wordfreq_builder/wordfreq_builder/cli/counts_to_jieba.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+from wordfreq_builder.word_counts import read_values, write_jieba
+import argparse
+
+
+def handle_counts(filename_in, filename_out):
+    freqs, total = read_values(filename_in, cutoff=1e-6)
+    write_jieba(freqs, filename_out)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename_in', help='name of input wordlist')
+    parser.add_argument('filename_out', help='name of output Jieba-compatible wordlist')
+    args = parser.parse_args()
+    handle_counts(args.filename_in, args.filename_out)
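This is the script behind the `counts_to_jieba` ninja rule above: it reads the combined Chinese wordlist with `read_values` (the cutoff of 1e-6 means only words above that frequency make it into the tokenizer dictionary) and hands the result to `write_jieba`, defined at the end of this diff. Invocation is simply `python -m wordfreq_builder.cli.counts_to_jieba <input wordlist> <output wordlist>`, with the real paths supplied by ninja; the placeholder names here are mine.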
@@ -41,6 +41,7 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
+        'jieba': ['zh']
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -51,9 +52,11 @@ CONFIG = {
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'jieba': 'generated/jieba/jieba_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
-        'twitter-dist': 'dist/twitter_{lang}.{ext}'
+        'twitter-dist': 'dist/twitter_{lang}.{ext}',
+        'jieba-dist': 'dist/jieba_{lang}.{ext}'
     },
     'min_sources': 2
 }
@@ -3,6 +3,7 @@ from wordfreq_builder.config import (
 )
 import sys
 import pathlib
+import itertools

 HEADER = """# This file is automatically generated. Do not edit it.
 # You can change its behavior by editing wordfreq_builder/ninja.py,
@@ -45,51 +46,43 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
-    lines.extend(
+    lines.extend(itertools.chain(
         twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
             combined_prefix=data_filename('generated/twitter/tweets-2014'),
             slices=40,
             languages=CONFIG['sources']['twitter']
-        )
-    )
-    lines.extend(
+        ),
         wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             CONFIG['sources']['wikipedia']
-        )
-    )
-    lines.extend(
+        ),
         google_books_deps(
             data_filename('raw-input/google-books')
-        )
-    )
-    lines.extend(
+        ),
         leeds_deps(
             data_filename('source-lists/leeds'),
             CONFIG['sources']['leeds']
-        )
-    )
-    lines.extend(
+        ),
         opensubtitles_deps(
             data_filename('source-lists/opensubtitles'),
             CONFIG['sources']['opensubtitles']
-        )
-    )
-    lines.extend(
+        ),
         subtlex_en_deps(
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-en']
-        )
-    )
-    lines.extend(
+        ),
         subtlex_other_deps(
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-other']
-        )
-    )
-    lines.extend(combine_lists(all_languages()))
+        ),
+        jieba_deps(
+            data_filename('source-lists/jieba'),
+            CONFIG['sources']['jieba']
+        ),
+        combine_lists(all_languages())
+    ))

     print('\n'.join(lines), file=out)
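The restructuring here is mechanical: the repeated `lines.extend(...)` calls collapse into one `lines.extend(itertools.chain(...))`, with `jieba_deps` added to the sequence, and `itertools.chain` lazily concatenates the per-source dependency lists into the same flat list of ninja lines. A toy illustration, not taken from the diff:

    import itertools

    a, b, c = ['rule1'], ['rule2', 'rule3'], ['rule4']
    lines = []
    lines.extend(itertools.chain(a, b, c))
    assert lines == ['rule1', 'rule2', 'rule3', 'rule4']   # same result as three separate extend() calls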
@@ -189,8 +182,14 @@ def leeds_deps(dirname_in, languages):
         input_file = '{prefix}/internet-{lang}-forms.num'.format(
             prefix=dirname_in, lang=language
         )
+        if language == 'zh':
+            step2_file = wordlist_filename('leeds', 'zh-Hans', 'converted.txt')
+            add_dep(lines, 'simplify_chinese', input_file, step2_file)
+        else:
+            step2_file = input_file
+
         reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
-        add_dep(lines, 'convert_leeds', input_file, reformatted_file)
+        add_dep(lines, 'convert_leeds', step2_file, reformatted_file)

     return lines
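The same pattern recurs in the OpenSubtitles and SUBTLEX rules below: when the language is Chinese, the raw source list is first run through the `simplify_chinese` rule into a zh-Hans "converted" file, and the usual conversion step then reads that file instead of the raw input. The result is that every Chinese source gets counted in Simplified form, matching what `simplify_chinese()` does to query text (assuming the `simplify_chinese` ninja rule applies the same mapping).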
@@ -201,14 +200,37 @@ def opensubtitles_deps(dirname_in, languages):
         input_file = '{prefix}/{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
+        if language == 'zh':
+            step2_file = wordlist_filename('opensubtitles', 'zh-Hans', 'converted.txt')
+            add_dep(lines, 'simplify_chinese', input_file, step2_file)
+        else:
+            step2_file = input_file
         reformatted_file = wordlist_filename(
             'opensubtitles', language, 'counts.txt'
         )
-        add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
+        add_dep(lines, 'convert_opensubtitles', step2_file, reformatted_file)

     return lines


+def jieba_deps(dirname_in, languages):
+    lines = []
+    # Either subtlex_zh is turned off, or it's just in Chinese
+    if not languages:
+        return lines
+    assert languages == ['zh']
+    input_file = '{prefix}/dict.txt.big'.format(prefix=dirname_in)
+    transformed_file = wordlist_filename(
+        'jieba', 'zh-Hans', 'converted.txt'
+    )
+    reformatted_file = wordlist_filename(
+        'jieba', 'zh', 'counts.txt'
+    )
+    add_dep(lines, 'simplify_chinese', input_file, transformed_file)
+    add_dep(lines, 'convert_jieba', transformed_file, reformatted_file)
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
@@ -222,6 +244,9 @@ SUBTLEX_COLUMN_MAP = {

 def subtlex_en_deps(dirname_in, languages):
     lines = []
+    # Either subtlex_en is turned off, or it's just in English
+    if not languages:
+        return lines
     assert languages == ['en']
     regions = ['en-US', 'en-GB']
     processed_files = []
@@ -259,8 +284,14 @@ def subtlex_other_deps(dirname_in, languages):
         else:
             startrow = 2

+        if language == 'zh':
+            step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt')
+            add_dep(lines, 'simplify_chinese', input_file, step2_file)
+        else:
+            step2_file = input_file
+
         add_dep(
-            lines, 'convert_subtlex', input_file, processed_file,
+            lines, 'convert_subtlex', step2_file, processed_file,
             params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
@@ -301,6 +332,12 @@ def combine_lists(languages):

     lines.append('default {}'.format(output_cBpack))

+    # Write a Jieba-compatible frequency file for Chinese tokenization
+    chinese_combined = wordlist_filename('combined', 'zh')
+    jieba_output = wordlist_filename('jieba-dist', 'zh')
+    add_dep(lines, 'counts_to_jieba', chinese_combined, jieba_output,
+            extra=['wordfreq_builder/word_counts.py', 'wordfreq_builder/cli/counts_to_jieba.py'])
+    lines.append('default {}'.format(jieba_output))
     return lines
@@ -12,6 +12,7 @@ import regex
 # Match common cases of URLs: the schema http:// or https:// followed by
 # non-whitespace characters.
 URL_RE = regex.compile(r'https?://(?:\S)+')
+HAN_RE = regex.compile(r'[\p{Script=Han}]+')


 def count_tokens(filename):
@@ -162,3 +163,19 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
                 break
             if not ('"' in word or ',' in word):
                 writer.writerow([word, str(freq)])
+
+
+def write_jieba(freqs, filename):
+    """
+    Write a dictionary of frequencies in a format that can be used for Jieba
+    tokenization of Chinese.
+    """
+    with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
+        items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
+        for word, freq in items:
+            if HAN_RE.search(word):
+                # Only store this word as a token if it contains at least one
+                # Han character.
+                fake_count = round(freq * 1e9)
+                print('%s %d' % (word, fake_count), file=outfile)
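`write_jieba` turns proportional frequencies back into the count-style integers that jieba's dictionary format expects by scaling by 1e9 and rounding, keeps only entries containing at least one Han character, and writes them sorted by descending frequency. A minimal sketch of the resulting format, with invented frequencies and the stdlib `re` module standing in for the `regex` Han-script class:

    import io
    import re

    HAN_RE = re.compile(r'[\u4e00-\u9fff]')   # rough stand-in for \p{Script=Han}

    freqs = {'的': 0.048, 'the': 0.039, '谢谢': 0.00012}
    out = io.StringIO()
    for word, freq in sorted(freqs.items(), key=lambda kv: kv[1], reverse=True):
        if HAN_RE.search(word):               # 'the' is dropped: no Han characters
            print('%s %d' % (word, round(freq * 1e9)), file=out)

    print(out.getvalue(), end='')
    # 的 48000000
    # 谢谢 120000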