Merge branch 'master' into chinese-external-wordlist

Conflicts:
	wordfreq/chinese.py
Rob Speer 2015-09-24 13:40:08 -04:00
commit cea2a61444
10 changed files with 58 additions and 3317 deletions

View File

@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt

View File

@@ -232,20 +232,14 @@ sources:
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
-<<<<<<< HEAD
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
-SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
-available at http://crr.ugent.be/programs-data/subtitle-frequencies.
-=======
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
-SUBTLEX-CH, created by Marc Brysbaert et al. and available at
-http://crr.ugent.be/programs-data/subtitle-frequencies.
->>>>>>> greek-and-turkish
-I (Rob Speer) have
-obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
-in wordfreq, to be used for any purpose, not just for academic use, under these
-conditions:
+It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
+SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
+(see citations below) and available at
+http://crr.ugent.be/programs-data/subtitle-frequencies.
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:
 
 - Wordfreq and code derived from it must credit the SUBTLEX authors.
 - It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
 SUBTLEX-UK: A new and improved word frequency database for British English.
 The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
 http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521

View File

@@ -1,7 +1,21 @@
+"""
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
+
+This is meant to be a normalization of text, somewhat like case-folding -- not
+an actual translator, a task for which this method would be unsuitable. We
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
+
+Generating this mapping requires the external Chinese conversion tool OpenCC.
+"""
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 
 def make_hanzi_table(filename):
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
            print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':
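
For context, a minimal sketch (not part of the commit) of the round trip the new script sets up: a dict of Traditional codepoints to Simplified characters, stored as gzipped msgpack, in exactly the shape str.translate consumes. The two sample characters and the temporary path are illustrative only, and the sketch uses the msgpack >= 1.0 API rather than the older encoding='utf-8' form that appears in the diff.

import gzip
import msgpack

# A toy version of the table make_hanzi_converter builds: Unicode codepoints
# of Traditional characters mapped to their Simplified equivalents.
table = {ord('漢'): '汉', ord('語'): '语'}

# Write it the way the new script does: msgpack inside gzip.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'wb') as outfile:
    outfile.write(msgpack.packb(table, use_bin_type=True))

# Read it back; integer keys require strict_map_key=False on msgpack >= 1.0.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'rb') as infile:
    loaded = msgpack.unpackb(infile.read(), raw=False, strict_map_key=False)

# A codepoint -> str dict is exactly the mapping str.translate accepts.
assert '漢語'.translate(loaded) == '汉语'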

View File

@@ -14,10 +14,10 @@ def test_combination():
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )
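
The updated assertions encode a reciprocal-sum rule: the frequency of a phrase that had to be split into tokens is the reciprocal of the sum of the tokens' reciprocal frequencies. A short sketch with made-up numbers (wordfreq's real values come from its wordlists):

# Hypothetical per-token frequencies, standing in for the wordlist values.
ohayou_freq, gozai_freq, masu_freq = 2e-4, 5e-4, 1e-2

# Repeating a token halves the frequency: 1 / (1/f + 1/f) == f / 2,
# which is why the first assertion now expects ohayou_freq / 2.
double_ohayou = 1.0 / (1.0 / ohayou_freq + 1.0 / ohayou_freq)
assert abs(double_ohayou - ohayou_freq / 2) < 1e-12

# A longer phrase combines every token the same way; this is what the
# second assertion checks against word_frequency('おはようございます', 'ja').
combined = 1.0 / (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)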

View File

@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces. In Chinese, in particular,
+# we have to infer word boundaries from the frequencies of the words they
+# would create. When this happens, we should adjust the resulting frequency
+# to avoid creating a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}
+
+# We'll divide the frequency by 10 for each token boundary that was inferred.
+# (We determined the factor of 10 empirically by looking at words in the
+# Chinese wordlist that weren't common enough to be identified by the
+# tokenizer. These words would get split into multiple tokens, and their
+# inferred frequency would be on average 9.77 times higher than their actual
+# frequency.)
+INFERRED_SPACE_FACTOR = 10.0
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -85,10 +93,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available
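
The new startswith('_') guard keeps internal files such as _chinese_mapping.msgpack.gz from being parsed as wordlists (their names do not split cleanly into a wordlist name and a language code). A toy illustration with hypothetical file names:

# Hypothetical directory contents; the real files live in wordfreq/data.
names = ['combined_en.msgpack.gz', 'twitter_ar.msgpack.gz',
         '_chinese_mapping.msgpack.gz']

available = {}
for name in names:
    if not name.startswith('_'):        # skip internal data files
        list_name = name.split('.')[0]  # e.g. 'combined_en'
        wordlist, lang = list_name.split('_')
        if wordlist == 'combined':
            available[lang] = name

assert available == {'en': 'combined_en.msgpack.gz'}
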
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):
     freq = 1.0 / one_over_result
 
-    if lang in SPACELESS_LANGUAGES:
-        # Divide the frequency by 10 for each token boundary that was inferred.
-        # (We determined the factor of 10 empirically by looking at words in
-        # the Chinese wordlist that weren't common enough to be identified by
-        # the tokenizer. These words would get split into multiple tokens, and
-        # their inferred frequency would be on average 9.77 times higher than
-        # their actual frequency.)
-        freq /= 10 ** (len(tokens) - 1)
+    if lang in INFERRED_SPACE_LANGUAGES:
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
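
In effect, each inferred token boundary costs a factor of ten. A small worked example with made-up numbers (the tokenization and frequencies are hypothetical):

INFERRED_SPACE_FACTOR = 10.0

tokens = ['谢谢', '谢谢', '你']  # hypothetical split: 3 tokens, 2 inferred boundaries
combined_freq = 1e-5            # hypothetical reciprocal-sum frequency
adjusted = combined_freq / INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
# adjusted is combined_freq / 100, i.e. about 1e-7, penalizing the two
# boundaries the tokenizer had to guess.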

File diff suppressed because it is too large

View File

@@ -1,12 +1,14 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
-jieba_tokenizer = None
-jieba_orig_tokenizer = None
 
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+jieba_tokenizer = None
+jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
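
The body of simplify_chinese lies outside this hunk. As a plausible sketch (not necessarily wordfreq's exact implementation), a codepoint-keyed SIMPLIFIED_MAP is precisely the kind of table that str.translate consumes; the two entries below are illustrative only:

# Illustrative entries; the real SIMPLIFIED_MAP is generated by OpenCC and
# holds many codepoint -> character pairs.
SIMPLIFIED_MAP = {ord('點'): '点', ord('鐘'): '钟'}

def simplify_chinese_sketch(text):
    # Map Traditional characters to Simplified ones so lookups hit the
    # Simplified-Chinese wordlist; unmapped characters pass through unchanged.
    return text.translate(SIMPLIFIED_MAP)

assert simplify_chinese_sketch('三點鐘') == '三点钟'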

Binary file not shown.

View File

@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):
 def jieba_deps(dirname_in, languages):
     lines = []
-    # Either subtlex_zh is turned off, or it's just in Chinese
+    # Because there's Chinese-specific handling here, the valid options for
+    # 'languages' are [] and ['zh']. Make sure it's one of those.
     if not languages:
         return lines
     assert languages == ['zh']

View File

@@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
+
+    If `lang` is given, it will apply language-specific tokenization to the
+    words that it reads.
     """
     values = defaultdict(float)
     total = 0.
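
A simplified sketch of the behavior the amended docstring describes, not the actual wordfreq_builder implementation; the normalize helper below stands in for whatever language-specific tokenization read_values applies:

import csv
from collections import defaultdict

def normalize(word, lang):
    # Stand-in for language-specific tokenization; the real builder delegates
    # to wordfreq's tokenizers (e.g. jieba for 'zh').
    return word.casefold()

def read_values_sketch(filename, cutoff=0, lang=None):
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strval in csv.reader(infile):
            value = float(strval)
            if cutoff and value < cutoff:
                break  # valid only because the file is sorted by descending value
            if lang is not None:
                word = normalize(word, lang)
            values[word] += value
            total += value
    return values, total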