Merge branch 'master' into chinese-external-wordlist

Conflicts:
	wordfreq/chinese.py

Former-commit-id: cea2a61444
Rob Speer 2015-09-24 13:40:08 -04:00
commit faf66e9b08
10 changed files with 58 additions and 3317 deletions

View File

@@ -1,2 +1,3 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt

View File

@@ -232,20 +232,14 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org)
<<<<<<< HEAD
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
available at http://crr.ugent.be/programs-data/subtitle-frequencies.
=======
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
SUBTLEX-CH, created by Marc Brysbaert et al. and available at
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
(see citations below) and available at
http://crr.ugent.be/programs-data/subtitle-frequencies.
>>>>>>> greek-and-turkish
I (Rob Speer) have
obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
in wordfreq, to be used for any purpose, not just for academic use, under these
conditions:
I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
distribute these wordlists in wordfreq, to be used for any purpose, not just
for academic use, under these conditions:
- Wordfreq and code derived from it must credit the SUBTLEX authors.
- It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
SUBTLEX-UK: A new and improved word frequency database for British English.
The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521

View File

@@ -1,7 +1,21 @@
"""
Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
Chinese characters to their Simplified Chinese equivalents.
This is meant to be a normalization of text, somewhat like case-folding -- not
an actual translator, a task for which this method would be unsuitable. We
store word frequencies using Simplified Chinese characters so that, in the
large number of cases where a Traditional Chinese word has an obvious
Simplified Chinese mapping, we can get a frequency for it that's the same in
Simplified and Traditional Chinese.
Generating this mapping requires the external Chinese conversion tool OpenCC.
"""
import unicodedata
import itertools
import os
import pprint
import msgpack
import gzip
def make_hanzi_table(filename):
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
print('%5X\t%s' % (codept, char), file=out)
def make_hanzi_converter(table_in, python_out):
def make_hanzi_converter(table_in, msgpack_out):
table = {}
with open(table_in, encoding='utf-8') as infile:
for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
assert len(char) == 1
if chr(codept) != char:
table[codept] = char
with open(python_out, 'w', encoding='utf-8') as outfile:
print('SIMPLIFIED_MAP = ', end='', file=outfile)
pprint.pprint(table, stream=outfile)
with gzip.open(msgpack_out, 'wb') as outfile:
msgpack.dump(table, outfile, encoding='utf-8')
def build():
make_hanzi_table('/tmp/han_in.txt')
os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
if __name__ == '__main__':
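
As a sanity check on the new output format, here is a minimal round-trip sketch, separate from the build script above: it dumps a small {codepoint: simplified character} table to a gzipped msgpack file and loads it back, using the same encoding='utf-8' keyword this commit passes to msgpack. The single table entry and the temporary path are only illustrations, not taken from the real OpenCC output.

import gzip
import msgpack

# One illustrative entry in the same {codepoint: char} shape that
# make_hanzi_converter builds: 麼 (U+9EBC) mapped to its Simplified form 么.
table = {0x9EBC: '么'}

with gzip.open('/tmp/mapping_check.msgpack.gz', 'wb') as outfile:
    msgpack.dump(table, outfile, encoding='utf-8')

with gzip.open('/tmp/mapping_check.msgpack.gz', 'rb') as infile:
    # Integer keys and UTF-8 string values survive the round trip unchanged.
    assert msgpack.load(infile, encoding='utf-8') == table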

View File

@@ -14,10 +14,10 @@ def test_combination():
assert_almost_equal(
word_frequency('おはようおはよう', 'ja'),
ohayou_freq / 20
ohayou_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('おはようございます', 'ja'),
(100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
)
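
The updated assertions encode a simple rule: the reciprocal of a multi-token phrase's frequency is the sum of the reciprocals of its token frequencies, with no extra penalty (the old expectations divided by an additional factor of 10 per token boundary). A tiny arithmetic sketch with made-up frequencies:

# Made-up token frequencies, only to illustrate the combination the test asserts.
ohayou_freq, gozai_freq, masu_freq = 2e-4, 1e-3, 1e-2

combined = 1.0 / (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
print(combined)  # about 1.64e-04

# Repeating the same token twice halves the frequency: 1 / (2 / f) == f / 2,
# which is why 'おはようおはよう' is now expected at ohayou_freq / 2.
assert abs(1.0 / (2.0 / ohayou_freq) - ohayou_freq / 2) < 1e-12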

View File

@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
# Chinese and Japanese are written without spaces. This means we have to
# run language-specific code to infer token boundaries on them, and also
# that we need to adjust frequencies of multi-token phrases to account
# for the fact that token boundaries were inferred.
SPACELESS_LANGUAGES = {'zh', 'ja'}
# Chinese and Japanese are written without spaces. In Chinese, in particular,
# we have to infer word boundaries from the frequencies of the words they
# would create. When this happens, we should adjust the resulting frequency
# to avoid creating a bias toward improbable word combinations.
INFERRED_SPACE_LANGUAGES = {'zh'}
# We'll divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in the
# Chinese wordlist that weren't common enough to be identified by the
# tokenizer. These words would get split into multiple tokens, and their
# inferred frequency would be on average 9.77 times higher than their actual
# frequency.)
INFERRED_SPACE_FACTOR = 10.0
# simple_tokenize is imported so that other things can import it from here.
# Suppress the pyflakes warning.
@@ -85,10 +93,11 @@ def available_languages(wordlist='combined'):
"""
available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = str(path)
if not path.name.startswith('_'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = str(path)
return available
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):
freq = 1.0 / one_over_result
if lang in SPACELESS_LANGUAGES:
# Divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in
# the Chinese wordlist that weren't common enough to be identified by
# the tokenizer. These words would get split into multiple tokens, and
# their inferred frequency would be on average 9.77 times higher than
# their actual frequency.)
freq /= 10 ** (len(tokens) - 1)
if lang in INFERRED_SPACE_LANGUAGES:
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
return max(freq, minimum)
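
Pulling this hunk together: multi-token frequencies are combined as the reciprocal of the summed reciprocals, and for languages in INFERRED_SPACE_LANGUAGES (now only Chinese) the result is further divided by INFERRED_SPACE_FACTOR for each inferred token boundary. A standalone sketch with invented numbers that mirrors the arithmetic rather than calling wordfreq itself:

INFERRED_SPACE_FACTOR = 10.0

def penalized_frequency(token_freqs, lang):
    # Reciprocal of the sum of reciprocals, as in _word_frequency above.
    freq = 1.0 / sum(1.0 / f for f in token_freqs)
    if lang == 'zh':  # stands in for the INFERRED_SPACE_LANGUAGES check
        freq /= INFERRED_SPACE_FACTOR ** (len(token_freqs) - 1)
    return freq

# A two-token Chinese phrase comes out 10 times rarer than the plain combination;
# a Japanese one no longer gets the penalty.
print(penalized_frequency([1e-4, 1e-3], 'zh'))  # about 9.09e-06
print(penalized_frequency([1e-4, 1e-3], 'ja'))  # about 9.09e-05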

File diff suppressed because it is too large

View File

@@ -1,12 +1,14 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba
import msgpack
import gzip
jieba_tokenizer = None
jieba_orig_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
jieba_tokenizer = None
jieba_orig_tokenizer = None
def simplify_chinese(text):
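
The mapping loaded here keeps the shape produced by make_hanzi_converter, integer codepoints as keys and single-character strings as values, which is exactly the table format str.translate accepts. The body of simplify_chinese is cut off in this diff, so the following is only a hedged sketch of how such a table can be applied, with an illustrative function name and sample character:

import gzip
import msgpack
from pkg_resources import resource_filename

SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')

def translate_to_simplified(text):
    # str.translate replaces each codepoint found in SIMPLIFIED_MAP with its
    # Simplified Chinese equivalent and leaves everything else untouched.
    return text.translate(SIMPLIFIED_MAP)

print(translate_to_simplified('愛'))  # expected to print the Simplified form 爱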

Binary file not shown.

View File

@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):
def jieba_deps(dirname_in, languages):
lines = []
# Either subtlex_zh is turned off, or it's just in Chinese
# Because there's Chinese-specific handling here, the valid options for
# 'languages' are [] and ['zh']. Make sure it's one of those.
if not languages:
return lines
assert languages == ['zh']

View File

@@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):
If `cutoff` is greater than 0, the csv file must be sorted by value
in descending order.
If `lang` is given, it will apply language-specific tokenization to the
words that it reads.
"""
values = defaultdict(float)
total = 0.
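
The rest of read_values is not shown in this hunk, so here is only a rough sketch of a reader that matches the docstring: it stops at the cutoff (relying on the descending sort), optionally applies a tokenizer when lang is given, and accumulates counts into the defaultdict and running total. The tokenize argument is a stand-in, not wordfreq's actual tokenizer interface.

import csv
from collections import defaultdict

def read_values_sketch(filename, cutoff=0, lang=None, tokenize=None):
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff:
                # The file is sorted by value in descending order, so every
                # remaining row is also below the cutoff.
                break
            if lang is not None and tokenize is not None:
                word = ' '.join(tokenize(word, lang))
            values[word] += val
            total += val
    return values, total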