Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)

Commit cea2a61444

Merge branch 'master' into chinese-external-wordlist

Conflicts:
    wordfreq/chinese.py
@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt
README.md
@@ -232,20 +232,14 @@ sources:
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)

-<<<<<<< HEAD
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
-SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
-available at http://crr.ugent.be/programs-data/subtitle-frequencies.
-=======
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
-SUBTLEX-CH, created by Marc Brysbaert et al. and available at
+SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
+(see citations below) and available at
 http://crr.ugent.be/programs-data/subtitle-frequencies.
->>>>>>> greek-and-turkish

-I (Rob Speer) have
-obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
-in wordfreq, to be used for any purpose, not just for academic use, under these
-conditions:
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:

 - Wordfreq and code derived from it must credit the SUBTLEX authors.
 - It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
 SUBTLEX-UK: A new and improved word frequency database for British English.
 The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
 http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
-
@@ -1,7 +1,21 @@
+"""
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
+
+This is meant to be a normalization of text, somewhat like case-folding -- not
+an actual translator, a task for which this method would be unsuitable. We
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
+
+Generating this mapping requires the external Chinese conversion tool OpenCC.
+"""
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip


 def make_hanzi_table(filename):
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
         print('%5X\t%s' % (codept, char), file=out)


-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')


 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')


 if __name__ == '__main__':
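For orientation (not part of the diff): a minimal sketch of reading the generated mapping back, assuming the same msgpack version used above, where the `encoding='utf-8'` argument is still accepted, and assuming the file written by `build()` is in the current directory.

    import gzip
    import msgpack

    # Load the table written by make_hanzi_converter(). Keys are integer
    # codepoints of Traditional Chinese characters; values are one-character
    # Simplified Chinese strings.
    with gzip.open('_chinese_mapping.msgpack.gz', 'rb') as infile:
        simplified_map = msgpack.load(infile, encoding='utf-8')

    # A {int: str} dict is exactly the form str.translate() accepts, so the
    # mapping can be applied to text directly; characters without an entry
    # pass through unchanged.
    print('漢語'.translate(simplified_map))  # should print '汉语' if both characters are in the table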
@@ -14,10 +14,10 @@ def test_combination():

     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )
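A side note on what the updated assertions encode: the frequency of a multi-token phrase is the reciprocal of the sum of the tokens' reciprocal frequencies, so a phrase made of two equal tokens comes out at half a single token's frequency. A small illustration with made-up numbers:

    # Hypothetical token frequencies, for illustration only.
    ohayou_freq, gozai_freq, masu_freq = 2e-4, 1e-3, 1e-2

    # Relationship asserted above: 1 / f_phrase == 1/f1 + 1/f2 + 1/f3
    phrase_freq = 1.0 / (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)

    # With two identical tokens, 1 / (2 / f) == f / 2, matching `ohayou_freq / 2`.
    assert abs(1.0 / (2.0 / ohayou_freq) - ohayou_freq / 2) < 1e-12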
@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces. In Chinese, in particular,
+# we have to infer word boundaries from the frequencies of the words they
+# would create. When this happens, we should adjust the resulting frequency
+# to avoid creating a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}
+
+# We'll divide the frequency by 10 for each token boundary that was inferred.
+# (We determined the factor of 10 empirically by looking at words in the
+# Chinese wordlist that weren't common enough to be identified by the
+# tokenizer. These words would get split into multiple tokens, and their
+# inferred frequency would be on average 9.77 times higher than their actual
+# frequency.)
+INFERRED_SPACE_FACTOR = 10.0

 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
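A quick worked example of the adjustment these comments describe, using made-up values (illustrative only):

    INFERRED_SPACE_FACTOR = 10.0

    # A Chinese phrase split into three inferred tokens has two inferred
    # token boundaries, so its combined frequency is divided by 10 twice.
    tokens = ['我', '不', '知道']      # hypothetical tokenization
    freq = 1e-5                        # hypothetical combined frequency
    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
    print(freq)                        # roughly 1e-7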
@@ -85,6 +93,7 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
+        if not path.name.startswith('_'):
             list_name = path.name.split('.')[0]
             name, lang = list_name.split('_')
             if name == wordlist:
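For context, `available_languages` returns a mapping from language code to the path of the matching wordlist file; the added check keeps internal data files whose names start with `_`, such as `_chinese_mapping.msgpack.gz`, out of that listing. A brief usage sketch:

    import wordfreq

    # Expected shape of the result:
    # {'en': '.../data/combined_en.msgpack.gz', 'zh': ..., ...}
    langs = wordfreq.available_languages('combined')
    print(sorted(langs))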
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):

     freq = 1.0 / one_over_result

-    if lang in SPACELESS_LANGUAGES:
-        # Divide the frequency by 10 for each token boundary that was inferred.
-        # (We determined the factor of 10 empirically by looking at words in
-        # the Chinese wordlist that weren't common enough to be identified by
-        # the tokenizer. These words would get split into multiple tokens, and
-        # their inferred frequency would be on average 9.77 times higher than
-        # their actual frequency.)
-        freq /= 10 ** (len(tokens) - 1)
+    if lang in INFERRED_SPACE_LANGUAGES:
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)

     return max(freq, minimum)
(File diff suppressed because it is too large.)
@@ -1,12 +1,14 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip


-jieba_tokenizer = None
-jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+jieba_tokenizer = None
+jieba_orig_tokenizer = None


 def simplify_chinese(text):
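The body of `simplify_chinese` is outside this hunk. Since `SIMPLIFIED_MAP` is a dict from Traditional Chinese codepoints to Simplified Chinese characters, a minimal sketch of what such a function can look like (an assumption, not necessarily the code in this commit):

    def simplify_chinese(text):
        # Translate each Traditional Chinese character to its Simplified
        # equivalent; characters without an entry are left unchanged.
        return text.translate(SIMPLIFIED_MAP)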
wordfreq/data/_chinese_mapping.msgpack.gz (new binary file, not shown)
@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):

 def jieba_deps(dirname_in, languages):
     lines = []
-    # Either subtlex_zh is turned off, or it's just in Chinese
+    # Because there's Chinese-specific handling here, the valid options for
+    # 'languages' are [] and ['zh']. Make sure it's one of those.
     if not languages:
         return lines
     assert languages == ['zh']
@@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):

     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
+
+    If `lang` is given, it will apply language-specific tokenization to the
+    words that it reads.
     """
     values = defaultdict(float)
     total = 0.
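A rough sketch of the reading loop the docstring describes, assuming a two-column `word,value` CSV sorted by value in descending order; the function name and details here are illustrative, not the builder's actual code.

    import csv
    from collections import defaultdict

    def read_values_sketch(filename, cutoff=0, lang=None):
        values = defaultdict(float)
        total = 0.
        with open(filename, encoding='utf-8', newline='') as infile:
            for word, strval in csv.reader(infile):
                val = float(strval)
                # The file is sorted by value in descending order, so once a
                # value drops below the cutoff we can stop reading.
                if cutoff > 0 and val < cutoff:
                    break
                # When `lang` is given, the real code applies language-specific
                # tokenization to `word` before accumulating it; this sketch
                # keeps the word as-is.
                values[word] += val
                total += val
        return values, total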