Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Merge branch 'master' into chinese-external-wordlist

Conflicts:
    wordfreq/chinese.py

commit cea2a61444
@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt
README.md (17 changes)
@@ -232,20 +232,14 @@ sources:
 
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
-<<<<<<< HEAD
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
-SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
-available at http://crr.ugent.be/programs-data/subtitle-frequencies.
-=======
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
-SUBTLEX-CH, created by Marc Brysbaert et al. and available at
+SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
+(see citations below) and available at
 http://crr.ugent.be/programs-data/subtitle-frequencies.
->>>>>>> greek-and-turkish
 
-I (Rob Speer) have
-obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
-in wordfreq, to be used for any purpose, not just for academic use, under these
-conditions:
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:
 
 - Wordfreq and code derived from it must credit the SUBTLEX authors.
 - It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
 SUBTLEX-UK: A new and improved word frequency database for British English.
 The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
 http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
-
@@ -1,7 +1,21 @@
+"""
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
+
+This is meant to be a normalization of text, somewhat like case-folding -- not
+an actual translator, a task for which this method would be unsuitable. We
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
+
+Generating this mapping requires the external Chinese conversion tool OpenCC.
+"""
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 
 def make_hanzi_table(filename):
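[Note, an illustration rather than part of the commit] The docstring's point in one concrete case: the Traditional spelling 愛 and the Simplified spelling 爱 are the same word, so the generated table keys the codepoint of 愛 to the character '爱', and word frequencies only need to be stored under the Simplified form.

    # Hypothetical single entry of the generated table (keys are integer codepoints):
    entry = {0x611B: '爱'}   # ord('愛') == 0x611B, mapped to Simplified '爱'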
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
                 print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':
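[Note, a sketch going only slightly beyond what the diff shows] Because only characters that OpenCC actually changes are stored (the `if chr(codept) != char` check), the resulting table can be applied with str.translate, and characters that are identical in both scripts pass through untouched:

    # Sketch with a hypothetical one-entry table; 我 and 你 are written the same
    # way in Simplified and Traditional Chinese, so only 愛 gets rewritten.
    table = {0x611B: '爱'}
    assert '我愛你'.translate(table) == '我爱你'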
@@ -14,10 +14,10 @@ def test_combination():
 
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )
 
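[Note, not part of the commit] The updated test encodes the rule that a phrase split into tokens with frequencies f1..fn gets the combined frequency 1 / (1/f1 + ... + 1/fn); for two equal tokens, as in 'おはようおはよう', that works out to half the single-token frequency. A minimal sketch of the arithmetic:

    # Illustration of the combination rule implied by the test (values are made up).
    def combined_freq(freqs):
        return 1.0 / sum(1.0 / f for f in freqs)

    assert abs(combined_freq([0.001, 0.001]) - 0.0005) < 1e-12   # two equal tokens -> freq / 2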
@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces. In Chinese, in particular,
+# we have to infer word boundaries from the frequencies of the words they
+# would create. When this happens, we should adjust the resulting frequency
+# to avoid creating a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}
 
+# We'll divide the frequency by 10 for each token boundary that was inferred.
+# (We determined the factor of 10 empirically by looking at words in the
+# Chinese wordlist that weren't common enough to be identified by the
+# tokenizer. These words would get split into multiple tokens, and their
+# inferred frequency would be on average 9.77 times higher than their actual
+# frequency.)
+INFERRED_SPACE_FACTOR = 10.0
+
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -85,10 +93,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available
 
 
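[Note, an illustration using an assumed file name] The new startswith('_') check keeps support files such as data/_chinese_mapping.msgpack.gz out of the listing; without it, a name like '_chinese_mapping' would not split cleanly into a (wordlist, language) pair. For an ordinary wordlist file the parsing works like this:

    # Hypothetical file name following the '<wordlist>_<lang>.msgpack.gz' pattern.
    name, lang = 'combined_en.msgpack.gz'.split('.')[0].split('_')
    assert (name, lang) == ('combined', 'en')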
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):
 
     freq = 1.0 / one_over_result
 
-    if lang in SPACELESS_LANGUAGES:
-        # Divide the frequency by 10 for each token boundary that was inferred.
-        # (We determined the factor of 10 empirically by looking at words in
-        # the Chinese wordlist that weren't common enough to be identified by
-        # the tokenizer. These words would get split into multiple tokens, and
-        # their inferred frequency would be on average 9.77 times higher than
-        # their actual frequency.)
-        freq /= 10 ** (len(tokens) - 1)
+    if lang in INFERRED_SPACE_LANGUAGES:
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
 
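[Note, a worked example with made-up numbers] The penalty above divides the combined frequency by INFERRED_SPACE_FACTOR for every token boundary the tokenizer had to infer, so a Chinese phrase split into three tokens is divided by 10 ** 2 = 100:

    INFERRED_SPACE_FACTOR = 10.0
    tokens = ['a', 'b', 'c']          # hypothetical 3-token split
    freq = 1e-6                       # hypothetical combined frequency
    freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
    assert abs(freq - 1e-8) < 1e-20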
(File diff suppressed because it is too large.)
@@ -1,12 +1,14 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
 
-jieba_tokenizer = None
-jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+jieba_tokenizer = None
+jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
BIN  wordfreq/data/_chinese_mapping.msgpack.gz (new file; binary file not shown)
@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):
 
 def jieba_deps(dirname_in, languages):
     lines = []
-    # Either subtlex_zh is turned off, or it's just in Chinese
+    # Because there's Chinese-specific handling here, the valid options for
+    # 'languages' are [] and ['zh']. Make sure it's one of those.
     if not languages:
         return lines
     assert languages == ['zh']
|
@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):
|
|||||||
|
|
||||||
If `cutoff` is greater than 0, the csv file must be sorted by value
|
If `cutoff` is greater than 0, the csv file must be sorted by value
|
||||||
in descending order.
|
in descending order.
|
||||||
|
|
||||||
|
If `lang` is given, it will apply language-specific tokenization to the
|
||||||
|
words that it reads.
|
||||||
"""
|
"""
|
||||||
values = defaultdict(float)
|
values = defaultdict(float)
|
||||||
total = 0.
|
total = 0.
|
||||||
|