Merge branch 'master' into chinese-external-wordlist

Conflicts:
	wordfreq/chinese.py
Rob Speer 2015-09-24 13:40:08 -04:00
commit cea2a61444
10 changed files with 58 additions and 3317 deletions

View File

@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt

View File

@@ -232,20 +232,14 @@ sources:
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
-<<<<<<< HEAD
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
-SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
-available at http://crr.ugent.be/programs-data/subtitle-frequencies.
-=======
-It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
-SUBTLEX-CH, created by Marc Brysbaert et al. and available at
-http://crr.ugent.be/programs-data/subtitle-frequencies.
->>>>>>> greek-and-turkish
-I (Rob Speer) have
-obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
-in wordfreq, to be used for any purpose, not just for academic use, under these
-conditions:
+It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
+SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
+(see citations below) and available at
+http://crr.ugent.be/programs-data/subtitle-frequencies.
+I (Rob Speer) have obtained permission by e-mail from Marc Brysbaert to
+distribute these wordlists in wordfreq, to be used for any purpose, not just
+for academic use, under these conditions:
 
 - Wordfreq and code derived from it must credit the SUBTLEX authors.
 - It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
 SUBTLEX-UK: A new and improved word frequency database for British English.
 The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
 http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521

View File

@@ -1,7 +1,21 @@
+"""
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
+
+This is meant to be a normalization of text, somewhat like case-folding -- not
+an actual translator, a task for which this method would be unsuitable. We
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
+
+Generating this mapping requires the external Chinese conversion tool OpenCC.
+"""
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 
 def make_hanzi_table(filename):
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
            print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':
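
For context, a minimal sketch (not part of the commit) of the round trip the new script sets up: a dict of Traditional codepoints to Simplified characters, stored as gzipped msgpack, in exactly the shape str.translate consumes. The two sample characters and the temporary path are illustrative only, and the sketch uses the msgpack >= 1.0 API rather than the older encoding='utf-8' form that appears in the diff.

import gzip
import msgpack

# A toy version of the table make_hanzi_converter builds: Unicode codepoints
# of Traditional characters mapped to their Simplified equivalents.
table = {ord('漢'): '汉', ord('語'): '语'}

# Write it the way the new script does: msgpack inside gzip.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'wb') as outfile:
    outfile.write(msgpack.packb(table, use_bin_type=True))

# Read it back; integer keys require strict_map_key=False on msgpack >= 1.0.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'rb') as infile:
    loaded = msgpack.unpackb(infile.read(), raw=False, strict_map_key=False)

# A codepoint -> str dict is exactly the mapping str.translate accepts.
assert '漢語'.translate(loaded) == '汉语'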

View File

@@ -14,10 +14,10 @@ def test_combination():
     assert_almost_equal(
         word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 20
+        ohayou_freq / 2
     )
     assert_almost_equal(
         1.0 / word_frequency('おはようございます', 'ja'),
-        (100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )
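
The updated assertions encode a reciprocal-sum rule: the frequency of a phrase that had to be split into tokens is the reciprocal of the sum of the tokens' reciprocal frequencies. A short sketch with made-up numbers (wordfreq's real values come from its wordlists):

# Hypothetical per-token frequencies, standing in for the wordlist values.
ohayou_freq, gozai_freq, masu_freq = 2e-4, 5e-4, 1e-2

# Repeating a token halves the frequency: 1 / (1/f + 1/f) == f / 2,
# which is why the first assertion now expects ohayou_freq / 2.
double_ohayou = 1.0 / (1.0 / ohayou_freq + 1.0 / ohayou_freq)
assert abs(double_ohayou - ohayou_freq / 2) < 1e-12

# A longer phrase combines every token the same way; this is what the
# second assertion checks against word_frequency('おはようございます', 'ja').
combined = 1.0 / (1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)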

View File

@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-# Chinese and Japanese are written without spaces. This means we have to
-# run language-specific code to infer token boundaries on them, and also
-# that we need to adjust frequencies of multi-token phrases to account
-# for the fact that token boundaries were inferred.
-SPACELESS_LANGUAGES = {'zh', 'ja'}
+# Chinese and Japanese are written without spaces. In Chinese, in particular,
+# we have to infer word boundaries from the frequencies of the words they
+# would create. When this happens, we should adjust the resulting frequency
+# to avoid creating a bias toward improbable word combinations.
+INFERRED_SPACE_LANGUAGES = {'zh'}
+
+# We'll divide the frequency by 10 for each token boundary that was inferred.
+# (We determined the factor of 10 empirically by looking at words in the
+# Chinese wordlist that weren't common enough to be identified by the
+# tokenizer. These words would get split into multiple tokens, and their
+# inferred frequency would be on average 9.77 times higher than their actual
+# frequency.)
+INFERRED_SPACE_FACTOR = 10.0
 
 # simple_tokenize is imported so that other things can import it from here.
 # Suppress the pyflakes warning.
@@ -85,10 +93,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available
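
The new startswith('_') guard keeps internal files such as _chinese_mapping.msgpack.gz from being parsed as wordlists (their names do not split cleanly into a wordlist name and a language code). A toy illustration with hypothetical file names:

# Hypothetical directory contents; the real files live in wordfreq/data.
names = ['combined_en.msgpack.gz', 'twitter_ar.msgpack.gz',
         '_chinese_mapping.msgpack.gz']

available = {}
for name in names:
    if not name.startswith('_'):        # skip internal data files
        list_name = name.split('.')[0]  # e.g. 'combined_en'
        wordlist, lang = list_name.split('_')
        if wordlist == 'combined':
            available[lang] = name

assert available == {'en': 'combined_en.msgpack.gz'}
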
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):
     freq = 1.0 / one_over_result
 
-    if lang in SPACELESS_LANGUAGES:
-        # Divide the frequency by 10 for each token boundary that was inferred.
-        # (We determined the factor of 10 empirically by looking at words in
-        # the Chinese wordlist that weren't common enough to be identified by
-        # the tokenizer. These words would get split into multiple tokens, and
-        # their inferred frequency would be on average 9.77 times higher than
-        # their actual frequency.)
-        freq /= 10 ** (len(tokens) - 1)
+    if lang in INFERRED_SPACE_LANGUAGES:
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
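
In effect, each inferred token boundary costs a factor of ten. A small worked example with made-up numbers (the tokenization and frequencies are hypothetical):

INFERRED_SPACE_FACTOR = 10.0

tokens = ['谢谢', '谢谢', '你']  # hypothetical split: 3 tokens, 2 inferred boundaries
combined_freq = 1e-5            # hypothetical reciprocal-sum frequency
adjusted = combined_freq / INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
# adjusted is combined_freq / 100, i.e. about 1e-7, penalizing the two
# boundaries the tokenizer had to guess.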

File diff suppressed because it is too large

View File

@@ -1,12 +1,14 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
-jieba_tokenizer = None
-jieba_orig_tokenizer = None
 
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+jieba_tokenizer = None
+jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
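
The body of simplify_chinese lies outside this hunk. As a plausible sketch (not necessarily wordfreq's exact implementation), a codepoint-keyed SIMPLIFIED_MAP is precisely the kind of table that str.translate consumes; the two entries below are illustrative only:

# Illustrative entries; the real SIMPLIFIED_MAP is generated by OpenCC and
# holds many codepoint -> character pairs.
SIMPLIFIED_MAP = {ord('點'): '点', ord('鐘'): '钟'}

def simplify_chinese_sketch(text):
    # Map Traditional characters to Simplified ones so lookups hit the
    # Simplified-Chinese wordlist; unmapped characters pass through unchanged.
    return text.translate(SIMPLIFIED_MAP)

assert simplify_chinese_sketch('三點鐘') == '三点钟'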

Binary file not shown.

View File

@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):
 def jieba_deps(dirname_in, languages):
     lines = []
-    # Either subtlex_zh is turned off, or it's just in Chinese
+    # Because there's Chinese-specific handling here, the valid options for
+    # 'languages' are [] and ['zh']. Make sure it's one of those.
     if not languages:
         return lines
     assert languages == ['zh']

View File

@@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):
     If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
+
+    If `lang` is given, it will apply language-specific tokenization to the
+    words that it reads.
     """
     values = defaultdict(float)
     total = 0.
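
A simplified sketch of the behavior the amended docstring describes, not the actual wordfreq_builder implementation; the normalize helper below stands in for whatever language-specific tokenization read_values applies:

import csv
from collections import defaultdict

def normalize(word, lang):
    # Stand-in for language-specific tokenization; the real builder delegates
    # to wordfreq's tokenizers (e.g. jieba for 'zh').
    return word.casefold()

def read_values_sketch(filename, cutoff=0, lang=None):
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strval in csv.reader(infile):
            value = float(strval)
            if cutoff and value < cutoff:
                break  # valid only because the file is sorted by descending value
            if lang is not None:
                word = normalize(word, lang)
            values[word] += value
            total += value
    return values, total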