Merge branch 'master' into chinese-external-wordlist

Conflicts:
	wordfreq/chinese.py

Former-commit-id: cea2a61444
Robyn Speer 2015-09-24 13:40:08 -04:00
commit e15a231401
10 changed files with 58 additions and 3317 deletions

View File

@@ -1,2 +1,3 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt

View File

@@ -232,20 +232,14 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org)
<<<<<<< HEAD
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. (see citations below) and
available at http://crr.ugent.be/programs-data/subtitle-frequencies.
=======
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, and
SUBTLEX-CH, created by Marc Brysbaert et al. and available at
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
(see citations below) and available at
http://crr.ugent.be/programs-data/subtitle-frequencies.
>>>>>>> greek-and-turkish
I (Robyn Speer) have
obtained permission by e-mail from Marc Brysbaert to distribute these wordlists
in wordfreq, to be used for any purpose, not just for academic use, under these
conditions:
I (Robyn Speer) have obtained permission by e-mail from Marc Brysbaert to
distribute these wordlists in wordfreq, to be used for any purpose, not just
for academic use, under these conditions:
- Wordfreq and code derived from it must credit the SUBTLEX authors.
- It must remain clear that SUBTLEX is freely available data.
@@ -297,4 +291,3 @@ Twitter; it does not display or republish any Twitter content.
SUBTLEX-UK: A new and improved word frequency database for British English.
The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521

View File

@@ -1,7 +1,21 @@
"""
Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
Chinese characters to their Simplified Chinese equivalents.
This is meant to be a normalization of text, somewhat like case-folding -- not
an actual translator, a task for which this method would be unsuitable. We
store word frequencies using Simplified Chinese characters so that, in the
large number of cases where a Traditional Chinese word has an obvious
Simplified Chinese mapping, we can get a frequency for it that's the same in
Simplified and Traditional Chinese.
Generating this mapping requires the external Chinese conversion tool OpenCC.
"""
import unicodedata
import itertools
import os
import pprint
import msgpack
import gzip
def make_hanzi_table(filename):
@@ -12,7 +26,7 @@ def make_hanzi_table(filename):
print('%5X\t%s' % (codept, char), file=out)
def make_hanzi_converter(table_in, python_out):
def make_hanzi_converter(table_in, msgpack_out):
table = {}
with open(table_in, encoding='utf-8') as infile:
for line in infile:
@@ -21,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
assert len(char) == 1
if chr(codept) != char:
table[codept] = char
with open(python_out, 'w', encoding='utf-8') as outfile:
print('SIMPLIFIED_MAP = ', end='', file=outfile)
pprint.pprint(table, stream=outfile)
with gzip.open(msgpack_out, 'wb') as outfile:
msgpack.dump(table, outfile, encoding='utf-8')
def build():
make_hanzi_table('/tmp/han_in.txt')
os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
if __name__ == '__main__':

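The script above now writes the Traditional-to-Simplified table as gzipped msgpack instead of a generated Python module. As a rough illustration of what that file holds, here is a minimal round-trip sketch that is not part of the commit; the sample entries and temporary path are hypothetical. The table maps integer Unicode codepoints to single-character Simplified strings.

```python
import gzip
import msgpack

# Hypothetical sample entries: 體 -> 体, 龍 -> 龙
table = {0x9AD4: '体', 0x9F8D: '龙'}

with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'wb') as outfile:
    msgpack.dump(table, outfile)

with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'rb') as infile:
    # On msgpack-python >= 1.0, strict_map_key=False is required because the
    # keys are ints; the code in this diff targets an older msgpack API that
    # took an `encoding` argument instead.
    loaded = msgpack.load(infile, strict_map_key=False)

assert loaded == table
```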
View File

@@ -14,10 +14,10 @@ def test_combination():
assert_almost_equal(
word_frequency('おはようおはよう', 'ja'),
ohayou_freq / 20
ohayou_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('おはようございます', 'ja'),
(100.0 / ohayou_freq + 100.0 / gozai_freq + 100.0 / masu_freq)
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
)

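The updated assertions drop the old factor-of-10 penalty for Japanese and encode only the combination rule: the reciprocal of a phrase's frequency is the sum of the reciprocals of its tokens' frequencies. A small worked sketch with made-up frequency values, not taken from the wordlists:

```python
def combine_freqs(token_freqs):
    # 1 / f(phrase) = sum over tokens of 1 / f(token)
    return 1.0 / sum(1.0 / f for f in token_freqs)

ohayou_freq = 2.0e-4   # おはよう (hypothetical value)
gozai_freq = 5.0e-5    # ござい (hypothetical value)
masu_freq = 1.0e-2     # ます (hypothetical value)

# Two copies of the same token give half its frequency, which is what the
# 'おはようおはよう' assertion above checks:
assert abs(combine_freqs([ohayou_freq, ohayou_freq]) - ohayou_freq / 2) < 1e-15

# Three distinct tokens combine the same way, matching the second assertion:
combined = combine_freqs([ohayou_freq, gozai_freq, masu_freq])
assert abs(1.0 / combined - (1 / ohayou_freq + 1 / gozai_freq + 1 / masu_freq)) < 1e-6
```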
View File

@@ -15,11 +15,19 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
# Chinese and Japanese are written without spaces. This means we have to
# run language-specific code to infer token boundaries on them, and also
# that we need to adjust frequencies of multi-token phrases to account
# for the fact that token boundaries were inferred.
SPACELESS_LANGUAGES = {'zh', 'ja'}
# Chinese and Japanese are written without spaces. In Chinese, in particular,
# we have to infer word boundaries from the frequencies of the words they
# would create. When this happens, we should adjust the resulting frequency
# to avoid creating a bias toward improbable word combinations.
INFERRED_SPACE_LANGUAGES = {'zh'}
# We'll divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in the
# Chinese wordlist that weren't common enough to be identified by the
# tokenizer. These words would get split into multiple tokens, and their
# inferred frequency would be on average 9.77 times higher than their actual
# frequency.)
INFERRED_SPACE_FACTOR = 10.0
# simple_tokenize is imported so that other things can import it from here.
# Suppress the pyflakes warning.
@@ -85,10 +93,11 @@ def available_languages(wordlist='combined'):
"""
available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = str(path)
if not path.name.startswith('_'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
if name == wordlist:
available[lang] = str(path)
return available
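The new startswith('_') check keeps internal support files such as _chinese_mapping.msgpack.gz from being reported as languages. A hypothetical usage sketch of the behavior this hunk protects, assuming wordfreq is installed with this change:

```python
import os
from wordfreq import available_languages

langs = available_languages('combined')   # e.g. {'en': '/.../combined_en.msgpack.gz', ...}
for lang, path in langs.items():
    name = os.path.basename(path)
    assert not name.startswith('_')               # support files are skipped
    assert name.startswith('combined_' + lang)    # only real wordlists remain
```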
@@ -188,14 +197,8 @@ def _word_frequency(word, lang, wordlist, minimum):
freq = 1.0 / one_over_result
if lang in SPACELESS_LANGUAGES:
# Divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in
# the Chinese wordlist that weren't common enough to be identified by
# the tokenizer. These words would get split into multiple tokens, and
# their inferred frequency would be on average 9.77 times higher than
# their actual frequency.)
freq /= 10 ** (len(tokens) - 1)
if lang in INFERRED_SPACE_LANGUAGES:
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
return max(freq, minimum)

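With the explanation promoted to the comment block at the top of the module, the body of _word_frequency reduces to the single adjustment shown above. A standalone sketch of how the two constants combine; the helper name is hypothetical, not from the diff:

```python
INFERRED_SPACE_LANGUAGES = {'zh'}
INFERRED_SPACE_FACTOR = 10.0

def apply_inferred_space_penalty(freq, tokens, lang):
    # Divide by 10 for each token boundary that had to be inferred, so that
    # rare strings don't look probable just because their pieces are common.
    if lang in INFERRED_SPACE_LANGUAGES:
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
    return freq

# A Chinese string split into two tokens keeps 1/10 of its combined frequency:
print(apply_inferred_space_penalty(1e-3, ['词', '频'], 'zh'))                # 1e-4
# Japanese is no longer penalized this way:
print(apply_inferred_space_penalty(1e-3, ['おはよう', 'ございます'], 'ja'))  # 1e-3
```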
File diff suppressed because it is too large

View File

@@ -1,12 +1,14 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba
import msgpack
import gzip
jieba_tokenizer = None
jieba_orig_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
jieba_tokenizer = None
jieba_orig_tokenizer = None
def simplify_chinese(text):

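The body of simplify_chinese is cut off by the diff. Since SIMPLIFIED_MAP is now a dict from codepoints to characters, which is exactly the shape str.translate() accepts, a plausible minimal sketch is a single translation pass; this is an assumption, not necessarily the verbatim implementation:

```python
def simplify_chinese(text):
    # Map each Traditional codepoint to its Simplified equivalent; the
    # casefold() step is an assumed extra normalization, not confirmed here.
    return text.translate(SIMPLIFIED_MAP).casefold()

# e.g. simplify_chinese('中國') would return '中国' if 0x570B is in the map.
```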
Binary file not shown.

View File

@@ -215,7 +215,8 @@ def opensubtitles_deps(dirname_in, languages):
def jieba_deps(dirname_in, languages):
lines = []
# Either subtlex_zh is turned off, or it's just in Chinese
# Because there's Chinese-specific handling here, the valid options for
# 'languages' are [] and ['zh']. Make sure it's one of those.
if not languages:
return lines
assert languages == ['zh']

View File

@@ -42,6 +42,9 @@ def read_values(filename, cutoff=0, lang=None):
If `cutoff` is greater than 0, the csv file must be sorted by value
in descending order.
If `lang` is given, it will apply language-specific tokenization to the
words that it reads.
"""
values = defaultdict(float)
total = 0.
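The rest of read_values is cut off by the diff. A compact sketch of the loop the docstring describes, as a paraphrase rather than the builder's actual code; it assumes wordfreq.tokenize is the language-specific tokenizer the docstring refers to:

```python
import csv
from collections import defaultdict

from wordfreq import tokenize   # assumed to be the tokenizer in question


def read_values_sketch(filename, cutoff=0, lang=None):
    values = defaultdict(float)
    total = 0.0
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            if cutoff > 0 and val < cutoff:
                break                    # rows are sorted by value, descending
            tokens = tokenize(key, lang) if lang is not None else [key]
            for token in tokens:
                values[token] += val
            total += val
    return values, total
```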