diff --git a/tests/test.py b/tests/test.py index abc33c3..91f990a 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,5 +1,5 @@ from wordfreq import ( - word_frequency, available_languages, dB_to_freq, iter_wordlist, + word_frequency, available_languages, cB_to_freq, iter_wordlist, top_n_list, random_words, random_ascii_words ) from nose.tools import ( @@ -48,7 +48,7 @@ def test_most_common_words(): return top_n_list(lang, 1)[0] eq_(get_most_common('ar'), 'في') - eq_(get_most_common('de'), 'der') + eq_(get_most_common('de'), 'die') eq_(get_most_common('en'), 'the') eq_(get_most_common('es'), 'de') eq_(get_most_common('fr'), 'de') @@ -70,15 +70,15 @@ def test_language_matching(): eq_(word_frequency('的', 'cmn'), freq) -def test_dB_conversion(): - eq_(dB_to_freq(0), 1.) - assert_almost_equal(dB_to_freq(-10), 0.1) - assert_almost_equal(dB_to_freq(-60), 1e-6) +def test_cB_conversion(): + eq_(cB_to_freq(0), 1.) + assert_almost_equal(cB_to_freq(-100), 0.1) + assert_almost_equal(cB_to_freq(-600), 1e-6) @raises(ValueError) -def test_failed_dB_conversion(): - dB_to_freq(1) +def test_failed_cB_conversion(): + cB_to_freq(1) def test_tokenization(): diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 3858b98..dd26811 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -53,36 +53,56 @@ def tokenize(text, lang): return simple_tokenize(text) -def read_dBpack(filename): +def read_cBpack(filename): """ Read a file from an idiosyncratic format that we use for storing - approximate word frequencies, called "dBpack". + approximate word frequencies, called "cBpack". - The dBpack format is as follows: + The cBpack format is as follows: - The file on disk is a gzipped file in msgpack format, which decodes to a - list of lists of words. + list whose first element is a header, and whose remaining elements are + lists of words. + + - The header is a dictionary with 'format' and 'version' keys that make + sure that we're reading the right thing.
- Each inner list of words corresponds to a particular word frequency, - rounded to the nearest decibel. 0 dB represents a word that occurs with - probability 1, so it is the only word in the data (this of course doesn't - happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB - represents a word that occurs once per 1000 tokens, and so on. + rounded to the nearest centibel -- that is, one tenth of a decibel, or + a factor of 10 ** .01. - - The index of each list within the overall list is the negative of its - frequency in decibels. + 0 cB represents a word that occurs with probability 1, so it is the only + word in the data (this of course doesn't happen). -200 cB represents a + word that occurs once per 100 tokens, -300 cB represents a word that + occurs once per 1000 tokens, and so on. + + - The index of each list within the overall list (without the header) is + the negative of its frequency in centibels. - Each inner list is sorted in alphabetical order. As an example, consider a corpus consisting only of the words "red fish - blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red" - and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word - frequencies would decode to this list: + blue fish". The word "fish" occurs as 50% of tokens (-30 cB), while "red" + and "blue" occur as 25% of tokens (-60 cB). The cBpack file of their word + frequencies would decode to this: - [[], [], [], ['fish'], [], [], ['blue', 'red']] + [ + {'format': 'cB', 'version': 1}, + [], [], [], ... # 30 empty lists + ['fish'], + [], [], [], ... 
# 29 more empty lists + ['blue', 'red'] + ] """ with gzip.open(filename, 'rb') as infile: - return msgpack.load(infile, encoding='utf-8') + data = msgpack.load(infile, encoding='utf-8') + header = data[0] + if ( + not isinstance(header, dict) or header.get('format') != 'cB' + or header.get('version') != 1 + ): + raise ValueError("Unexpected header: %r" % header) + return data[1:] def available_languages(wordlist='combined'): @@ -103,7 +123,7 @@ def available_languages(wordlist='combined'): def get_frequency_list(lang, wordlist='combined', match_cutoff=30): """ Read the raw data from a wordlist file, returning it as a list of - lists. (See `read_dBpack` for what this represents.) + lists. (See `read_cBpack` for what this represents.) Because we use the `langcodes` module, we can handle slight variations in language codes. For example, looking for 'pt-BR', @@ -123,25 +143,25 @@ def get_frequency_list(lang, wordlist='combined', match_cutoff=30): % (lang, best, langcodes.get(best).language_name('en')) ) - return read_dBpack(available[best]) + return read_cBpack(available[best]) -def dB_to_freq(dB): +def cB_to_freq(cB): """ - Convert a word frequency from the logarithmic decibel scale that we use + Convert a word frequency from the logarithmic centibel scale that we use internally, to a proportion from 0 to 1. - On this scale, 0 dB represents the maximum possible frequency of - 1.0. -10 dB represents a word that happens 1 in 10 times, - -20 dB represents something that happens 1 in 100 times, and so on. + On this scale, 0 cB represents the maximum possible frequency of + 1.0. -100 cB represents a word that happens 1 in 10 times, + -200 cB represents something that happens 1 in 100 times, and so on. - In general, x dB represents a frequency of 10 ** (x/10). + In general, x cB represents a frequency of 10 ** (x/100). """ - if dB > 0: + if cB > 0: raise ValueError( - "A frequency cannot be a positive number of decibels." + "A frequency cannot be a positive number of centibels."
) - return 10 ** (dB / 10) + return 10 ** (cB / 100) @lru_cache(maxsize=None) @@ -154,7 +174,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30): pack = get_frequency_list(lang, wordlist, match_cutoff) for index, bucket in enumerate(pack): for word in bucket: - freqs[word] = dB_to_freq(-index) + freqs[word] = cB_to_freq(-index) return freqs diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz index c97a420..5ce708f 100644 Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz index eec9c65..35384fc 100644 Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz index 107e3b5..bc0beab 100644 Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz index 842df21..673b9ca 100644 Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz index ff030ca..9a4b475 100644 Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz index 33140ee..bbcfc60 100644 Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz index 7da4279..4ff43c3 100644 Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz 
index 6460266..f02c507 100644 Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz index 0b515b1..1c38166 100644 Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz index f23a604..5469582 100644 Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz index ca7e111..3c63beb 100644 Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz index eaaef0b..eed5525 100644 Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz index c7fb843..b8251b3 100644 Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz index f2d848c..6a05d2b 100644 Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz index 84cf890..c988cb8 100644 Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ