mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
parent
2c5ffa1b52
commit
f3958d63ae
@ -1,5 +1,5 @@
|
|||||||
from wordfreq import (
|
from wordfreq import (
|
||||||
word_frequency, available_languages, dB_to_freq, iter_wordlist,
|
word_frequency, available_languages, cB_to_freq, iter_wordlist,
|
||||||
top_n_list, random_words, random_ascii_words
|
top_n_list, random_words, random_ascii_words
|
||||||
)
|
)
|
||||||
from nose.tools import (
|
from nose.tools import (
|
||||||
@ -48,7 +48,7 @@ def test_most_common_words():
|
|||||||
return top_n_list(lang, 1)[0]
|
return top_n_list(lang, 1)[0]
|
||||||
|
|
||||||
eq_(get_most_common('ar'), 'في')
|
eq_(get_most_common('ar'), 'في')
|
||||||
eq_(get_most_common('de'), 'der')
|
eq_(get_most_common('de'), 'die')
|
||||||
eq_(get_most_common('en'), 'the')
|
eq_(get_most_common('en'), 'the')
|
||||||
eq_(get_most_common('es'), 'de')
|
eq_(get_most_common('es'), 'de')
|
||||||
eq_(get_most_common('fr'), 'de')
|
eq_(get_most_common('fr'), 'de')
|
||||||
@ -70,15 +70,15 @@ def test_language_matching():
|
|||||||
eq_(word_frequency('的', 'cmn'), freq)
|
eq_(word_frequency('的', 'cmn'), freq)
|
||||||
|
|
||||||
|
|
||||||
def test_dB_conversion():
|
def test_cB_conversion():
|
||||||
eq_(dB_to_freq(0), 1.)
|
eq_(cB_to_freq(0), 1.)
|
||||||
assert_almost_equal(dB_to_freq(-10), 0.1)
|
assert_almost_equal(cB_to_freq(-100), 0.1)
|
||||||
assert_almost_equal(dB_to_freq(-60), 1e-6)
|
assert_almost_equal(cB_to_freq(-600), 1e-6)
|
||||||
|
|
||||||
|
|
||||||
@raises(ValueError)
|
@raises(ValueError)
|
||||||
def test_failed_dB_conversion():
|
def test_failed_cB_conversion():
|
||||||
dB_to_freq(1)
|
cB_to_freq(1)
|
||||||
|
|
||||||
|
|
||||||
def test_tokenization():
|
def test_tokenization():
|
||||||
|
@ -53,36 +53,56 @@ def tokenize(text, lang):
|
|||||||
return simple_tokenize(text)
|
return simple_tokenize(text)
|
||||||
|
|
||||||
|
|
||||||
def read_dBpack(filename):
|
def read_cBpack(filename):
|
||||||
"""
|
"""
|
||||||
Read a file from an idiosyncratic format that we use for storing
|
Read a file from an idiosyncratic format that we use for storing
|
||||||
approximate word frequencies, called "dBpack".
|
approximate word frequencies, called "cBpack".
|
||||||
|
|
||||||
The dBpack format is as follows:
|
The cBpack format is as follows:
|
||||||
|
|
||||||
- The file on disk is a gzipped file in msgpack format, which decodes to a
|
- The file on disk is a gzipped file in msgpack format, which decodes to a
|
||||||
list of lists of words.
|
list whose first element is a header, and whose remaining elements are
|
||||||
|
lists of words.
|
||||||
|
|
||||||
|
- The header is a dictionary with 'format' and 'version' keys that make
|
||||||
|
sure that we're reading the right thing.
|
||||||
|
|
||||||
- Each inner list of words corresponds to a particular word frequency,
|
- Each inner list of words corresponds to a particular word frequency,
|
||||||
rounded to the nearest decibel. 0 dB represents a word that occurs with
|
rounded to the nearest centibel -- that is, one tenth of a decibel, or
|
||||||
probability 1, so it is the only word in the data (this of course doesn't
|
a factor of 10 ** .01.
|
||||||
happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB
|
|
||||||
represents a word that occurs once per 1000 tokens, and so on.
|
|
||||||
|
|
||||||
- The index of each list within the overall list is the negative of its
|
0 cB represents a word that occurs with probability 1, so it is the only
|
||||||
frequency in decibels.
|
word in the data (this of course doesn't happen). -200 cB represents a
|
||||||
|
word that occurs once per 100 tokens, -300 cB represents a word that
|
||||||
|
occurs once per 1000 tokens, and so on.
|
||||||
|
|
||||||
|
- The index of each list within the overall list (without the header) is
|
||||||
|
the negative of its frequency in centibels.
|
||||||
|
|
||||||
- Each inner list is sorted in alphabetical order.
|
- Each inner list is sorted in alphabetical order.
|
||||||
|
|
||||||
As an example, consider a corpus consisting only of the words "red fish
|
As an example, consider a corpus consisting only of the words "red fish
|
||||||
blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red"
|
blue fish". The word "fish" occurs as 50% of tokens (-30 cB), while "red"
|
||||||
and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word
|
and "blue" occur as 25% of tokens (-60 cB). The cBpack file of their word
|
||||||
frequencies would decode to this list:
|
frequencies would decode to this:
|
||||||
|
|
||||||
[[], [], [], ['fish'], [], [], ['blue', 'red']]
|
[
|
||||||
|
{'format': 'cB', 'version': 1},
|
||||||
|
[], [], [], ... # 30 empty lists
|
||||||
|
['fish'],
|
||||||
|
[], [], [], ... # 29 more empty lists
|
||||||
|
['blue', 'red']
|
||||||
|
]
|
||||||
"""
|
"""
|
||||||
with gzip.open(filename, 'rb') as infile:
|
with gzip.open(filename, 'rb') as infile:
|
||||||
return msgpack.load(infile, encoding='utf-8')
|
data = msgpack.load(infile, encoding='utf-8')
|
||||||
|
header = data[0]
|
||||||
|
if (
|
||||||
|
not isinstance(header, dict) or header.get('format') != 'cB'
|
||||||
|
or header.get('version') != 1
|
||||||
|
):
|
||||||
|
raise ValueError("Unexpected header: %r" % header)
|
||||||
|
return data[1:]
|
||||||
|
|
||||||
|
|
||||||
def available_languages(wordlist='combined'):
|
def available_languages(wordlist='combined'):
|
||||||
@ -103,7 +123,7 @@ def available_languages(wordlist='combined'):
|
|||||||
def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
|
def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
|
||||||
"""
|
"""
|
||||||
Read the raw data from a wordlist file, returning it as a list of
|
Read the raw data from a wordlist file, returning it as a list of
|
||||||
lists. (See `read_dBpack` for what this represents.)
|
lists. (See `read_cBpack` for what this represents.)
|
||||||
|
|
||||||
Because we use the `langcodes` module, we can handle slight
|
Because we use the `langcodes` module, we can handle slight
|
||||||
variations in language codes. For example, looking for 'pt-BR',
|
variations in language codes. For example, looking for 'pt-BR',
|
||||||
@ -123,25 +143,25 @@ def get_frequency_list(lang, wordlist='combined', match_cutoff=30):
|
|||||||
% (lang, best, langcodes.get(best).language_name('en'))
|
% (lang, best, langcodes.get(best).language_name('en'))
|
||||||
)
|
)
|
||||||
|
|
||||||
return read_dBpack(available[best])
|
return read_cBpack(available[best])
|
||||||
|
|
||||||
|
|
||||||
def dB_to_freq(dB):
|
def cB_to_freq(cB):
|
||||||
"""
|
"""
|
||||||
Convert a word frequency from the logarithmic decibel scale that we use
|
Convert a word frequency from the logarithmic centibel scale that we use
|
||||||
internally, to a proportion from 0 to 1.
|
internally, to a proportion from 0 to 1.
|
||||||
|
|
||||||
On this scale, 0 dB represents the maximum possible frequency of
|
On this scale, 0 cB represents the maximum possible frequency of
|
||||||
1.0. -10 dB represents a word that happens 1 in 10 times,
|
1.0. -100 cB represents a word that happens 1 in 10 times,
|
||||||
-20 dB represents something that happens 1 in 100 times, and so on.
|
-200 cB represents something that happens 1 in 100 times, and so on.
|
||||||
|
|
||||||
In general, x dB represents a frequency of 10 ** (x/10).
|
In general, x cB represents a frequency of 10 ** (x/100).
|
||||||
"""
|
"""
|
||||||
if dB > 0:
|
if cB > 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"A frequency cannot be a positive number of decibels."
|
"A frequency cannot be a positive number of centibels."
|
||||||
)
|
)
|
||||||
return 10 ** (dB / 10)
|
return 10 ** (cB / 100)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
@ -154,7 +174,7 @@ def get_frequency_dict(lang, wordlist='combined', match_cutoff=30):
|
|||||||
pack = get_frequency_list(lang, wordlist, match_cutoff)
|
pack = get_frequency_list(lang, wordlist, match_cutoff)
|
||||||
for index, bucket in enumerate(pack):
|
for index, bucket in enumerate(pack):
|
||||||
for word in bucket:
|
for word in bucket:
|
||||||
freqs[word] = dB_to_freq(-index)
|
freqs[word] = cB_to_freq(-index)
|
||||||
return freqs
|
return freqs
|
||||||
|
|
||||||
|
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user