mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
put back the freqs_to_cBpack cutoff; prepare for 1.0
This commit is contained in:
parent
32102ba3c2
commit
c5708b24e4
2
setup.py
2
setup.py
@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='1.0b4',
|
version='1.0',
|
||||||
maintainer='Luminoso Technologies, Inc.',
|
maintainer='Luminoso Technologies, Inc.',
|
||||||
maintainer_email='info@luminoso.com',
|
maintainer_email='info@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
|
@ -69,10 +69,16 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
|||||||
written to the new file.
|
written to the new file.
|
||||||
"""
|
"""
|
||||||
freq_cutoff = 10 ** (cutoff / 100.)
|
freq_cutoff = 10 ** (cutoff / 100.)
|
||||||
|
# freq_cutoff will only be effective here if the data we're reading
|
||||||
|
# is already normalized to frequencies. If we're reading counts,
|
||||||
|
# it just won't matter. This is why we check for cB <= cutoff again
|
||||||
|
# below.
|
||||||
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
|
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
|
||||||
cBpack = []
|
cBpack = []
|
||||||
for token, freq in freqs.items():
|
for token, freq in freqs.items():
|
||||||
cB = round(math.log10(freq) * 100)
|
cB = round(math.log10(freq) * 100)
|
||||||
|
if cB <= cutoff:
|
||||||
|
continue
|
||||||
neg_cB = -cB
|
neg_cB = -cB
|
||||||
while neg_cB >= len(cBpack):
|
while neg_cB >= len(cBpack):
|
||||||
cBpack.append([])
|
cBpack.append([])
|
||||||
|
Loading…
Reference in New Issue
Block a user