put back the freqs_to_cBpack cutoff; prepare for 1.0

This commit is contained in:
Rob Speer 2015-07-28 18:01:12 -04:00
parent 32102ba3c2
commit c5708b24e4
2 changed files with 7 additions and 1 deletions

View File

@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.0b4',
version='1.0',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

View File

@ -69,10 +69,16 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
written to the new file.
"""
freq_cutoff = 10 ** (cutoff / 100.)
# The freq_cutoff passed to read_freqs is only effective if the data we're
# reading is already normalized to frequencies. If we're reading raw counts,
# that cutoff has no practical effect at this stage, which is why we check
# for cB <= cutoff again below.
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
cBpack = []
for token, freq in freqs.items():
cB = round(math.log10(freq) * 100)
if cB <= cutoff:
continue
neg_cB = -cB
while neg_cB >= len(cBpack):
cBpack.append([])