put back the freqs_to_cBpack cutoff; prepare for 1.0

2024-12-23 17:31:41 +00:00 · 2015-07-28 18:01:12 -04:00 · 2015-07-28 18:01:12 -04:00 · c5708b24e4
commit c5708b24e4
parent 32102ba3c2
2 changed files with 7 additions and 1 deletion
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):

 setup(
    name="wordfreq",
-    version='1.0b4',
+    version='1.0',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -69,10 +69,16 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
    written to the new file.
    """
    freq_cutoff = 10 ** (cutoff / 100.)
+    # freq_cutoff will only be effective here if the data we're reading
+    # is already normalized to frequencies. If we're reading counts,
+    # it just won't matter. This is why we check for cB <= cutoff again
+    # below.
    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
    cBpack = []
    for token, freq in freqs.items():
        cB = round(math.log10(freq) * 100)
+        if cB <= cutoff:
+            continue
        neg_cB = -cB
        while neg_cB >= len(cBpack):
            cBpack.append([])