Don't use the file-reading cutoff when writing centibels

Former-commit-id: e9f9c94e36
2024-12-24 01:41:39 +00:00 · 2015-07-28 18:45:26 -04:00 · 2015-07-28 18:45:26 -04:00 · e9dd253f1d
commit e9dd253f1d
parent 0a032dfa97
29 changed files with 4 additions and 6 deletions
--- a/wordfreq/data/combined_ar.msgpack.gz
+++ b/wordfreq/data/combined_ar.msgpack.gz
--- a/wordfreq/data/combined_de.msgpack.gz
+++ b/wordfreq/data/combined_de.msgpack.gz
--- a/wordfreq/data/combined_el.msgpack.gz
+++ b/wordfreq/data/combined_el.msgpack.gz
--- a/wordfreq/data/combined_en.msgpack.gz
+++ b/wordfreq/data/combined_en.msgpack.gz
--- a/wordfreq/data/combined_es.msgpack.gz
+++ b/wordfreq/data/combined_es.msgpack.gz
--- a/wordfreq/data/combined_fr.msgpack.gz
+++ b/wordfreq/data/combined_fr.msgpack.gz
--- a/wordfreq/data/combined_id.msgpack.gz
+++ b/wordfreq/data/combined_id.msgpack.gz
--- a/wordfreq/data/combined_it.msgpack.gz
+++ b/wordfreq/data/combined_it.msgpack.gz
--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz
--- a/wordfreq/data/combined_ko.msgpack.gz
+++ b/wordfreq/data/combined_ko.msgpack.gz
--- a/wordfreq/data/combined_ms.msgpack.gz
+++ b/wordfreq/data/combined_ms.msgpack.gz
--- a/wordfreq/data/combined_nl.msgpack.gz
+++ b/wordfreq/data/combined_nl.msgpack.gz
--- a/wordfreq/data/combined_pt.msgpack.gz
+++ b/wordfreq/data/combined_pt.msgpack.gz
--- a/wordfreq/data/combined_ru.msgpack.gz
+++ b/wordfreq/data/combined_ru.msgpack.gz
--- a/wordfreq/data/combined_zh.msgpack.gz
+++ b/wordfreq/data/combined_zh.msgpack.gz
--- a/wordfreq/data/twitter_ar.msgpack.gz
+++ b/wordfreq/data/twitter_ar.msgpack.gz
--- a/wordfreq/data/twitter_de.msgpack.gz
+++ b/wordfreq/data/twitter_de.msgpack.gz
--- a/wordfreq/data/twitter_en.msgpack.gz
+++ b/wordfreq/data/twitter_en.msgpack.gz
--- a/wordfreq/data/twitter_es.msgpack.gz
+++ b/wordfreq/data/twitter_es.msgpack.gz
--- a/wordfreq/data/twitter_fr.msgpack.gz
+++ b/wordfreq/data/twitter_fr.msgpack.gz
--- a/wordfreq/data/twitter_id.msgpack.gz
+++ b/wordfreq/data/twitter_id.msgpack.gz
--- a/wordfreq/data/twitter_it.msgpack.gz
+++ b/wordfreq/data/twitter_it.msgpack.gz
--- a/wordfreq/data/twitter_ja.msgpack.gz
+++ b/wordfreq/data/twitter_ja.msgpack.gz
--- a/wordfreq/data/twitter_ko.msgpack.gz
+++ b/wordfreq/data/twitter_ko.msgpack.gz
--- a/wordfreq/data/twitter_ms.msgpack.gz
+++ b/wordfreq/data/twitter_ms.msgpack.gz
--- a/wordfreq/data/twitter_nl.msgpack.gz
+++ b/wordfreq/data/twitter_nl.msgpack.gz
--- a/wordfreq/data/twitter_pt.msgpack.gz
+++ b/wordfreq/data/twitter_pt.msgpack.gz
--- a/wordfreq/data/twitter_ru.msgpack.gz
+++ b/wordfreq/data/twitter_ru.msgpack.gz
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -67,13 +67,11 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):

    Only words with a frequency greater than `cutoff` centibels will be
    written to the new file.
+
+    This cutoff should not be stacked with a cutoff in `read_freqs`; doing
+    so would skew the resulting frequencies.
    """
-    freq_cutoff = 10 ** (cutoff / 100.)
-    # freq_cutoff will only be effective here if the data we're reading
-    # is already normalized to frequencies. If we're reading counts,
-    # it just won't matter. This is why we check for cB <= cutoff again
-    # below.
-    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
+    freqs = read_freqs(in_filename, cutoff=0, lang=lang)
    cBpack = []
    for token, freq in freqs.items():
        cB = round(math.log10(freq) * 100)