diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz index 0564baf..629fbb9 100644 Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz index a6f79c1..1b72aa4 100644 Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz index 9d84707..8ea9a65 100644 Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz index 91d4f96..2e3e806 100644 Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz index 4ee5dc2..0e6f3d3 100644 Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz index cded8c3..f832c10 100644 Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz index 557f20c..dd5a9a3 100644 Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz index 896c091..f226c83 100644 Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz index 1f8c892..6b63293 100644 Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz index cc915f9..b3bb675 100644 Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz index 38700fd..9ddfeff 100644 Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz index 21a58d9..4a6e497 100644 Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz index 9fa94d6..9763390 100644 Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz index caf78fe..1fa086a 100644 Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz index bac7018..a21533c 100644 Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz index e47ece4..bef10ad 100644 Binary files a/wordfreq/data/twitter_ar.msgpack.gz and b/wordfreq/data/twitter_ar.msgpack.gz differ diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz index 0761ed8..4d2e354 100644 Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz index f7a58db..d65819f 100644 Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz index 1dd32bd..8091257 100644 Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz index c8a558f..73ba874 100644 Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz index f08f4e2..02af62f 100644 Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz index f452259..46600ba 100644 Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz index aed2c13..d54b8b2 100644 Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz index bcba7b9..0a5e21f 100644 Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz index 739021e..39456cd 100644 Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz index 081afc8..2231731 100644 Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz index 305b68b..f0de0e8 100644 Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz index 87e81bc..3955094 100644 Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 4d3efd2..8accf2b 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -67,13 +67,11 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None): Only words with a frequency greater than `cutoff` centibels will be written to the new file. + + This cutoff should not be stacked with a cutoff in `read_freqs`; doing + so would skew the resulting frequencies. """ - freq_cutoff = 10 ** (cutoff / 100.) - # freq_cutoff will only be effective here if the data we're reading - # is already normalized to frequencies. If we're reading counts, - # it just won't matter. This is why we check for cB <= cutoff again - # below. - freqs = read_freqs(in_filename, freq_cutoff, lang=lang) + freqs = read_freqs(in_filename, cutoff=0, lang=lang) cBpack = [] for token, freq in freqs.items(): cB = round(math.log10(freq) * 100)