From bc8ebd23e9d8c91fc35a8d98172c0daf28539f5b Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 8 Sep 2015 14:46:04 -0400
Subject: [PATCH] don't do language-specific tokenization in freqs_to_cBpack

Tokenizing in the 'merge' step is sufficient.
---
 wordfreq_builder/rules.ninja                         | 2 +-
 wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py | 3 +--
 wordfreq_builder/wordfreq_builder/word_counts.py     | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 2a4fa0f..49a2e10 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -95,7 +95,7 @@ rule merge_counts
   command = python -m wordfreq_builder.cli.merge_counts -o $out $in
 
 rule freqs2cB
-  command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
+  command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
 
 rule cat
   command = cat $in > $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
index 9d0b1dc..73edb44 100644
--- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
+++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
@@ -4,8 +4,7 @@ import argparse
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
+    freqs_to_cBpack(args.filename_in, args.filename_out)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 93c65aa..d083fb1 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -83,7 +83,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     return values
 
 
-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -94,7 +94,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     This cutoff should not be stacked with a cutoff in `read_freqs`; doing
     so would skew the resulting frequencies.
     """
-    freqs = read_freqs(in_filename, cutoff=0, lang=None)
+    freqs = read_freqs(in_filename, cutoff=0, lang=lang)
+    freqs = read_freqs(in_filename, cutoff=0, lang=None)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)