don't do language-specific tokenization in freqs_to_cBpack

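Tokenizing in the 'merge' step is sufficient, so freqs_to_cBpack (and its command-line wrapper and build rule) no longer takes a language argument.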


Former-commit-id: bc8ebd23e9
This commit is contained in:
Robyn Speer 2015-09-08 14:46:04 -04:00
parent 64b0b76ee1
commit 4aef1dc338
3 changed files with 4 additions and 5 deletions

View File

@ -95,7 +95,7 @@ rule merge_counts
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
rule freqs2cB
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
rule cat
command = cat $in > $out

View File

@ -4,8 +4,7 @@ import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('language', help='language of the input file')
parser.add_argument('filename_in', help='name of input file containing tokens')
parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args()
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
freqs_to_cBpack(args.filename_in, args.filename_out)

View File

@ -83,7 +83,7 @@ def read_freqs(filename, cutoff=0, lang=None):
return values
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
"""
Convert a csv file of words and their frequencies to a file in the
idiosyncratic 'cBpack' format.
@ -94,7 +94,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
This cutoff should not be stacked with a cutoff in `read_freqs`; doing
so would skew the resulting frequencies.
"""
freqs = read_freqs(in_filename, cutoff=0, lang=lang)
freqs = read_freqs(in_filename, cutoff=0, lang=None)
cBpack = []
for token, freq in freqs.items():
cB = round(math.log10(freq) * 100)