don't do language-specific tokenization in freqs_to_cBpack

Tokenizing in the 'merge' step is sufficient.


Former-commit-id: bc8ebd23e9
This commit is contained in:
Robyn Speer 2015-09-08 14:46:04 -04:00
parent 64b0b76ee1
commit 4aef1dc338
3 changed files with 4 additions and 5 deletions

View File

@@ -95,7 +95,7 @@ rule merge_counts
command = python -m wordfreq_builder.cli.merge_counts -o $out $in command = python -m wordfreq_builder.cli.merge_counts -o $out $in
rule freqs2cB rule freqs2cB
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
rule cat rule cat
command = cat $in > $out command = cat $in > $out

View File

@@ -4,8 +4,7 @@ import argparse
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('language', help='language of the input file')
parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_in', help='name of input file containing tokens')
parser.add_argument('filename_out', help='name of output file') parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args() args = parser.parse_args()
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language) freqs_to_cBpack(args.filename_in, args.filename_out)

View File

@@ -83,7 +83,7 @@ def read_freqs(filename, cutoff=0, lang=None):
return values return values
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None): def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
""" """
Convert a csv file of words and their frequencies to a file in the Convert a csv file of words and their frequencies to a file in the
idiosyncratic 'cBpack' format. idiosyncratic 'cBpack' format.
@@ -94,7 +94,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
This cutoff should not be stacked with a cutoff in `read_freqs`; doing This cutoff should not be stacked with a cutoff in `read_freqs`; doing
so would skew the resulting frequencies. so would skew the resulting frequencies.
""" """
freqs = read_freqs(in_filename, cutoff=0, lang=lang) freqs = read_freqs(in_filename, cutoff=0, lang=None)
cBpack = [] cBpack = []
for token, freq in freqs.items(): for token, freq in freqs.items():
cB = round(math.log10(freq) * 100) cB = round(math.log10(freq) * 100)