mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
Don't do language-specific tokenization in freqs_to_cBpack.
Tokenizing in the 'merge' step is sufficient.
Former-commit-id: bc8ebd23e9
This commit is contained in:
parent
64b0b76ee1
commit
4aef1dc338
@ -95,7 +95,7 @@ rule merge_counts
|
|||||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||||
|
|
||||||
rule freqs2cB
|
rule freqs2cB
|
||||||
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
|
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
||||||
|
|
||||||
rule cat
|
rule cat
|
||||||
command = cat $in > $out
|
command = cat $in > $out
|
||||||
|
@ -4,8 +4,7 @@ import argparse
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('language', help='language of the input file')
|
|
||||||
parser.add_argument('filename_in', help='name of input file containing tokens')
|
parser.add_argument('filename_in', help='name of input file containing tokens')
|
||||||
parser.add_argument('filename_out', help='name of output file')
|
parser.add_argument('filename_out', help='name of output file')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
|
freqs_to_cBpack(args.filename_in, args.filename_out)
|
||||||
|
@ -83,7 +83,7 @@ def read_freqs(filename, cutoff=0, lang=None):
|
|||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
||||||
"""
|
"""
|
||||||
Convert a csv file of words and their frequencies to a file in the
|
Convert a csv file of words and their frequencies to a file in the
|
||||||
idiosyncratic 'cBpack' format.
|
idiosyncratic 'cBpack' format.
|
||||||
@ -94,7 +94,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
|||||||
This cutoff should not be stacked with a cutoff in `read_freqs`; doing
|
This cutoff should not be stacked with a cutoff in `read_freqs`; doing
|
||||||
so would skew the resulting frequencies.
|
so would skew the resulting frequencies.
|
||||||
"""
|
"""
|
||||||
freqs = read_freqs(in_filename, cutoff=0, lang=lang)
|
freqs = read_freqs(in_filename, cutoff=0, lang=None)
|
||||||
cBpack = []
|
cBpack = []
|
||||||
for token, freq in freqs.items():
|
for token, freq in freqs.items():
|
||||||
cB = round(math.log10(freq) * 100)
|
cB = round(math.log10(freq) * 100)
|
||||||
|
Loading…
Reference in New Issue
Block a user