mirror of https://github.com/rspeer/wordfreq.git
parent a868c99839
commit 5c7e0dd0dd
@@ -68,7 +68,7 @@ rule convert_google_syntactic_ngrams
   command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
 
 rule count
-  command = python -m wordfreq_builder.cli.count_tokens $in $out
+  command = python -m wordfreq_builder.cli.count_tokens $in $out $lang
 
 rule merge
   command = python -m wordfreq_builder.cli.combine_lists -o $out $in
@@ -2,8 +2,8 @@ from wordfreq_builder.word_counts import count_tokens, write_wordlist
 import argparse
 
 
-def handle_counts(filename_in, filename_out):
-    counts = count_tokens(filename_in)
+def handle_counts(filename_in, filename_out, lang):
+    counts = count_tokens(filename_in, lang)
     write_wordlist(counts, filename_out)
 
 
@@ -11,6 +11,6 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
+    parser.add_argument('lang', help='language of input file')
     args = parser.parse_args()
-    handle_counts(args.filename_in, args.filename_out)
+    handle_counts(args.filename_in, args.filename_out, args.lang)
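
For orientation, here is a minimal sketch of what the updated entry point now does end to end. The file names are made up for illustration; the calls simply mirror handle_counts above with the new lang argument.

    # Usage sketch with hypothetical file names: count tokens in an Arabic token
    # file and write out the word list, exactly as handle_counts does above.
    from wordfreq_builder.word_counts import count_tokens, write_wordlist

    counts = count_tokens('twitter-tokens.ar.txt', 'ar')  # 'ar' selects Arabic standardization
    write_wordlist(counts, 'twitter-counts.ar.csv')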
@@ -151,9 +151,10 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, language
     if language == 'ja':
         mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
         add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
-        add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-    else:
-        add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
+        combined_output = mecab_token_file
+
+    add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py',
+            params={'lang': language})
 
     return lines
 
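
The count rule above now reads a $lang variable, and this hunk supplies it through params. The add_dep helper itself is not part of this diff; the following is only a hypothetical sketch of how such a helper could turn params into per-edge ninja variables, to show how 'lang' reaches the rule.

    # Hypothetical sketch (not the actual wordfreq_builder code): a helper like
    # add_dep could append a ninja build edge and emit each entry of `params`
    # as an indented variable, which is what the 'count' rule reads as $lang.
    def add_dep(lines, rule, input_file, output_file, extra=None, params=None):
        deps = ' | ' + extra if extra else ''
        lines.append('build {}: {} {}{}'.format(output_file, rule, input_file, deps))
        for key, value in (params or {}).items():
            lines.append('  {} = {}'.format(key, value))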
@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize
+from wordfreq import simple_tokenize, standardize_arabic
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -8,7 +8,7 @@ import msgpack
 import gzip
 
 
-def count_tokens(filename):
+def count_tokens(filename, lang):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
@@ -17,9 +17,14 @@ def count_tokens(filename):
     """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            for token in simple_tokenize(line):
-                counts[token] += 1
+        if lang == 'ar':
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[standardize_arabic(token)] += 1
+        else:
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[token] += 1
 
     return counts
 
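
The committed version duplicates the counting loop per language. As a note on the design choice, an equivalent single-loop formulation is possible; the sketch below is not code from this commit, and the name count_tokens_alt is made up.

    # Sketch of an equivalent single-loop variant: choose the per-token
    # normalizer once instead of branching around two copies of the loop.
    from collections import defaultdict
    from wordfreq import simple_tokenize, standardize_arabic

    def count_tokens_alt(filename, lang):
        normalize = standardize_arabic if lang == 'ar' else (lambda token: token)
        counts = defaultdict(int)
        with open(filename, encoding='utf-8', errors='replace') as infile:
            for line in infile:
                for token in simple_tokenize(line):
                    counts[normalize(token)] += 1
        return counts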