fix arabic tokens

Former-commit-id: 11a1c51321
Joshua Chin 2015-07-17 15:52:12 -04:00
parent a868c99839
commit 5c7e0dd0dd
4 changed files with 19 additions and 13 deletions

View File

@@ -68,7 +68,7 @@ rule convert_google_syntactic_ngrams
   command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out

 rule count
-  command = python -m wordfreq_builder.cli.count_tokens $in $out
+  command = python -m wordfreq_builder.cli.count_tokens $in $out $lang

 rule merge
   command = python -m wordfreq_builder.cli.combine_lists -o $out $in
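
The count rule now passes a third positional argument, $lang, so every build statement that uses it must define a lang variable; the params={'lang': language} change in the build-script diff below is what supplies it. When Ninja runs such a statement, the command expands to something like the following (file names here are hypothetical):

    python -m wordfreq_builder.cli.count_tokens twitter_tokens_ar.txt twitter_counts_ar.csv ar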

View File

@@ -2,8 +2,8 @@ from wordfreq_builder.word_counts import count_tokens, write_wordlist
 import argparse


-def handle_counts(filename_in, filename_out):
-    counts = count_tokens(filename_in)
+def handle_counts(filename_in, filename_out, lang):
+    counts = count_tokens(filename_in, lang)
     write_wordlist(counts, filename_out)
@@ -11,6 +11,6 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
+    parser.add_argument('lang', help='language of input file')
     args = parser.parse_args()
-    handle_counts(args.filename_in, args.filename_out)
+    handle_counts(args.filename_in, args.filename_out, args.lang)
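
Taken together, the two hunks above leave the command-line module looking roughly like this (reconstructed from the diff for readability, not copied verbatim from the file):

    from wordfreq_builder.word_counts import count_tokens, write_wordlist
    import argparse


    def handle_counts(filename_in, filename_out, lang):
        counts = count_tokens(filename_in, lang)
        write_wordlist(counts, filename_out)


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('filename_in', help='name of input file containing tokens')
        parser.add_argument('filename_out', help='name of output file')
        parser.add_argument('lang', help='language of input file')
        args = parser.parse_args()
        handle_counts(args.filename_in, args.filename_out, args.lang)

The language code is a required positional argument, matching the $lang that the count rule now appends.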

View File

@@ -151,9 +151,10 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, language
     if language == 'ja':
         mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
         add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
-        add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-    else:
-        add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
+        combined_output = mecab_token_file
+
+    add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py',
+            params={'lang': language})

     return lines
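
The definition of add_dep is not part of this diff. A minimal sketch of how it could turn the new params argument into per-build Ninja variables, with the signature inferred from the calls above (an assumption, not the actual helper):

    def add_dep(lines, rule, input, output, extra=None, params=None):
        # Append a Ninja build statement: "build <output>: <rule> <input> | <extra>",
        # where the file named by `extra` is an additional dependency.
        extra_part = ' | {}'.format(extra) if extra else ''
        lines.append('build {}: {} {}{}'.format(output, rule, input, extra_part))
        # Each entry in params becomes an indented per-build variable,
        # e.g. "  lang = ar", which is what gives $lang in the count rule its value.
        for name, value in (params or {}).items():
            lines.append('  {} = {}'.format(name, value))

Under that reading, the Japanese branch no longer needs its own count dependency: it redirects combined_output to the MeCab token file and falls through to the shared add_dep call.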

View File

@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize
+from wordfreq import simple_tokenize, standardize_arabic
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -8,7 +8,7 @@ import msgpack
 import gzip


-def count_tokens(filename):
+def count_tokens(filename, lang):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
@@ -17,9 +17,14 @@ def count_tokens(filename):
     """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            for token in simple_tokenize(line):
-                counts[token] += 1
+        if lang == 'ar':
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[standardize_arabic(token)] += 1
+        else:
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[token] += 1

     return counts
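
standardize_arabic is imported from wordfreq above, but its body is not shown in this diff. A plausible sketch, assuming it strips the tatweel and Arabic combining marks and then applies NFKC normalization (the regex and the normalization form are assumptions):

    import re
    import unicodedata

    # Tatweel (U+0640) plus the Arabic combining marks (harakat and related
    # signs), none of which matter for frequency counting.
    ARABIC_MARKS_RE = re.compile('[\u0640\u064B-\u065F\u0670]')

    def standardize_arabic(text):
        # Remove the marks, then normalize so presentation forms of Arabic
        # letters collapse to their standard code points.
        return unicodedata.normalize('NFKC', ARABIC_MARKS_RE.sub('', text))

Whatever the exact implementation, the effect in count_tokens is that variant spellings of the same Arabic word fold into a single count.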