fix arabic tokens

Former-commit-id: 11a1c51321
Joshua Chin 2015-07-17 15:52:12 -04:00
parent a868c99839
commit 5c7e0dd0dd
4 changed files with 19 additions and 13 deletions

View File

@@ -68,7 +68,7 @@ rule convert_google_syntactic_ngrams
   command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out

 rule count
-  command = python -m wordfreq_builder.cli.count_tokens $in $out
+  command = python -m wordfreq_builder.cli.count_tokens $in $out $lang

 rule merge
   command = python -m wordfreq_builder.cli.combine_lists -o $out $in
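
The count rule now passes a third positional argument, $lang, so every build statement that uses it must define a lang variable; the params={'lang': language} change in the build-script diff below is what supplies it. When Ninja runs such a statement, the command expands to something like the following (file names here are hypothetical):

    python -m wordfreq_builder.cli.count_tokens twitter_tokens_ar.txt twitter_counts_ar.csv ar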

View File

@@ -2,8 +2,8 @@ from wordfreq_builder.word_counts import count_tokens, write_wordlist
 import argparse


-def handle_counts(filename_in, filename_out):
-    counts = count_tokens(filename_in)
+def handle_counts(filename_in, filename_out, lang):
+    counts = count_tokens(filename_in, lang)
     write_wordlist(counts, filename_out)
@@ -11,6 +11,6 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
+    parser.add_argument('lang', help='language of input file')
     args = parser.parse_args()
-    handle_counts(args.filename_in, args.filename_out)
+    handle_counts(args.filename_in, args.filename_out, args.lang)
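
Taken together, the two hunks above leave the command-line module looking roughly like this (reconstructed from the diff for readability, not copied verbatim from the file):

    from wordfreq_builder.word_counts import count_tokens, write_wordlist
    import argparse


    def handle_counts(filename_in, filename_out, lang):
        counts = count_tokens(filename_in, lang)
        write_wordlist(counts, filename_out)


    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('filename_in', help='name of input file containing tokens')
        parser.add_argument('filename_out', help='name of output file')
        parser.add_argument('lang', help='language of input file')
        args = parser.parse_args()
        handle_counts(args.filename_in, args.filename_out, args.lang)

The language code is a required positional argument, matching the $lang that the count rule now appends.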

View File

@@ -151,9 +151,10 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, language
     if language == 'ja':
         mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
         add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
-        add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-    else:
-        add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
+        combined_output = mecab_token_file
+
+    add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py',
+            params={'lang': language})

     return lines
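
The definition of add_dep is not part of this diff. A minimal sketch of how it could turn the new params argument into per-build Ninja variables, with the signature inferred from the calls above (an assumption, not the actual helper):

    def add_dep(lines, rule, input, output, extra=None, params=None):
        # Append a Ninja build statement: "build <output>: <rule> <input> | <extra>",
        # where the file named by `extra` is an additional dependency.
        extra_part = ' | {}'.format(extra) if extra else ''
        lines.append('build {}: {} {}{}'.format(output, rule, input, extra_part))
        # Each entry in params becomes an indented per-build variable,
        # e.g. "  lang = ar", which is what gives $lang in the count rule its value.
        for name, value in (params or {}).items():
            lines.append('  {} = {}'.format(name, value))

Under that reading, the Japanese branch no longer needs its own count dependency: it redirects combined_output to the MeCab token file and falls through to the shared add_dep call.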

View File

@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize
+from wordfreq import simple_tokenize, standardize_arabic
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -8,7 +8,7 @@ import msgpack
 import gzip


-def count_tokens(filename):
+def count_tokens(filename, lang):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
@@ -17,9 +17,14 @@ def count_tokens(filename):
     """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            for token in simple_tokenize(line):
-                counts[token] += 1
+        if lang == 'ar':
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[standardize_arabic(token)] += 1
+        else:
+            for line in infile:
+                for token in simple_tokenize(line):
+                    counts[token] += 1

     return counts
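
standardize_arabic is imported from wordfreq above, but its body is not shown in this diff. A plausible sketch, assuming it strips the tatweel and Arabic combining marks and then applies NFKC normalization (the regex and the normalization form are assumptions):

    import re
    import unicodedata

    # Tatweel (U+0640) plus the Arabic combining marks (harakat and related
    # signs), none of which matter for frequency counting.
    ARABIC_MARKS_RE = re.compile('[\u0640\u064B-\u065F\u0670]')

    def standardize_arabic(text):
        # Remove the marks, then normalize so presentation forms of Arabic
        # letters collapse to their standard code points.
        return unicodedata.normalize('NFKC', ARABIC_MARKS_RE.sub('', text))

Whatever the exact implementation, the effect in count_tokens is that variant spellings of the same Arabic word fold into a single count.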