From 173278fdd3e5554063228abd1f2dac8f771995f6 Mon Sep 17 00:00:00 2001
From: Joshua Chin
Date: Mon, 20 Jul 2015 16:48:36 -0400
Subject: [PATCH] ensure removal of tatweels (hopefully)

---
 tests/test.py                                      |  8 ++++++++
 wordfreq/__init__.py                               |  7 ++++++-
 .../wordfreq_builder/cli/freqs_to_cB.py            |  4 ++--
 wordfreq_builder/wordfreq_builder/ninja.py         | 15 +++++++--------
 .../wordfreq_builder/word_counts.py                | 20 ++++++++------------
 5 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 8553c19..d38fd14 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
         random_ascii_words(lang='zh')
 
 def test_ar():
+
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
 
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 1b39257..cb085f7 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -8,6 +8,8 @@
 import itertools
 import pathlib
 import random
 import logging
+import unicodedata
+
 
 logger = logging.getLogger(__name__)
@@ -75,7 +77,10 @@ def standardize_arabic(text):
     """
     Standardizes arabic text by removing combining marks and tatweels.
     """
-    return COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
 
 
 def read_cBpack(filename):
diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
index 6bf3957..288e3d6 100644
--- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
+++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
@@ -5,7 +5,7 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
+    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
-
+    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index b36e1cf..094479f 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
                 'wikipedia', language, 'mecab-tokens.txt')
             add_dep(
                 lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'count', plain_text_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', plain_text_file, count_file)
 
     return lines
 
@@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
         combined_output = mecab_token_file
 
     add_dep(lines, 'count', combined_output, count_file,
-            extra='wordfreq_builder/tokenizers.py',
-            params={'lang': language})
+            extra='wordfreq_builder/tokenizers.py')
 
     return lines
 
@@ -211,7 +208,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
 
         lines.append('default {}'.format(output_cBpack))
 
@@ -221,7 +219,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'twitter-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', input_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
 
         lines.append('default {}'.format(output_cBpack))
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index e877262..717ab0a 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize, standardize_arabic
+from wordfreq import simple_tokenize, tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -8,7 +8,7 @@ import msgpack
 import gzip
 
 
-def count_tokens(filename, lang):
+def count_tokens(filename):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
@@ -19,18 +19,12 @@
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
             for token in simple_tokenize(line):
-                if lang == 'ar':
-                    token = standardize_arabic(token)
-                    if not token:
-                        # skip empty strings
-                        continue
-
                 counts[token] += 1
 
     return counts
 
 
-def read_freqs(filename, cutoff=0):
+def read_freqs(filename, cutoff=0, lang=None):
     """
     Read words and their frequencies from a CSV file.
 
@@ -47,7 +41,9 @@
             val = float(strval)
             if val < cutoff:
                 break
-            for token in simple_tokenize(key):
+
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
@@ -60,7 +56,7 @@
     return raw_counts
 
 
-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -69,7 +65,7 @@
     written to the new file.
     """
     freq_cutoff = 10 ** (cutoff / 100.)
-    freqs = read_freqs(in_filename, freq_cutoff)
+    freqs = read_freqs(in_filename, cutoff=freq_cutoff, lang=lang)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)
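
A minimal sketch of the behavior the patched standardize_arabic aims for.
This snippet is illustrative and not part of the patch: the helper name
standardize_arabic_sketch is made up, and unicodedata.combining() stands in
as an approximation of wordfreq's COMBINING_MARK_RE regex. The point is the
order of operations: strip tatweels, strip combining marks, then NFKC so
presentation-form ligatures decompose into plain letter sequences.

    import unicodedata

    def standardize_arabic_sketch(text):
        # Drop tatweels (U+0640, the kashida used to stretch words).
        no_tatweel = text.replace('\u0640', '')
        # Strip combining marks such as the Arabic short-vowel harakat;
        # combining() is nonzero for characters with a combining class.
        no_marks = ''.join(c for c in no_tatweel
                           if not unicodedata.combining(c))
        # NFKC folds presentation forms, e.g. the lam-alef ligature
        # U+FEFB into the two-letter sequence U+0644 U+0627.
        return unicodedata.normalize('NFKC', no_marks)

    # Mirrors the three cases in test_ar():
    # standardize_arabic_sketch('متــــــــعب') -> 'متعب'   (tatweels removed)
    # standardize_arabic_sketch('حَرَكَات')       -> 'حركات'  (combining marks removed)
    # standardize_arabic_sketch('إﻻ')           -> 'إلا'    (ligature normalized)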