diff --git a/tests/test.py b/tests/test.py
index 8553c19..d38fd14 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 
 def test_ar():
+
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
 
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 1b39257..cb085f7 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
+
 
 logger = logging.getLogger(__name__)
 
@@ -75,7 +77,10 @@ def standardize_arabic(text):
     """
     Standardizes arabic text by removing combining marks and tatweels.
     """
-    return COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
 
 
 def read_cBpack(filename):
diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
index 6bf3957..288e3d6 100644
--- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
+++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py
@@ -5,7 +5,7 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
+    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
-
+    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index b36e1cf..094479f 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
                 'wikipedia', language, 'mecab-tokens.txt')
             add_dep(
                 lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'count', plain_text_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', plain_text_file, count_file)
 
     return lines
 
@@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
             combined_output = mecab_token_file
 
         add_dep(lines, 'count', combined_output, count_file,
-                extra='wordfreq_builder/tokenizers.py',
-                params={'lang': language})
+                extra='wordfreq_builder/tokenizers.py')
 
     return lines
 
@@ -211,7 +208,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
 
         lines.append('default {}'.format(output_cBpack))
 
@@ -221,7 +219,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'twitter-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', input_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
 
         lines.append('default {}'.format(output_cBpack))
 
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index e877262..717ab0a 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize, standardize_arabic
+from wordfreq import simple_tokenize, tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -8,7 +8,7 @@ import msgpack
 import gzip
 
 
-def count_tokens(filename, lang):
+def count_tokens(filename):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
@@ -19,18 +19,12 @@ def count_tokens(filename):
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
             for token in simple_tokenize(line):
-                if lang == 'ar':
-                    token = standardize_arabic(token)
-                    if not token:
-                        # skip empty strings
-                        continue
-
                 counts[token] += 1
 
     return counts
 
 
-def read_freqs(filename, cutoff=0):
+def read_freqs(filename, cutoff=0, lang=None):
     """
     Read words and their frequencies from a CSV file.
 
@@ -47,7 +41,9 @@
         val = float(strval)
         if val < cutoff:
             break
-        for token in simple_tokenize(key):
+
+        tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+        for token in tokens:
             token = fix_text(token)
             total += val
             # Use += so that, if we give the reader concatenated files with
@@ -60,7 +56,7 @@
     return raw_counts
 
 
-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -69,7 +65,7 @@
     written to the new file.
     """
     freq_cutoff = 10 ** (cutoff / 100.)
-    freqs = read_freqs(in_filename, freq_cutoff)
+    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)
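
For reference (not part of the diff): a minimal sketch of the behavior the new NFKC step in standardize_arabic relies on. The ligature character used in the added test, U+FEFB, compatibility-decomposes to plain lam + alef, so unicodedata.normalize('NFKC', ...) folds it; the tatweel and combining-mark removal is unchanged.

    import unicodedata

    # U+FEFB is the Arabic lam-alef ligature from the new test case ('ﻻ');
    # NFKC folds it to the two ordinary letters lam (U+0644) + alef (U+0627).
    assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'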