ensure removal of tatweels (hopefully)

Joshua Chin 2015-07-20 16:48:36 -04:00
parent 298d3c1d24
commit 173278fdd3
5 changed files with 31 additions and 23 deletions

View File

@@ -135,12 +135,20 @@ def test_not_enough_ascii():
         random_ascii_words(lang='zh')

 def test_ar():
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
+
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )
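The third eq_ case goes beyond the tatweel and combining-mark handling: 'ﻻ' is an Arabic presentation-form ligature (lam-alef, U+FEFB) that only compatibility normalization folds back into the two letters lam + alef. A standalone sketch of both effects, using nothing but the standard library:

    import unicodedata

    # U+0640 ARABIC TATWEEL is pure elongation; removing it leaves the word intact
    assert 'متــــــــعب'.replace('\u0640', '') == 'متعب'

    # U+FEFB (lam-alef ligature) decomposes into lam + alef under NFKC
    assert unicodedata.normalize('NFKC', 'إﻻ') == 'إلا'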

View File

@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
+
 logger = logging.getLogger(__name__)

@@ -75,7 +77,10 @@ def standardize_arabic(text):
     """
     Standardizes arabic text by removing combining marks and tatweels.
     """
-    return COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )

 def read_cBpack(filename):
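Wrapping the old return value in unicodedata.normalize('NFKC', ...) is what makes the new ligature test pass: the tatweel replacement and COMBINING_MARK_RE strip elongation and diacritics, and NFKC then folds presentation forms into ordinary letters. A self-contained sketch of the pipeline (the regex below is a stand-in assumption; the real COMBINING_MARK_RE is defined elsewhere in this module):

    import re
    import unicodedata

    # Stand-in for COMBINING_MARK_RE: the Arabic diacritics block (an assumption)
    COMBINING_MARK_RE = re.compile('[\u064b-\u065f]')

    def standardize_arabic(text):
        # Drop tatweels, then combining marks, then fold presentation forms.
        return unicodedata.normalize(
            'NFKC',
            COMBINING_MARK_RE.sub('', text.replace('\u0640', ''))
        )

    print(standardize_arabic('حَرَكَات'))  # -> حركات
    print(standardize_arabic('إﻻ'))       # -> إلا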

View File

@@ -5,7 +5,8 @@ import argparse

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('filename_in', help='name of input file containing tokens')
+    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
+    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
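Note that the new positional argument lands between filename_in and filename_out, so existing invocations must add the language in the middle, e.g. `script.py freqs.csv ar freqs.msgpack.gz` rather than `script.py freqs.csv freqs.msgpack.gz` (script name hypothetical; this view does not show the filename).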

View File

@@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
                 'wikipedia', language, 'mecab-tokens.txt')
             add_dep(
                 lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', mecab_token_file, count_file)
         else:
-            add_dep(lines, 'count', plain_text_file,
-                    count_file, params={'lang': language})
+            add_dep(lines, 'count', plain_text_file, count_file)

     return lines

@@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
         combined_output = mecab_token_file

     add_dep(lines, 'count', combined_output, count_file,
-            extra='wordfreq_builder/tokenizers.py',
-            params={'lang': language})
+            extra='wordfreq_builder/tokenizers.py')

     return lines

@@ -211,7 +208,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})

         lines.append('default {}'.format(output_cBpack))

@@ -221,7 +219,8 @@ def combine_lists(languages):
         output_cBpack = wordlist_filename(
             'twitter-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', input_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})

         lines.append('default {}'.format(output_cBpack))
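These hunks move the {'lang': language} params from the count rule onto the freqs2cB rule, matching the signature changes below: counting becomes language-blind, and language-aware tokenization happens when the cBpack is built. A hypothetical sketch of how a helper like add_dep might render params into a ninja build statement (the real implementation lives in wordfreq_builder/ninja.py and may differ):

    def add_dep(lines, rule, input, output, extra=None, params=None):
        # Sketch: emit one ninja build edge, with extras as implicit inputs
        # after '|' and params as per-edge ninja variables.
        extras = ' | {}'.format(extra) if extra else ''
        lines.append('build {}: {} {}{}'.format(output, rule, input, extras))
        for key, value in (params or {}).items():
            lines.append('  {} = {}'.format(key, value))

    lines = []
    add_dep(lines, 'freqs2cB', 'ar.csv', 'ar.msgpack.gz',
            extra='wordfreq_builder/word_counts.py', params={'lang': 'ar'})
    # build ar.msgpack.gz: freqs2cB ar.csv | wordfreq_builder/word_counts.py
    #   lang = ar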

View File

@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize, standardize_arabic
+from wordfreq import simple_tokenize, tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text

@@ -8,7 +8,7 @@ import msgpack
 import gzip


-def count_tokens(filename, lang):
+def count_tokens(filename):
     """
     Count tokens that appear in a file, running each line through our
     simple tokenizer.

@@ -19,18 +19,12 @@ def count_tokens(filename, lang):
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
             for token in simple_tokenize(line):
-                if lang == 'ar':
-                    token = standardize_arabic(token)
-                    if not token:
-                        # skip empty strings
-                        continue
                 counts[token] += 1

     return counts


-def read_freqs(filename, cutoff=0):
+def read_freqs(filename, cutoff=0, lang=None):
     """
     Read words and their frequencies from a CSV file.
@@ -47,7 +41,9 @@ def read_freqs(filename, cutoff=0):
             val = float(strval)
             if val < cutoff:
                 break
-            for token in simple_tokenize(key):
+
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
@@ -60,7 +56,7 @@
     return raw_counts


-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -69,7 +65,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
     written to the new file.
     """
     freq_cutoff = 10 ** (cutoff / 100.)
-    freqs = read_freqs(in_filename, freq_cutoff)
+    freqs = read_freqs(in_filename, cutoff=freq_cutoff, lang=lang)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)
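With these changes the language flows end to end: count_tokens stays language-agnostic, and the language-aware tokenize (including Arabic standardization) is applied when the frequencies are read back and packed. A hedged usage sketch with placeholder filenames:

    from wordfreq_builder.word_counts import count_tokens, freqs_to_cBpack

    # Counting no longer takes a language; tokens are stored raw.
    counts = count_tokens('twitter-ar.txt')

    # Language-aware tokenization now happens when building the cBpack.
    freqs_to_cBpack('ar-freqs.csv', 'ar.msgpack.gz', lang='ar')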