ensure removal of tatweels (hopefully)

Former-commit-id: 173278fdd3
This commit is contained in:
Joshua Chin 2015-07-20 16:48:36 -04:00
parent 675a02ac11
commit af8050f1b8
5 changed files with 31 additions and 23 deletions

View File

@ -135,12 +135,20 @@ def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
    """
    Tokenizing Arabic should strip tatweels, drop combining marks
    (harakat), and fold presentation-form ligatures back to letters.
    """
    cases = [
        # tatweel (U+0640) elongation characters are removed
        ('متــــــــعب', ['متعب']),
        # combining vowel marks are removed
        ('حَرَكَات', ['حركات']),
        # presentation form (lam-alef ligature) is normalized
        ('إﻻ', ['إلا']),
    ]
    for text, expected in cases:
        eq_(tokenize(text, 'ar'), expected)

View File

@ -8,6 +8,8 @@ import itertools
import pathlib
import random
import logging
import unicodedata
logger = logging.getLogger(__name__)
def standardize_arabic(text):
    """
    Standardize Arabic text by removing combining marks and tatweels.

    The text is then NFKC-normalized so that presentation forms (such as
    the lam-alef ligature) are folded back into ordinary letters.
    """
    # NOTE(review): the diff residue here kept the old pre-normalization
    # `return` above the new one, which made the NFKC step unreachable;
    # only the normalized return is kept.
    return unicodedata.normalize(
        'NFKC',
        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
    )
def read_cBpack(filename):

View File

@ -5,7 +5,7 @@ import argparse
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('language', help='language of the input file')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    # Pass the language through so tokenization can be language-specific.
    # (The stale pre-change call without `lang` ran the conversion twice;
    # it has been removed.)
    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)

View File

@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
'wikipedia', language, 'mecab-tokens.txt')
add_dep(
lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
add_dep(lines, 'count', mecab_token_file,
count_file, params={'lang': language})
add_dep(lines, 'count', mecab_token_file, count_file)
else:
add_dep(lines, 'count', plain_text_file,
count_file, params={'lang': language})
add_dep(lines, 'count', plain_text_file, count_file)
return lines
@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
combined_output = mecab_token_file
add_dep(lines, 'count', combined_output, count_file,
extra='wordfreq_builder/tokenizers.py',
params={'lang': language})
extra='wordfreq_builder/tokenizers.py')
return lines
@ -211,7 +208,8 @@ def combine_lists(languages):
output_cBpack = wordlist_filename(
'combined-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))
@ -221,7 +219,8 @@ def combine_lists(languages):
output_cBpack = wordlist_filename(
'twitter-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))

View File

@ -1,4 +1,4 @@
from wordfreq import simple_tokenize, standardize_arabic
from wordfreq import simple_tokenize, tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@ -8,7 +8,7 @@ import msgpack
import gzip
def count_tokens(filename):
    """
    Count tokens that appear in a file, running each line through our
    simple tokenizer.

    Returns a dict-like mapping from each token to the number of times
    it occurred. Decoding errors in the input are replaced rather than
    raised, so a partially corrupt file can still be counted.
    """
    # Accumulator restored: the hunk gap dropped this line even though
    # the loop and the return statement require it.
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            for token in simple_tokenize(line):
                counts[token] += 1
    return counts
def read_freqs(filename, cutoff=0):
def read_freqs(filename, cutoff=0, lang=None):
"""
Read words and their frequencies from a CSV file.
@ -47,7 +41,9 @@ def read_freqs(filename, cutoff=0):
val = float(strval)
if val < cutoff:
break
for token in simple_tokenize(key):
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens:
token = fix_text(token)
total += val
# Use += so that, if we give the reader concatenated files with
@ -60,7 +56,7 @@ def read_freqs(filename, cutoff=0):
return raw_counts
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
"""
Convert a csv file of words and their frequencies to a file in the
idiosyncratic 'cBpack' format.
@ -69,7 +65,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
written to the new file.
"""
freq_cutoff = 10 ** (cutoff / 100.)
freqs = read_freqs(in_filename, freq_cutoff)
freqs = read_freqs(in_filename, cutoff=freq_cutoff, lang=lang)
cBpack = []
for token, freq in freqs.items():
cB = round(math.log10(freq) * 100)