mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
ensure removal of tatweels (hopefully)
This commit is contained in:
parent
298d3c1d24
commit
173278fdd3
@ -135,12 +135,20 @@ def test_not_enough_ascii():
|
||||
random_ascii_words(lang='zh')
|
||||
|
||||
def test_ar():
    # Each pair is (raw Arabic input, the single token expected back from
    # tokenize(..., 'ar')). Cases run in order; the first mismatch fails.
    cases = [
        # Remove tatweels
        ('متــــــــعب', 'متعب'),
        # Remove combining marks
        ('حَرَكَات', 'حركات'),
        # NOTE(review): the input appears to end in a single lam-alef
        # presentation-form ligature, and the expected token spells it as
        # two plain letters -- confirm this is the normalization case.
        ('إﻻ', 'إلا'),
    ]
    for raw_text, expected_token in cases:
        eq_(tokenize(raw_text, 'ar'), [expected_token])
|
||||
|
@ -8,6 +8,8 @@ import itertools
|
||||
import pathlib
|
||||
import random
|
||||
import logging
|
||||
import unicodedata
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -75,7 +77,10 @@ def standardize_arabic(text):
|
||||
"""
|
||||
Standardizes arabic text by removing combining marks and tatweels.
|
||||
"""
|
||||
return COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||
return unicodedata.normalize(
|
||||
'NFKC',
|
||||
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||
)
|
||||
|
||||
|
||||
def read_cBpack(filename):
|
||||
|
@ -5,7 +5,7 @@ import argparse
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('filename_in', help='name of input file containing tokens')
|
||||
parser.add_argument('language', help='language of the input file')
|
||||
parser.add_argument('filename_out', help='name of output file')
|
||||
args = parser.parse_args()
|
||||
freqs_to_cBpack(args.filename_in, args.filename_out)
|
||||
|
||||
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
|
||||
|
@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
|
||||
'wikipedia', language, 'mecab-tokens.txt')
|
||||
add_dep(
|
||||
lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
|
||||
add_dep(lines, 'count', mecab_token_file,
|
||||
count_file, params={'lang': language})
|
||||
add_dep(lines, 'count', mecab_token_file, count_file)
|
||||
else:
|
||||
add_dep(lines, 'count', plain_text_file,
|
||||
count_file, params={'lang': language})
|
||||
# Fallback branch: count tokens directly from the plain-text file.
add_dep(lines, 'count', plain_text_file, count_file)
|
||||
|
||||
return lines
|
||||
|
||||
@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
||||
combined_output = mecab_token_file
|
||||
|
||||
add_dep(lines, 'count', combined_output, count_file,
|
||||
extra='wordfreq_builder/tokenizers.py',
|
||||
params={'lang': language})
|
||||
extra='wordfreq_builder/tokenizers.py')
|
||||
|
||||
return lines
|
||||
|
||||
@ -211,7 +208,8 @@ def combine_lists(languages):
|
||||
output_cBpack = wordlist_filename(
|
||||
'combined-dist', language, 'msgpack.gz')
|
||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||
extra='wordfreq_builder/word_counts.py')
|
||||
extra='wordfreq_builder/word_counts.py',
|
||||
params={'lang': language})
|
||||
|
||||
lines.append('default {}'.format(output_cBpack))
|
||||
|
||||
@ -221,7 +219,8 @@ def combine_lists(languages):
|
||||
output_cBpack = wordlist_filename(
|
||||
'twitter-dist', language, 'msgpack.gz')
|
||||
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
|
||||
extra='wordfreq_builder/word_counts.py')
|
||||
extra='wordfreq_builder/word_counts.py',
|
||||
params={'lang': language})
|
||||
|
||||
lines.append('default {}'.format(output_cBpack))
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from wordfreq import simple_tokenize, standardize_arabic
|
||||
from wordfreq import simple_tokenize, tokenize
|
||||
from collections import defaultdict
|
||||
from operator import itemgetter
|
||||
from ftfy import fix_text
|
||||
@ -8,7 +8,7 @@ import msgpack
|
||||
import gzip
|
||||
|
||||
|
||||
def count_tokens(filename, lang):
|
||||
def count_tokens(filename):
|
||||
"""
|
||||
Count tokens that appear in a file, running each line through our
|
||||
simple tokenizer.
|
||||
@ -19,18 +19,12 @@ def count_tokens(filename, lang):
|
||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
||||
for line in infile:
|
||||
for token in simple_tokenize(line):
|
||||
if lang == 'ar':
|
||||
token = standardize_arabic(token)
|
||||
if not token:
|
||||
# skip empty strings
|
||||
continue
|
||||
|
||||
counts[token] += 1
|
||||
|
||||
return counts
|
||||
|
||||
|
||||
def read_freqs(filename, cutoff=0):
|
||||
def read_freqs(filename, cutoff=0, lang=None):
|
||||
"""
|
||||
Read words and their frequencies from a CSV file.
|
||||
|
||||
@ -47,7 +41,9 @@ def read_freqs(filename, cutoff=0):
|
||||
val = float(strval)
|
||||
if val < cutoff:
|
||||
break
|
||||
for token in simple_tokenize(key):
|
||||
|
||||
# Without a language, fall back to the simple tokenizer applied to the
# word itself (`key`); `lang` is None in that branch and is not text.
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||
for token in tokens:
|
||||
token = fix_text(token)
|
||||
total += val
|
||||
# Use += so that, if we give the reader concatenated files with
|
||||
@ -60,7 +56,7 @@ def read_freqs(filename, cutoff=0):
|
||||
return raw_counts
|
||||
|
||||
|
||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
||||
"""
|
||||
Convert a csv file of words and their frequencies to a file in the
|
||||
idiosyncratic 'cBpack' format.
|
||||
@ -69,7 +65,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
||||
written to the new file.
|
||||
"""
|
||||
freq_cutoff = 10 ** (cutoff / 100.)
|
||||
freqs = read_freqs(in_filename, freq_cutoff)
|
||||
# read_freqs is declared as (filename, cutoff=0, lang=None): the cutoff is
# the second positional argument, so the language is passed by keyword.
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
|
||||
cBpack = []
|
||||
for token, freq in freqs.items():
|
||||
cB = round(math.log10(freq) * 100)
|
||||
|
Loading…
Reference in New Issue
Block a user