mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
ensure removal of tatweels (hopefully)
This commit is contained in:
parent
298d3c1d24
commit
173278fdd3
@ -135,12 +135,20 @@ def test_not_enough_ascii():
|
|||||||
random_ascii_words(lang='zh')
|
random_ascii_words(lang='zh')
|
||||||
|
|
||||||
def test_ar():
|
def test_ar():
|
||||||
|
|
||||||
|
# Remove tatweels
|
||||||
eq_(
|
eq_(
|
||||||
tokenize('متــــــــعب', 'ar'),
|
tokenize('متــــــــعب', 'ar'),
|
||||||
['متعب']
|
['متعب']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Remove combining marks
|
||||||
eq_(
|
eq_(
|
||||||
tokenize('حَرَكَات', 'ar'),
|
tokenize('حَرَكَات', 'ar'),
|
||||||
['حركات']
|
['حركات']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
eq_(
|
||||||
|
tokenize('إﻻ', 'ar'),
|
||||||
|
['إلا']
|
||||||
|
)
|
||||||
|
@ -8,6 +8,8 @@ import itertools
|
|||||||
import pathlib
|
import pathlib
|
||||||
import random
|
import random
|
||||||
import logging
|
import logging
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -75,7 +77,10 @@ def standardize_arabic(text):
|
|||||||
"""
|
"""
|
||||||
Standardizes arabic text by removing combining marks and tatweels.
|
Standardizes arabic text by removing combining marks and tatweels.
|
||||||
"""
|
"""
|
||||||
return COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
return unicodedata.normalize(
|
||||||
|
'NFKC',
|
||||||
|
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def read_cBpack(filename):
|
def read_cBpack(filename):
|
||||||
|
@ -5,7 +5,7 @@ import argparse
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('filename_in', help='name of input file containing tokens')
|
parser.add_argument('filename_in', help='name of input file containing tokens')
|
||||||
|
parser.add_argument('language', help='language of the input file')
|
||||||
parser.add_argument('filename_out', help='name of output file')
|
parser.add_argument('filename_out', help='name of output file')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
freqs_to_cBpack(args.filename_in, args.filename_out)
|
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
|
||||||
|
|
||||||
|
@ -96,11 +96,9 @@ def wikipedia_deps(dirname_in, languages):
|
|||||||
'wikipedia', language, 'mecab-tokens.txt')
|
'wikipedia', language, 'mecab-tokens.txt')
|
||||||
add_dep(
|
add_dep(
|
||||||
lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
|
lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
|
||||||
add_dep(lines, 'count', mecab_token_file,
|
add_dep(lines, 'count', mecab_token_file, count_file)
|
||||||
count_file, params={'lang': language})
|
|
||||||
else:
|
else:
|
||||||
add_dep(lines, 'count', plain_text_file,
|
add_dep(lines, 'count', plain_text_file, count_file)
|
||||||
count_file, params={'lang': language})
|
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
@ -165,8 +163,7 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
|||||||
combined_output = mecab_token_file
|
combined_output = mecab_token_file
|
||||||
|
|
||||||
add_dep(lines, 'count', combined_output, count_file,
|
add_dep(lines, 'count', combined_output, count_file,
|
||||||
extra='wordfreq_builder/tokenizers.py',
|
extra='wordfreq_builder/tokenizers.py')
|
||||||
params={'lang': language})
|
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
@ -211,7 +208,8 @@ def combine_lists(languages):
|
|||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'combined-dist', language, 'msgpack.gz')
|
'combined-dist', language, 'msgpack.gz')
|
||||||
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py')
|
extra='wordfreq_builder/word_counts.py',
|
||||||
|
params={'lang': language})
|
||||||
|
|
||||||
lines.append('default {}'.format(output_cBpack))
|
lines.append('default {}'.format(output_cBpack))
|
||||||
|
|
||||||
@ -221,7 +219,8 @@ def combine_lists(languages):
|
|||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'twitter-dist', language, 'msgpack.gz')
|
'twitter-dist', language, 'msgpack.gz')
|
||||||
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
|
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
|
||||||
extra='wordfreq_builder/word_counts.py')
|
extra='wordfreq_builder/word_counts.py',
|
||||||
|
params={'lang': language})
|
||||||
|
|
||||||
lines.append('default {}'.format(output_cBpack))
|
lines.append('default {}'.format(output_cBpack))
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from wordfreq import simple_tokenize, standardize_arabic
|
from wordfreq import simple_tokenize, tokenize
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from ftfy import fix_text
|
from ftfy import fix_text
|
||||||
@ -8,7 +8,7 @@ import msgpack
|
|||||||
import gzip
|
import gzip
|
||||||
|
|
||||||
|
|
||||||
def count_tokens(filename, lang):
|
def count_tokens(filename):
|
||||||
"""
|
"""
|
||||||
Count tokens that appear in a file, running each line through our
|
Count tokens that appear in a file, running each line through our
|
||||||
simple tokenizer.
|
simple tokenizer.
|
||||||
@ -19,18 +19,12 @@ def count_tokens(filename, lang):
|
|||||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
with open(filename, encoding='utf-8', errors='replace') as infile:
|
||||||
for line in infile:
|
for line in infile:
|
||||||
for token in simple_tokenize(line):
|
for token in simple_tokenize(line):
|
||||||
if lang == 'ar':
|
|
||||||
token = standardize_arabic(token)
|
|
||||||
if not token:
|
|
||||||
# skip empty strings
|
|
||||||
continue
|
|
||||||
|
|
||||||
counts[token] += 1
|
counts[token] += 1
|
||||||
|
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
def read_freqs(filename, cutoff=0):
|
def read_freqs(filename, cutoff=0, lang=None):
|
||||||
"""
|
"""
|
||||||
Read words and their frequencies from a CSV file.
|
Read words and their frequencies from a CSV file.
|
||||||
|
|
||||||
@ -47,7 +41,9 @@ def read_freqs(filename, cutoff=0):
|
|||||||
val = float(strval)
|
val = float(strval)
|
||||||
if val < cutoff:
|
if val < cutoff:
|
||||||
break
|
break
|
||||||
for token in simple_tokenize(key):
|
|
||||||
|
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||||
|
for token in tokens:
|
||||||
token = fix_text(token)
|
token = fix_text(token)
|
||||||
total += val
|
total += val
|
||||||
# Use += so that, if we give the reader concatenated files with
|
# Use += so that, if we give the reader concatenated files with
|
||||||
@ -60,7 +56,7 @@ def read_freqs(filename, cutoff=0):
|
|||||||
return raw_counts
|
return raw_counts
|
||||||
|
|
||||||
|
|
||||||
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
||||||
"""
|
"""
|
||||||
Convert a csv file of words and their frequencies to a file in the
|
Convert a csv file of words and their frequencies to a file in the
|
||||||
idiosyncratic 'cBpack' format.
|
idiosyncratic 'cBpack' format.
|
||||||
@ -69,7 +65,7 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
|
|||||||
written to the new file.
|
written to the new file.
|
||||||
"""
|
"""
|
||||||
freq_cutoff = 10 ** (cutoff / 100.)
|
freq_cutoff = 10 ** (cutoff / 100.)
|
||||||
freqs = read_freqs(in_filename, freq_cutoff)
|
freqs = read_freqs(in_filename, cutoff=freq_cutoff, lang=lang)
|
||||||
cBpack = []
|
cBpack = []
|
||||||
for token, freq in freqs.items():
|
for token, freq in freqs.items():
|
||||||
cB = round(math.log10(freq) * 100)
|
cB = round(math.log10(freq) * 100)
|
||||||
|
Loading…
Reference in New Issue
Block a user