Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00

run black

This commit is contained in:
    parent 3c4819e7e5
    commit ef4d6fe0df
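
The commit applies the Black code formatter across the package; the diff below spans several of its modules. As a point of reference, here is a minimal sketch of reproducing one of the rewrites with Black's Python API. The sample line and Black's default 88-character line length are assumptions, not details recorded in the commit.

    import black  # pip install black

    source = "DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))\n"
    # Black normalizes string literals to double quotes and rewraps lines that
    # exceed its line-length limit (88 characters by default).
    print(black.format_str(source, mode=black.FileMode()))
    # -> DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))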
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)


 CACHE_SIZE = 100000
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
+DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))

 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -75,44 +75,43 @@ def read_cBpack(filename):
         ['blue', 'red']
     ]
     """
-    with gzip.open(filename, 'rb') as infile:
+    with gzip.open(filename, "rb") as infile:
         data = msgpack.load(infile, raw=False)
         header = data[0]
         if (
-            not isinstance(header, dict) or header.get('format') != 'cB'
-            or header.get('version') != 1
+            not isinstance(header, dict)
+            or header.get("format") != "cB"
+            or header.get("version") != 1
         ):
             raise ValueError("Unexpected header: %r" % header)
         return data[1:]


-def available_languages(wordlist='best'):
+def available_languages(wordlist="best"):
     """
     Given a wordlist name, return a dictionary of language codes to filenames,
     representing all the languages in which that wordlist is available.
     """
-    if wordlist == 'best':
-        available = available_languages('small')
-        available.update(available_languages('large'))
+    if wordlist == "best":
+        available = available_languages("small")
+        available.update(available_languages("large"))
         return available
-    elif wordlist == 'combined':
-        logger.warning(
-            "The 'combined' wordlists have been renamed to 'small'."
-        )
-        wordlist = 'small'
+    elif wordlist == "combined":
+        logger.warning("The 'combined' wordlists have been renamed to 'small'.")
+        wordlist = "small"

     available = {}
-    for path in DATA_PATH.glob('*.msgpack.gz'):
-        if not path.name.startswith('_'):
-            list_name = path.name.split('.')[0]
-            name, lang = list_name.split('_')
+    for path in DATA_PATH.glob("*.msgpack.gz"):
+        if not path.name.startswith("_"):
+            list_name = path.name.split(".")[0]
+            name, lang = list_name.split("_")
             if name == wordlist:
                 available[lang] = str(path)
     return available


 @lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='best', match_cutoff=None):
+def get_frequency_list(lang, wordlist="best", match_cutoff=None):
     """
     Read the raw data from a wordlist file, returning it as a list of
     lists. (See `read_cBpack` for what this represents.)
@@ -123,27 +122,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=None):
     Looking up the alternate code 'por' will also get the same list.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     available = available_languages(wordlist)

     # TODO: decrease the maximum distance. This distance is so high just
     # because it allows a test where 'yue' matches 'zh', and maybe the
     # distance between those is high because they shouldn't match.
-    best, _distance = langcodes.closest_match(
-        lang, list(available), max_distance=70
-    )
-    if best == 'und':
-        raise LookupError("No wordlist %r available for language %r"
-                          % (wordlist, lang))
+    best, _distance = langcodes.closest_match(lang, list(available), max_distance=70)
+    if best == "und":
+        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))

     if best != lang:
         logger.warning(
             "You asked for word frequencies in language %r. Using the "
-            "nearest match, which is %r."
-            % (lang, best)
+            "nearest match, which is %r." % (lang, best)
         )

     return read_cBpack(available[best])
@@ -161,9 +153,7 @@ def cB_to_freq(cB):
     In general, x cB represents a frequency of 10 ** (x/100).
     """
     if cB > 0:
-        raise ValueError(
-            "A frequency cannot be a positive number of centibels."
-        )
+        raise ValueError("A frequency cannot be a positive number of centibels.")
     return 10 ** (cB / 100)


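As an aside (not part of the diff), the docstring above states the centibel formula, so the conversion is just 10 ** (x / 100). A small worked example, using the function exactly as shown in the hunk:

    import math

    def cB_to_freq(cB):
        # x centibels correspond to a frequency of 10 ** (x / 100).
        if cB > 0:
            raise ValueError("A frequency cannot be a positive number of centibels.")
        return 10 ** (cB / 100)

    assert math.isclose(cB_to_freq(-200), 0.01)        # -200 cB -> 10 ** -2
    assert math.isclose(cB_to_freq(-350), 10 ** -3.5)  # one centibel is a factor of 10 ** 0.01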
@@ -204,16 +194,13 @@ def freq_to_zipf(freq):


 @lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
+def get_frequency_dict(lang, wordlist="best", match_cutoff=None):
     """
     Get a word frequency list as a dictionary, mapping tokens to
     frequencies as floating-point probabilities.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     freqs = {}
     pack = get_frequency_list(lang, wordlist)
     for index, bucket in enumerate(pack):
@@ -223,7 +210,7 @@ def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
     return freqs


-def iter_wordlist(lang, wordlist='best'):
+def iter_wordlist(lang, wordlist="best"):
     """
     Yield the words in a wordlist in approximate descending order of
     frequency.
@@ -258,12 +245,12 @@ def _word_frequency(word, lang, wordlist, minimum):
             # If any word is missing, just return the default value
             return minimum
         # spread the frequency of digits over all digit combinations
-        freq = freqs[token] / (10. ** digits)
+        freq = freqs[token] / (10.0**digits)
        one_over_result += 1.0 / freq

     freq = 1.0 / one_over_result

-    if get_language_info(lang)['tokenizer'] == 'jieba':
+    if get_language_info(lang)["tokenizer"] == "jieba":
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
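Aside, not part of the diff: the `one_over_result` loop above combines per-token frequencies by summing reciprocals, so a multi-token phrase always comes out rarer than its rarest token. A worked example with made-up numbers:

    # Hypothetical frequencies for the two tokens of a phrase.
    token_freqs = [0.01, 0.005]

    one_over_result = 0.0
    for freq in token_freqs:
        one_over_result += 1.0 / freq  # 100.0, then 100.0 + 200.0 = 300.0

    combined = 1.0 / one_over_result   # 1 / 300, roughly 0.0033
    print(combined)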
@@ -272,14 +259,14 @@ def _word_frequency(word, lang, wordlist, minimum):
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
     unrounded = max(freq, minimum)
-    if unrounded == 0.:
-        return 0.
+    if unrounded == 0.0:
+        return 0.0
     else:
         leading_zeroes = math.floor(-math.log(unrounded, 10))
         return round(unrounded, leading_zeroes + 3)


-def word_frequency(word, lang, wordlist='best', minimum=0.):
+def word_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`.
@@ -306,7 +293,7 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
         return _wf_cache[args]


-def zipf_frequency(word, lang, wordlist='best', minimum=0.):
+def zipf_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word`, in the language with code `lang`, on the Zipf
     scale.
@@ -334,7 +321,7 @@ def zipf_frequency(word, lang, wordlist='best', minimum=0.):


 @lru_cache(maxsize=100)
-def top_n_list(lang, n, wordlist='best', ascii_only=False):
+def top_n_list(lang, n, wordlist="best", ascii_only=False):
     """
     Return a frequency list of length `n` in descending order of frequency.
     This list contains words from `wordlist`, of the given language.
@@ -342,15 +329,16 @@ def top_n_list(lang, n, wordlist='best', ascii_only=False):
     """
     results = []
     for word in iter_wordlist(lang, wordlist):
-        if (not ascii_only) or max(word) <= '~':
+        if (not ascii_only) or max(word) <= "~":
             results.append(word)
             if len(results) >= n:
                 break
     return results


-def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
-                 ascii_only=False):
+def random_words(
+    lang="en", wordlist="best", nwords=5, bits_per_word=12, ascii_only=False
+):
     """
     Returns a string of random, space separated words.

@@ -371,11 +359,10 @@ def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    return ' '.join([random.choice(choices) for i in range(nwords)])
+    return " ".join([random.choice(choices) for i in range(nwords)])


-def random_ascii_words(lang='en', wordlist='best', nwords=5,
-                       bits_per_word=12):
+def random_ascii_words(lang="en", wordlist="best", nwords=5, bits_per_word=12):
     """
     Returns a string of random, space separated, ASCII words.

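Aside, not part of the diff: the functions reformatted above make up wordfreq's public query API. A minimal usage sketch; the printed values depend on the wordlist data shipped with the installed version, so treat them as illustrative only.

    from wordfreq import word_frequency, top_n_list, zipf_frequency

    # Frequency of "the" in English as a proportion of running text (roughly 0.05).
    print(word_frequency("the", "en"))
    # The same word on the Zipf scale: log10 of its frequency per billion words.
    print(zipf_frequency("the", "en"))
    # The ten most frequent words in the English "best" wordlist.
    print(top_n_list("en", 10))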
@@ -3,11 +3,13 @@ import jieba
 import msgpack
 import gzip

-DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
-ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
-SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh.txt")
+ORIG_DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh_orig.txt")
+SIMP_MAP_FILENAME = resource_filename("wordfreq", "data/_chinese_mapping.msgpack.gz")
 try:
-    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+    SIMPLIFIED_MAP = msgpack.load(
+        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
+    )
 except TypeError:
     # work around incompatibility between pure-Python msgpack and C msgpack
     SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
@@ -58,6 +60,8 @@ def jieba_tokenize(text, external_wordlist=False):
         # those spans from the original text, even if it's in Traditional
         # Chinese
         tokens = []
-        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+        for _token, start, end in jieba_tokenizer.tokenize(
+            simplify_chinese(text), HMM=False
+        ):
             tokens.append(text[start:end])
         return tokens
@@ -12,20 +12,19 @@ SPACELESS_SCRIPTS = [
     # characters, are covered by the \p{IsIdeo} check. Checking for
     # Script=Hani and IsIdeo slows down our regexes with huge, redundant
     # classes of characters. Instead, we'll list the exceptions below.
-
-    'Hira',  # Hiragana
-    'Kana',  # Katakana
-    'Thai',  # Thai script
-    'Khmr',  # Khmer script
-    'Laoo',  # Lao script
-    'Mymr',  # Burmese script
-    'Tale',  # Tai Le script
-    'Talu',  # Tai Lü script
-    'Lana',  # Lanna script
+    "Hira",  # Hiragana
+    "Kana",  # Katakana
+    "Thai",  # Thai script
+    "Khmr",  # Khmer script
+    "Laoo",  # Lao script
+    "Mymr",  # Burmese script
+    "Tale",  # Tai Le script
+    "Talu",  # Tai Lü script
+    "Lana",  # Lanna script
 ]


-EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+EXTRA_JAPANESE_CHARACTERS = "ー々〻〆"

 # ー is a lengthening mark that's both hiragana and katakana. Unicode
 # segmentation handles it as a special case, but we're overriding standard
@@ -54,7 +53,7 @@ def _language_in_list(language, targets, max_distance=10):
     objects. `targets` can be any iterable of such languages.
     """
     matched = closest_match(language, targets, max_distance=max_distance)
-    return matched[0] != 'und'
+    return matched[0] != "und"


 @lru_cache(maxsize=None)
@@ -131,42 +130,42 @@ def get_language_info(language):
     # Start the `info` dictionary with default values, including the 'script'
     # value that we now know from `language_full`.
     info = {
-        'script': language_full.script,
-        'tokenizer': 'regex',
-        'normal_form': 'NFKC',
-        'remove_marks': False,
-        'dotless_i': False,
-        'diacritics_under': None,
-        'transliteration': None,
-        'lookup_transliteration': None
+        "script": language_full.script,
+        "tokenizer": "regex",
+        "normal_form": "NFKC",
+        "remove_marks": False,
+        "dotless_i": False,
+        "diacritics_under": None,
+        "transliteration": None,
+        "lookup_transliteration": None,
     }

-    if _language_in_list(language, ['ja', 'ko']):
-        info['tokenizer'] = 'mecab'
-    elif _language_in_list(language, ['zh', 'yue']):
-        info['tokenizer'] = 'jieba'
-    elif info['script'] in SPACELESS_SCRIPTS:
-        info['tokenizer'] = None
+    if _language_in_list(language, ["ja", "ko"]):
+        info["tokenizer"] = "mecab"
+    elif _language_in_list(language, ["zh", "yue"]):
+        info["tokenizer"] = "jieba"
+    elif info["script"] in SPACELESS_SCRIPTS:
+        info["tokenizer"] = None

     # Cased alphabetic scripts get NFC normal form
-    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
-        info['normal_form'] = 'NFC'
+    if info["script"] in ["Latn", "Grek", "Cyrl"]:
+        info["normal_form"] = "NFC"

-    if info['script'] in ['Arab', 'Hebr']:
-        info['remove_marks'] = True
+    if info["script"] in ["Arab", "Hebr"]:
+        info["remove_marks"] = True

-    if _language_in_list(language, ['tr', 'az', 'kk']):
-        info['dotless_i'] = True
-        info['diacritics_under'] = 'cedillas'
-    elif _language_in_list(language, ['ro']):
-        info['diacritics_under'] = 'commas'
+    if _language_in_list(language, ["tr", "az", "kk"]):
+        info["dotless_i"] = True
+        info["diacritics_under"] = "cedillas"
+    elif _language_in_list(language, ["ro"]):
+        info["diacritics_under"] = "commas"

-    if _language_in_list(language, ['sr']):
-        info['transliteration'] = 'sr-Latn'
-    elif _language_in_list(language, ['az']):
-        info['transliteration'] = 'az-Latn'
+    if _language_in_list(language, ["sr"]):
+        info["transliteration"] = "sr-Latn"
+    elif _language_in_list(language, ["az"]):
+        info["transliteration"] = "az-Latn"

-    if language.language == 'zh' and language.script != 'Hant':
-        info['lookup_transliteration'] = 'zh-Hans'
+    if language.language == "zh" and language.script != "Hant":
+        info["lookup_transliteration"] = "zh-Hans"

     return info
@@ -13,11 +13,13 @@ def make_mecab_analyzer(lang):
     Get a MeCab analyzer object, given the language code of the language to
     analyze.
     """
-    if lang == 'ko':
+    if lang == "ko":
         import mecab_ko_dic
+
         return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
-    elif lang == 'ja':
+    elif lang == "ja":
         import ipadic
+
         return MeCab.Tagger(ipadic.MECAB_ARGS)
     else:
         raise ValueError("Can't run MeCab on language {lang}".format(lang))
@@ -40,10 +42,12 @@ def mecab_tokenize(text, lang):
         MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)

     analyzer = MECAB_ANALYZERS[lang]
-    text = unicodedata.normalize('NFKC', text.strip())
+    text = unicodedata.normalize("NFKC", text.strip())
     analyzed = analyzer.parse(text)
     if not analyzed:
         return []
-    return [line.split('\t')[0]
-            for line in analyzed.split('\n')
-            if line != '' and line != 'EOS']
+    return [
+        line.split("\t")[0]
+        for line in analyzed.split("\n")
+        if line != "" and line != "EOS"
+    ]
@@ -4,10 +4,10 @@ import unicodedata
 from .language_info import get_language_info
 from .transliterate import transliterate

-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)

-DIGIT_RE = regex.compile(r'\d')
-MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
+DIGIT_RE = regex.compile(r"\d")
+MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")


 def preprocess_text(text, language):
@@ -171,26 +171,26 @@ def preprocess_text(text, language):
     """
     # NFC or NFKC normalization, as needed for the language
     info = get_language_info(language)
-    text = unicodedata.normalize(info['normal_form'], text)
+    text = unicodedata.normalize(info["normal_form"], text)

     # Transliteration of multi-script languages
-    if info['transliteration'] is not None:
-        text = transliterate(info['transliteration'], text)
+    if info["transliteration"] is not None:
+        text = transliterate(info["transliteration"], text)

     # Abjad mark removal
-    if info['remove_marks']:
+    if info["remove_marks"]:
         text = remove_marks(text)

     # Case folding
-    if info['dotless_i']:
+    if info["dotless_i"]:
         text = casefold_with_i_dots(text)
     else:
         text = text.casefold()

     # Fixing of diacritics
-    if info['diacritics_under'] == 'commas':
+    if info["diacritics_under"] == "commas":
         text = cedillas_to_commas(text)
-    elif info['diacritics_under'] == 'cedillas':
+    elif info["diacritics_under"] == "cedillas":
         text = commas_to_cedillas(text)

     return text
@@ -205,7 +205,7 @@ def remove_marks(text):
     - Tatweels, horizontal segments that are used to extend or justify an
       Arabic word.
     """
-    return MARK_RE.sub('', text)
+    return MARK_RE.sub("", text)


 def casefold_with_i_dots(text):
@@ -214,7 +214,7 @@ def casefold_with_i_dots(text):
     that's appropriate for Turkish and related languages, then case-fold
     the rest of the letters.
     """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
     return text.casefold()


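As a side note (not part of the diff), the dotted/dotless-i handling in `casefold_with_i_dots` above can be exercised on its own. The function body is copied from the hunk; the sample word is only an illustration:

    import unicodedata

    def casefold_with_i_dots(text):
        # Map the Turkish capitals İ -> i and I -> ı before the generic casefold.
        text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
        return text.casefold()

    print(casefold_with_i_dots("DİYARBAKIR"))  # diyarbakır
    # Plain str.casefold() would instead leave a combining dot above the "i" and
    # turn the dotless I into a plain "i", which is wrong for Turkish.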
@@ -227,11 +227,11 @@ def commas_to_cedillas(text):
     text has already been case-folded.
     """
     return text.replace(
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
+        "\N{LATIN SMALL LETTER S WITH COMMA BELOW}",
+        "\N{LATIN SMALL LETTER S WITH CEDILLA}",
     ).replace(
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
+        "\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
+        "\N{LATIN SMALL LETTER T WITH CEDILLA}",
     )


@@ -244,11 +244,11 @@ def cedillas_to_commas(text):
     text has already been case-folded.
     """
     return text.replace(
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
+        "\N{LATIN SMALL LETTER S WITH CEDILLA}",
+        "\N{LATIN SMALL LETTER S WITH COMMA BELOW}",
     ).replace(
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
+        "\N{LATIN SMALL LETTER T WITH CEDILLA}",
+        "\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
     )


@@ -257,7 +257,7 @@ def _sub_zeroes(match):
     Given a regex match, return what it matched with digits replaced by
     zeroes.
     """
-    return DIGIT_RE.sub('0', match.group(0))
+    return DIGIT_RE.sub("0", match.group(0))


 def num_generic_digits(text):
@@ -22,17 +22,17 @@ logger = logging.getLogger(__name__)

 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
-    pieces = [r'\p{IsIdeo}'] + [
-        r'\p{Script=%s}' % script_code for script_code in scripts
+    pieces = [r"\p{IsIdeo}"] + [
+        r"\p{Script=%s}" % script_code for script_code in scripts
     ]
-    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
+    return "".join(pieces) + EXTRA_JAPANESE_CHARACTERS


 SPACELESS_EXPR = _make_spaceless_expr()

 # All vowels that might appear at the start of a word in French or Catalan,
 # plus 'h' which would be silent and imply a following vowel sound.
-INITIAL_VOWEL_EXPR = '[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]'
+INITIAL_VOWEL_EXPR = "[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]"

 TOKEN_RE = regex.compile(
     r"""
@@ -148,9 +148,9 @@ TOKEN_RE = regex.compile(

     \w\w?'
     """.replace(
-        '<SPACELESS>', SPACELESS_EXPR
+        "<SPACELESS>", SPACELESS_EXPR
     ).replace(
-        '<VOWEL>', INITIAL_VOWEL_EXPR
+        "<VOWEL>", INITIAL_VOWEL_EXPR
     ),
     regex.V1 | regex.WORD | regex.VERBOSE,
 )
@@ -167,9 +167,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(
     \X+? (?: @s? (?!w) | \b) |  # Case 3
     \w\w?'                      # Case 4
     """.replace(
-        '<SPACELESS>', SPACELESS_EXPR
+        "<SPACELESS>", SPACELESS_EXPR
     ).replace(
-        '<VOWEL>', INITIAL_VOWEL_EXPR
+        "<VOWEL>", INITIAL_VOWEL_EXPR
     ),
     regex.V1 | regex.WORD | regex.VERBOSE,
 )
@@ -207,12 +207,9 @@ def simple_tokenize(text, include_punctuation=False):
     tokens that are much too long, but the alternative is that every grapheme
     would end up in its own token, which is worse.
     """
-    text = unicodedata.normalize('NFC', text)
+    text = unicodedata.normalize("NFC", text)
     if include_punctuation:
-        return [
-            token.casefold()
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
+        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
     else:
         return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]

@@ -257,7 +254,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     info = get_language_info(language)
     text = preprocess_text(text, language)

-    if info['tokenizer'] == 'mecab':
+    if info["tokenizer"] == "mecab":
         from wordfreq.mecab import mecab_tokenize as _mecab_tokenize

         # Get just the language code out of the Language object, so we can
@@ -265,7 +262,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         tokens = _mecab_tokenize(text, language.language)
         if not include_punctuation:
             tokens = [token for token in tokens if not PUNCT_RE.match(token)]
-    elif info['tokenizer'] == 'jieba':
+    elif info["tokenizer"] == "jieba":
         from wordfreq.chinese import jieba_tokenize as _jieba_tokenize

         tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
@@ -275,11 +272,11 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         # This is the default case where we use the regex tokenizer. First
         # let's complain a bit if we ended up here because we don't have an
         # appropriate tokenizer.
-        if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
+        if info["tokenizer"] != "regex" and lang not in _WARNED_LANGUAGES:
             logger.warning(
                 "The language '{}' is in the '{}' script, which we don't "
                 "have a tokenizer for. The results will be bad.".format(
-                    lang, info['script']
+                    lang, info["script"]
                 )
             )
             _WARNED_LANGUAGES.add(lang)
@@ -288,9 +285,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     return tokens


-def lossy_tokenize(
-    text, lang, include_punctuation=False, external_wordlist=False
-):
+def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Get a list of tokens for this text, with largely the same results and
     options as `tokenize`, but aggressively normalize some text in a lossy way
@@ -316,7 +311,7 @@ def lossy_tokenize(
     info = get_language_info(lang)
     tokens = tokenize(text, lang, include_punctuation, external_wordlist)

-    if info['lookup_transliteration'] == 'zh-Hans':
+    if info["lookup_transliteration"] == "zh-Hans":
         from wordfreq.chinese import simplify_chinese as _simplify_chinese

         tokens = [_simplify_chinese(token) for token in tokens]
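Finally, a minimal sketch of the tokenizer entry points touched above (not part of the diff). Results for languages that route through MeCab or Jieba additionally require those optional tokenizers to be installed:

    from wordfreq import simple_tokenize, tokenize

    print(simple_tokenize("This is a test."))  # ['this', 'is', 'a', 'test']
    print(tokenize("café EXPRESSO", "en"))     # ['café', 'expresso']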