run black

Elia Robyn Lake 2022-02-08 18:27:18 -05:00
parent 3c4819e7e5
commit ef4d6fe0df
6 changed files with 137 additions and 148 deletions

View File

@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)

 CACHE_SIZE = 100000
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
+DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))

 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -75,44 +75,43 @@ def read_cBpack(filename):
         ['blue', 'red']
     ]
     """
-    with gzip.open(filename, 'rb') as infile:
+    with gzip.open(filename, "rb") as infile:
         data = msgpack.load(infile, raw=False)
         header = data[0]
         if (
-            not isinstance(header, dict) or header.get('format') != 'cB'
-            or header.get('version') != 1
+            not isinstance(header, dict)
+            or header.get("format") != "cB"
+            or header.get("version") != 1
         ):
             raise ValueError("Unexpected header: %r" % header)
         return data[1:]


-def available_languages(wordlist='best'):
+def available_languages(wordlist="best"):
     """
     Given a wordlist name, return a dictionary of language codes to filenames,
     representing all the languages in which that wordlist is available.
     """
-    if wordlist == 'best':
-        available = available_languages('small')
-        available.update(available_languages('large'))
+    if wordlist == "best":
+        available = available_languages("small")
+        available.update(available_languages("large"))
         return available
-    elif wordlist == 'combined':
-        logger.warning(
-            "The 'combined' wordlists have been renamed to 'small'."
-        )
-        wordlist = 'small'
+    elif wordlist == "combined":
+        logger.warning("The 'combined' wordlists have been renamed to 'small'.")
+        wordlist = "small"

     available = {}
-    for path in DATA_PATH.glob('*.msgpack.gz'):
-        if not path.name.startswith('_'):
-            list_name = path.name.split('.')[0]
-            name, lang = list_name.split('_')
+    for path in DATA_PATH.glob("*.msgpack.gz"):
+        if not path.name.startswith("_"):
+            list_name = path.name.split(".")[0]
+            name, lang = list_name.split("_")
             if name == wordlist:
                 available[lang] = str(path)
     return available


 @lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='best', match_cutoff=None):
+def get_frequency_list(lang, wordlist="best", match_cutoff=None):
     """
     Read the raw data from a wordlist file, returning it as a list of
     lists. (See `read_cBpack` for what this represents.)
@@ -123,27 +122,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=None):
     Looking up the alternate code 'por' will also get the same list.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     available = available_languages(wordlist)

     # TODO: decrease the maximum distance. This distance is so high just
     # because it allows a test where 'yue' matches 'zh', and maybe the
     # distance between those is high because they shouldn't match.
-    best, _distance = langcodes.closest_match(
-        lang, list(available), max_distance=70
-    )
-    if best == 'und':
-        raise LookupError("No wordlist %r available for language %r"
-                          % (wordlist, lang))
+    best, _distance = langcodes.closest_match(lang, list(available), max_distance=70)
+    if best == "und":
+        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))

     if best != lang:
         logger.warning(
             "You asked for word frequencies in language %r. Using the "
-            "nearest match, which is %r."
-            % (lang, best)
+            "nearest match, which is %r." % (lang, best)
         )

     return read_cBpack(available[best])
@@ -161,9 +153,7 @@ def cB_to_freq(cB):
     In general, x cB represents a frequency of 10 ** (x/100).
     """
     if cB > 0:
-        raise ValueError(
-            "A frequency cannot be a positive number of centibels."
-        )
+        raise ValueError("A frequency cannot be a positive number of centibels.")
     return 10 ** (cB / 100)
@@ -192,7 +182,7 @@ def zipf_to_freq(zipf):
     words. For example, a word that occurs once per million words is at 3.0 on
     the Zipf scale.
     """
-    return 10 ** zipf / 1e9
+    return 10**zipf / 1e9


 def freq_to_zipf(freq):
@@ -204,16 +194,13 @@ def freq_to_zipf(freq):


 @lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
+def get_frequency_dict(lang, wordlist="best", match_cutoff=None):
     """
     Get a word frequency list as a dictionary, mapping tokens to
     frequencies as floating-point probabilities.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     freqs = {}
     pack = get_frequency_list(lang, wordlist)
     for index, bucket in enumerate(pack):
@@ -223,7 +210,7 @@ def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
     return freqs


-def iter_wordlist(lang, wordlist='best'):
+def iter_wordlist(lang, wordlist="best"):
     """
     Yield the words in a wordlist in approximate descending order of
     frequency.
@@ -258,12 +245,12 @@ def _word_frequency(word, lang, wordlist, minimum):
             # If any word is missing, just return the default value
             return minimum
         # spread the frequency of digits over all digit combinations
-        freq = freqs[token] / (10. ** digits)
+        freq = freqs[token] / (10.0**digits)
         one_over_result += 1.0 / freq

     freq = 1.0 / one_over_result

-    if get_language_info(lang)['tokenizer'] == 'jieba':
+    if get_language_info(lang)["tokenizer"] == "jieba":
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
@@ -272,14 +259,14 @@ def _word_frequency(word, lang, wordlist, minimum):
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
     unrounded = max(freq, minimum)
-    if unrounded == 0.:
-        return 0.
+    if unrounded == 0.0:
+        return 0.0
     else:
         leading_zeroes = math.floor(-math.log(unrounded, 10))
         return round(unrounded, leading_zeroes + 3)


-def word_frequency(word, lang, wordlist='best', minimum=0.):
+def word_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`.
@@ -306,7 +293,7 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
     return _wf_cache[args]


-def zipf_frequency(word, lang, wordlist='best', minimum=0.):
+def zipf_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word`, in the language with code `lang`, on the Zipf
     scale.
@@ -334,7 +321,7 @@ def zipf_frequency(word, lang, wordlist='best', minimum=0.):


 @lru_cache(maxsize=100)
-def top_n_list(lang, n, wordlist='best', ascii_only=False):
+def top_n_list(lang, n, wordlist="best", ascii_only=False):
     """
     Return a frequency list of length `n` in descending order of frequency.
     This list contains words from `wordlist`, of the given language.
@@ -342,15 +329,16 @@ def top_n_list(lang, n, wordlist='best', ascii_only=False):
     """
     results = []
     for word in iter_wordlist(lang, wordlist):
-        if (not ascii_only) or max(word) <= '~':
+        if (not ascii_only) or max(word) <= "~":
             results.append(word)
             if len(results) >= n:
                 break
     return results


-def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
-                 ascii_only=False):
+def random_words(
+    lang="en", wordlist="best", nwords=5, bits_per_word=12, ascii_only=False
+):
     """
     Returns a string of random, space separated words.
@@ -364,18 +352,17 @@ def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
     You can restrict the selection of words to those written in ASCII
     characters by setting `ascii_only` to True.
     """
-    n_choices = 2 ** bits_per_word
+    n_choices = 2**bits_per_word
     choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
     if len(choices) < n_choices:
         raise ValueError(
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    return ' '.join([random.choice(choices) for i in range(nwords)])
+    return " ".join([random.choice(choices) for i in range(nwords)])


-def random_ascii_words(lang='en', wordlist='best', nwords=5,
-                       bits_per_word=12):
+def random_ascii_words(lang="en", wordlist="best", nwords=5, bits_per_word=12):
     """
     Returns a string of random, space separated, ASCII words.
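The frequency-scale helpers touched above are easy to sanity-check on their own. The following standalone Python sketch (an illustration, not part of the commit) restates the two formulas visible in the hunks, 10 ** (cB / 100) and 10**zipf / 1e9, derives freq_to_zipf as their inverse, and confirms the docstring's example that one occurrence per million words sits at 3.0 on the Zipf scale.

import math

def cB_to_freq(cB: float) -> float:
    # x centibels corresponds to a frequency of 10 ** (x / 100)
    if cB > 0:
        raise ValueError("A frequency cannot be a positive number of centibels.")
    return 10 ** (cB / 100)

def zipf_to_freq(zipf: float) -> float:
    # The Zipf scale is log10 of frequency per billion words
    return 10**zipf / 1e9

def freq_to_zipf(freq: float) -> float:
    # Inverse of zipf_to_freq
    return math.log10(freq) + 9

# One occurrence per million words is Zipf 3.0, as the docstring says
assert math.isclose(zipf_to_freq(3.0), 1e-6)
assert math.isclose(freq_to_zipf(1e-6), 3.0)
# The same frequency expressed in centibels is -600
assert math.isclose(cB_to_freq(-600), 1e-6)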

View File

@@ -3,11 +3,13 @@ import jieba
 import msgpack
 import gzip

-DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
-ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
-SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh.txt")
+ORIG_DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh_orig.txt")
+SIMP_MAP_FILENAME = resource_filename("wordfreq", "data/_chinese_mapping.msgpack.gz")
 try:
-    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+    SIMPLIFIED_MAP = msgpack.load(
+        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
+    )
 except TypeError:
     # work around incompatibility between pure-Python msgpack and C msgpack
     SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
@@ -58,6 +60,8 @@ def jieba_tokenize(text, external_wordlist=False):
         # those spans from the original text, even if it's in Traditional
         # Chinese
         tokens = []
-        for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+        for _token, start, end in jieba_tokenizer.tokenize(
+            simplify_chinese(text), HMM=False
+        ):
             tokens.append(text[start:end])
         return tokens

View File

@@ -12,20 +12,19 @@ SPACELESS_SCRIPTS = [
     # characters, are covered by the \p{IsIdeo} check. Checking for
     # Script=Hani and IsIdeo slows down our regexes with huge, redundant
     # classes of characters. Instead, we'll list the exceptions below.
-
-    'Hira',  # Hiragana
-    'Kana',  # Katakana
-    'Thai',  # Thai script
-    'Khmr',  # Khmer script
-    'Laoo',  # Lao script
-    'Mymr',  # Burmese script
-    'Tale',  # Tai Le script
-    'Talu',  # Tai Lü script
-    'Lana',  # Lanna script
+    "Hira",  # Hiragana
+    "Kana",  # Katakana
+    "Thai",  # Thai script
+    "Khmr",  # Khmer script
+    "Laoo",  # Lao script
+    "Mymr",  # Burmese script
+    "Tale",  # Tai Le script
+    "Talu",  # Tai Lü script
+    "Lana",  # Lanna script
 ]

-EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+EXTRA_JAPANESE_CHARACTERS = "ー々〻〆"

 # ー is a lengthening mark that's both hiragana and katakana. Unicode
 # segmentation handles it as a special case, but we're overriding standard
@@ -54,7 +53,7 @@ def _language_in_list(language, targets, max_distance=10):
     objects. `targets` can be any iterable of such languages.
     """
     matched = closest_match(language, targets, max_distance=max_distance)
-    return matched[0] != 'und'
+    return matched[0] != "und"


 @lru_cache(maxsize=None)
@@ -131,42 +130,42 @@ def get_language_info(language):
     # Start the `info` dictionary with default values, including the 'script'
     # value that we now know from `language_full`.
     info = {
-        'script': language_full.script,
-        'tokenizer': 'regex',
-        'normal_form': 'NFKC',
-        'remove_marks': False,
-        'dotless_i': False,
-        'diacritics_under': None,
-        'transliteration': None,
-        'lookup_transliteration': None
+        "script": language_full.script,
+        "tokenizer": "regex",
+        "normal_form": "NFKC",
+        "remove_marks": False,
+        "dotless_i": False,
+        "diacritics_under": None,
+        "transliteration": None,
+        "lookup_transliteration": None,
     }

-    if _language_in_list(language, ['ja', 'ko']):
-        info['tokenizer'] = 'mecab'
-    elif _language_in_list(language, ['zh', 'yue']):
-        info['tokenizer'] = 'jieba'
-    elif info['script'] in SPACELESS_SCRIPTS:
-        info['tokenizer'] = None
+    if _language_in_list(language, ["ja", "ko"]):
+        info["tokenizer"] = "mecab"
+    elif _language_in_list(language, ["zh", "yue"]):
+        info["tokenizer"] = "jieba"
+    elif info["script"] in SPACELESS_SCRIPTS:
+        info["tokenizer"] = None

     # Cased alphabetic scripts get NFC normal form
-    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
-        info['normal_form'] = 'NFC'
+    if info["script"] in ["Latn", "Grek", "Cyrl"]:
+        info["normal_form"] = "NFC"

-    if info['script'] in ['Arab', 'Hebr']:
-        info['remove_marks'] = True
+    if info["script"] in ["Arab", "Hebr"]:
+        info["remove_marks"] = True

-    if _language_in_list(language, ['tr', 'az', 'kk']):
-        info['dotless_i'] = True
-        info['diacritics_under'] = 'cedillas'
-    elif _language_in_list(language, ['ro']):
-        info['diacritics_under'] = 'commas'
+    if _language_in_list(language, ["tr", "az", "kk"]):
+        info["dotless_i"] = True
+        info["diacritics_under"] = "cedillas"
+    elif _language_in_list(language, ["ro"]):
+        info["diacritics_under"] = "commas"

-    if _language_in_list(language, ['sr']):
-        info['transliteration'] = 'sr-Latn'
-    elif _language_in_list(language, ['az']):
-        info['transliteration'] = 'az-Latn'
+    if _language_in_list(language, ["sr"]):
+        info["transliteration"] = "sr-Latn"
+    elif _language_in_list(language, ["az"]):
+        info["transliteration"] = "az-Latn"

-    if language.language == 'zh' and language.script != 'Hant':
-        info['lookup_transliteration'] = 'zh-Hans'
+    if language.language == "zh" and language.script != "Hant":
+        info["lookup_transliteration"] = "zh-Hans"

     return info
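For orientation, the flags built above translate into values like the following. A hedged usage sketch, assuming wordfreq is installed and that get_language_info accepts a plain language-code string, which is how the callers elsewhere in this commit (for example _word_frequency and lossy_tokenize) pass it:

from wordfreq.language_info import get_language_info

info_tr = get_language_info("tr")
assert info_tr["dotless_i"] is True              # Turkish I/İ handling
assert info_tr["diacritics_under"] == "cedillas"

info_ja = get_language_info("ja")
assert info_ja["tokenizer"] == "mecab"           # Japanese is routed to MeCab

info_sr = get_language_info("sr")
assert info_sr["transliteration"] == "sr-Latn"   # Serbian Cyrillic -> Latin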

View File

@@ -13,11 +13,13 @@ def make_mecab_analyzer(lang):
     Get a MeCab analyzer object, given the language code of the language to
     analyze.
     """
-    if lang == 'ko':
+    if lang == "ko":
         import mecab_ko_dic
+
         return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
-    elif lang == 'ja':
+    elif lang == "ja":
         import ipadic
+
         return MeCab.Tagger(ipadic.MECAB_ARGS)
     else:
         raise ValueError("Can't run MeCab on language {lang}".format(lang))
@@ -40,10 +42,12 @@ def mecab_tokenize(text, lang):
         MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)

     analyzer = MECAB_ANALYZERS[lang]
-    text = unicodedata.normalize('NFKC', text.strip())
+    text = unicodedata.normalize("NFKC", text.strip())
     analyzed = analyzer.parse(text)
     if not analyzed:
         return []
-    return [line.split('\t')[0]
-            for line in analyzed.split('\n')
-            if line != '' and line != 'EOS']
+    return [
+        line.split("\t")[0]
+        for line in analyzed.split("\n")
+        if line != "" and line != "EOS"
+    ]

View File

@@ -4,10 +4,10 @@ import unicodedata
 from .language_info import get_language_info
 from .transliterate import transliterate

-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
-DIGIT_RE = regex.compile(r'\d')
-MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
+MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)
+DIGIT_RE = regex.compile(r"\d")
+MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")


 def preprocess_text(text, language):
@@ -171,26 +171,26 @@ def preprocess_text(text, language):
     """
     # NFC or NFKC normalization, as needed for the language
     info = get_language_info(language)
-    text = unicodedata.normalize(info['normal_form'], text)
+    text = unicodedata.normalize(info["normal_form"], text)

     # Transliteration of multi-script languages
-    if info['transliteration'] is not None:
-        text = transliterate(info['transliteration'], text)
+    if info["transliteration"] is not None:
+        text = transliterate(info["transliteration"], text)

     # Abjad mark removal
-    if info['remove_marks']:
+    if info["remove_marks"]:
         text = remove_marks(text)

     # Case folding
-    if info['dotless_i']:
+    if info["dotless_i"]:
         text = casefold_with_i_dots(text)
     else:
         text = text.casefold()

     # Fixing of diacritics
-    if info['diacritics_under'] == 'commas':
+    if info["diacritics_under"] == "commas":
         text = cedillas_to_commas(text)
-    elif info['diacritics_under'] == 'cedillas':
+    elif info["diacritics_under"] == "cedillas":
         text = commas_to_cedillas(text)

     return text
@@ -205,7 +205,7 @@ def remove_marks(text):
     - Tatweels, horizontal segments that are used to extend or justify an
       Arabic word.
     """
-    return MARK_RE.sub('', text)
+    return MARK_RE.sub("", text)


 def casefold_with_i_dots(text):
@@ -214,7 +214,7 @@ def casefold_with_i_dots(text):
     that's appropriate for Turkish and related languages, then case-fold
     the rest of the letters.
     """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
     return text.casefold()
@@ -227,11 +227,11 @@ def commas_to_cedillas(text):
     text has already been case-folded.
     """
     return text.replace(
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
+        "\N{LATIN SMALL LETTER S WITH COMMA BELOW}",
+        "\N{LATIN SMALL LETTER S WITH CEDILLA}",
     ).replace(
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
+        "\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
+        "\N{LATIN SMALL LETTER T WITH CEDILLA}",
     )
@@ -244,11 +244,11 @@ def cedillas_to_commas(text):
     text has already been case-folded.
     """
     return text.replace(
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
+        "\N{LATIN SMALL LETTER S WITH CEDILLA}",
+        "\N{LATIN SMALL LETTER S WITH COMMA BELOW}",
     ).replace(
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
+        "\N{LATIN SMALL LETTER T WITH CEDILLA}",
+        "\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
     )
@@ -257,7 +257,7 @@ def _sub_zeroes(match):
     Given a regex match, return what it matched with digits replaced by
     zeroes.
     """
-    return DIGIT_RE.sub('0', match.group(0))
+    return DIGIT_RE.sub("0", match.group(0))


 def num_generic_digits(text):
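The Turkish-specific case folding reformatted above deserves a tiny illustration. A standalone sketch (not part of the commit) that mirrors casefold_with_i_dots exactly as the hunk shows it:

import unicodedata

def casefold_with_i_dots(text: str) -> str:
    # Convert dotted İ and dotless I first, then case-fold the rest
    text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
    return text.casefold()

assert casefold_with_i_dots("İstanbul") == "istanbul"
assert casefold_with_i_dots("DİYARBAKIR") == "diyarbakır"
# Plain str.casefold() would yield "i̇stanbul" with a stray combining dot;
# preprocess_text avoids that for languages where info["dotless_i"] is True.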

View File

@@ -22,17 +22,17 @@ logger = logging.getLogger(__name__)


 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
-    pieces = [r'\p{IsIdeo}'] + [
-        r'\p{Script=%s}' % script_code for script_code in scripts
-    ]
-    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
+    pieces = [r"\p{IsIdeo}"] + [
+        r"\p{Script=%s}" % script_code for script_code in scripts
+    ]
+    return "".join(pieces) + EXTRA_JAPANESE_CHARACTERS


 SPACELESS_EXPR = _make_spaceless_expr()

 # All vowels that might appear at the start of a word in French or Catalan,
 # plus 'h' which would be silent and imply a following vowel sound.
-INITIAL_VOWEL_EXPR = '[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]'
+INITIAL_VOWEL_EXPR = "[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]"

 TOKEN_RE = regex.compile(
     r"""
@@ -148,9 +148,9 @@ TOKEN_RE = regex.compile(
     \w\w?'
     """.replace(
-        '<SPACELESS>', SPACELESS_EXPR
+        "<SPACELESS>", SPACELESS_EXPR
     ).replace(
-        '<VOWEL>', INITIAL_VOWEL_EXPR
+        "<VOWEL>", INITIAL_VOWEL_EXPR
     ),
     regex.V1 | regex.WORD | regex.VERBOSE,
 )
@@ -167,9 +167,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(
     \X+? (?: @s? (?!w) | \b) |  # Case 3
     \w\w?'                      # Case 4
     """.replace(
-        '<SPACELESS>', SPACELESS_EXPR
+        "<SPACELESS>", SPACELESS_EXPR
     ).replace(
-        '<VOWEL>', INITIAL_VOWEL_EXPR
+        "<VOWEL>", INITIAL_VOWEL_EXPR
     ),
     regex.V1 | regex.WORD | regex.VERBOSE,
 )
@@ -207,12 +207,9 @@ def simple_tokenize(text, include_punctuation=False):
     tokens that are much too long, but the alternative is that every grapheme
     would end up in its own token, which is worse.
     """
-    text = unicodedata.normalize('NFC', text)
+    text = unicodedata.normalize("NFC", text)
     if include_punctuation:
-        return [
-            token.casefold()
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
+        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
     else:
         return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
@@ -257,7 +254,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     info = get_language_info(language)
     text = preprocess_text(text, language)

-    if info['tokenizer'] == 'mecab':
+    if info["tokenizer"] == "mecab":
         from wordfreq.mecab import mecab_tokenize as _mecab_tokenize

         # Get just the language code out of the Language object, so we can
@@ -265,7 +262,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         tokens = _mecab_tokenize(text, language.language)
         if not include_punctuation:
             tokens = [token for token in tokens if not PUNCT_RE.match(token)]
-    elif info['tokenizer'] == 'jieba':
+    elif info["tokenizer"] == "jieba":
         from wordfreq.chinese import jieba_tokenize as _jieba_tokenize

         tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
@@ -275,11 +272,11 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         # This is the default case where we use the regex tokenizer. First
         # let's complain a bit if we ended up here because we don't have an
         # appropriate tokenizer.
-        if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
+        if info["tokenizer"] != "regex" and lang not in _WARNED_LANGUAGES:
             logger.warning(
                 "The language '{}' is in the '{}' script, which we don't "
                 "have a tokenizer for. The results will be bad.".format(
-                    lang, info['script']
+                    lang, info["script"]
                 )
             )
             _WARNED_LANGUAGES.add(lang)
@@ -288,9 +285,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     return tokens


-def lossy_tokenize(
-    text, lang, include_punctuation=False, external_wordlist=False
-):
+def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Get a list of tokens for this text, with largely the same results and
     options as `tokenize`, but aggressively normalize some text in a lossy way
@@ -316,7 +311,7 @@ def lossy_tokenize(
     info = get_language_info(lang)
     tokens = tokenize(text, lang, include_punctuation, external_wordlist)

-    if info['lookup_transliteration'] == 'zh-Hans':
+    if info["lookup_transliteration"] == "zh-Hans":
         from wordfreq.chinese import simplify_chinese as _simplify_chinese

         tokens = [_simplify_chinese(token) for token in tokens]
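Taken together, the tokenize hunks above keep the same dispatch: the regex tokenizer for most languages, MeCab for Japanese and Korean, Jieba for Chinese. A hedged usage sketch, assuming wordfreq is installed with its MeCab and Jieba extras; the comments describe the expected shape of the output rather than verified output of this exact revision:

from wordfreq import tokenize

print(tokenize("This is a test.", "en"))  # regex path: ['this', 'is', 'a', 'test']
print(tokenize("これはテストです", "ja"))  # MeCab path: a list of Japanese word tokens
print(tokenize("这是一个测试", "zh"))      # Jieba path: a list of Chinese word tokens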