diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index c58de33..9b5e724 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 
 CACHE_SIZE = 100000
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
+DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))
 
 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -75,44 +75,43 @@ def read_cBpack(filename):
         ['blue', 'red']
     ]
     """
-    with gzip.open(filename, 'rb') as infile:
+    with gzip.open(filename, "rb") as infile:
         data = msgpack.load(infile, raw=False)
     header = data[0]
     if (
-        not isinstance(header, dict) or header.get('format') != 'cB'
-        or header.get('version') != 1
+        not isinstance(header, dict)
+        or header.get("format") != "cB"
+        or header.get("version") != 1
     ):
         raise ValueError("Unexpected header: %r" % header)
     return data[1:]
 
 
-def available_languages(wordlist='best'):
+def available_languages(wordlist="best"):
     """
     Given a wordlist name, return a dictionary of language codes to filenames,
     representing all the languages in which that wordlist is available.
     """
-    if wordlist == 'best':
-        available = available_languages('small')
-        available.update(available_languages('large'))
+    if wordlist == "best":
+        available = available_languages("small")
+        available.update(available_languages("large"))
         return available
-    elif wordlist == 'combined':
-        logger.warning(
-            "The 'combined' wordlists have been renamed to 'small'."
-        )
-        wordlist = 'small'
+    elif wordlist == "combined":
+        logger.warning("The 'combined' wordlists have been renamed to 'small'.")
+        wordlist = "small"
 
     available = {}
-    for path in DATA_PATH.glob('*.msgpack.gz'):
-        if not path.name.startswith('_'):
-            list_name = path.name.split('.')[0]
-            name, lang = list_name.split('_')
+    for path in DATA_PATH.glob("*.msgpack.gz"):
+        if not path.name.startswith("_"):
+            list_name = path.name.split(".")[0]
+            name, lang = list_name.split("_")
             if name == wordlist:
                 available[lang] = str(path)
     return available
 
 
 @lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='best', match_cutoff=None):
+def get_frequency_list(lang, wordlist="best", match_cutoff=None):
     """
     Read the raw data from a wordlist file, returning it as a list of lists.
     (See `read_cBpack` for what this represents.)
@@ -123,27 +122,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=None):
     Looking up the alternate code 'por' will also get the same list.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     available = available_languages(wordlist)
 
     # TODO: decrease the maximum distance. This distance is so high just
     # because it allows a test where 'yue' matches 'zh', and maybe the
     # distance between those is high because they shouldn't match.
-    best, _distance = langcodes.closest_match(
-        lang, list(available), max_distance=70
-    )
-    if best == 'und':
-        raise LookupError("No wordlist %r available for language %r"
-                          % (wordlist, lang))
+    best, _distance = langcodes.closest_match(lang, list(available), max_distance=70)
+    if best == "und":
+        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))
 
     if best != lang:
         logger.warning(
             "You asked for word frequencies in language %r. Using the "
-            "nearest match, which is %r."
-            % (lang, best)
+            "nearest match, which is %r." % (lang, best)
         )
 
     return read_cBpack(available[best])
@@ -161,9 +153,7 @@ def cB_to_freq(cB):
     In general, x cB represents a frequency of 10 ** (x/100).
     """
     if cB > 0:
-        raise ValueError(
-            "A frequency cannot be a positive number of centibels."
-        )
+        raise ValueError("A frequency cannot be a positive number of centibels.")
     return 10 ** (cB / 100)
 
 
@@ -192,7 +182,7 @@ def zipf_to_freq(zipf):
     words. For example, a word that occurs once per million words is at 3.0 on
     the Zipf scale.
     """
-    return 10 ** zipf / 1e9
+    return 10**zipf / 1e9
 
 
 def freq_to_zipf(freq):
@@ -204,16 +194,13 @@
 
 
 @lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
+def get_frequency_dict(lang, wordlist="best", match_cutoff=None):
     """
     Get a word frequency list as a dictionary, mapping tokens to frequencies
     as floating-point probabilities.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     freqs = {}
     pack = get_frequency_list(lang, wordlist)
     for index, bucket in enumerate(pack):
@@ -223,7 +210,7 @@ def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
     return freqs
 
 
-def iter_wordlist(lang, wordlist='best'):
+def iter_wordlist(lang, wordlist="best"):
     """
     Yield the words in a wordlist in approximate descending order of
     frequency.
@@ -258,12 +245,12 @@ def _word_frequency(word, lang, wordlist, minimum):
             # If any word is missing, just return the default value
             return minimum
         # spread the frequency of digits over all digit combinations
-        freq = freqs[token] / (10. ** digits)
+        freq = freqs[token] / (10.0**digits)
         one_over_result += 1.0 / freq
 
     freq = 1.0 / one_over_result
 
-    if get_language_info(lang)['tokenizer'] == 'jieba':
+    if get_language_info(lang)["tokenizer"] == "jieba":
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
@@ -272,14 +259,14 @@ def _word_frequency(word, lang, wordlist, minimum):
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
     unrounded = max(freq, minimum)
-    if unrounded == 0.:
-        return 0.
+    if unrounded == 0.0:
+        return 0.0
     else:
         leading_zeroes = math.floor(-math.log(unrounded, 10))
         return round(unrounded, leading_zeroes + 3)
 
 
-def word_frequency(word, lang, wordlist='best', minimum=0.):
+def word_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`.
@@ -306,7 +293,7 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
     return _wf_cache[args]
 
 
-def zipf_frequency(word, lang, wordlist='best', minimum=0.):
+def zipf_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word`, in the language with code `lang`, on the Zipf
     scale.
@@ -334,7 +321,7 @@
 
 
 @lru_cache(maxsize=100)
-def top_n_list(lang, n, wordlist='best', ascii_only=False):
+def top_n_list(lang, n, wordlist="best", ascii_only=False):
     """
     Return a frequency list of length `n` in descending order of frequency.
     This list contains words from `wordlist`, of the given language.
@@ -342,15 +329,16 @@ def top_n_list(lang, n, wordlist='best', ascii_only=False):
     """
     results = []
     for word in iter_wordlist(lang, wordlist):
-        if (not ascii_only) or max(word) <= '~':
+        if (not ascii_only) or max(word) <= "~":
            results.append(word)
            if len(results) >= n:
                break
     return results
 
 
-def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
-                 ascii_only=False):
+def random_words(
+    lang="en", wordlist="best", nwords=5, bits_per_word=12, ascii_only=False
+):
     """
     Returns a string of random, space separated words.
 
@@ -364,18 +352,17 @@ def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
     You can restrict the selection of words to those written in ASCII
     characters by setting `ascii_only` to True.
     """
-    n_choices = 2 ** bits_per_word
+    n_choices = 2**bits_per_word
     choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
     if len(choices) < n_choices:
         raise ValueError(
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    return ' '.join([random.choice(choices) for i in range(nwords)])
+    return " ".join([random.choice(choices) for i in range(nwords)])
 
 
-def random_ascii_words(lang='en', wordlist='best', nwords=5,
-                       bits_per_word=12):
+def random_ascii_words(lang="en", wordlist="best", nwords=5, bits_per_word=12):
     """
     Returns a string of random, space separated, ASCII words.
 
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index 95b6b5d..8b30095 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -3,11 +3,13 @@ import jieba
 import msgpack
 import gzip
 
-DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
-ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
-SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh.txt")
+ORIG_DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh_orig.txt")
+SIMP_MAP_FILENAME = resource_filename("wordfreq", "data/_chinese_mapping.msgpack.gz")
 try:
-    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+    SIMPLIFIED_MAP = msgpack.load(
+        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
+    )
 except TypeError:
     # work around incompatibility between pure-Python msgpack and C msgpack
     SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
@@ -58,6 +60,8 @@ def jieba_tokenize(text, external_wordlist=False):
     # those spans from the original text, even if it's in Traditional
     # Chinese
     tokens = []
-    for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+    for _token, start, end in jieba_tokenizer.tokenize(
+        simplify_chinese(text), HMM=False
+    ):
         tokens.append(text[start:end])
     return tokens
diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py
index 73a7b69..3e48e1f 100644
--- a/wordfreq/language_info.py
+++ b/wordfreq/language_info.py
@@ -12,20 +12,19 @@ SPACELESS_SCRIPTS = [
     # characters, are covered by the \p{IsIdeo} check. Checking for
     # Script=Hani and IsIdeo slows down our regexes with huge, redundant
     # classes of characters. Instead, we'll list the exceptions below.
-
-    'Hira',  # Hiragana
-    'Kana',  # Katakana
-    'Thai',  # Thai script
-    'Khmr',  # Khmer script
-    'Laoo',  # Lao script
-    'Mymr',  # Burmese script
-    'Tale',  # Tai Le script
-    'Talu',  # Tai Lü script
-    'Lana',  # Lanna script
+    "Hira",  # Hiragana
+    "Kana",  # Katakana
+    "Thai",  # Thai script
+    "Khmr",  # Khmer script
+    "Laoo",  # Lao script
+    "Mymr",  # Burmese script
+    "Tale",  # Tai Le script
+    "Talu",  # Tai Lü script
+    "Lana",  # Lanna script
 ]
 
-EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+EXTRA_JAPANESE_CHARACTERS = "ー々〻〆"
 
 # ー is a lengthening mark that's both hiragana and katakana. Unicode
 # segmentation handles it as a special case, but we're overriding standard
@@ -54,7 +53,7 @@ def _language_in_list(language, targets, max_distance=10):
     objects. `targets` can be any iterable of such languages.
     """
     matched = closest_match(language, targets, max_distance=max_distance)
-    return matched[0] != 'und'
+    return matched[0] != "und"
 
 
 @lru_cache(maxsize=None)
@@ -131,42 +130,42 @@ def get_language_info(language):
     # Start the `info` dictionary with default values, including the 'script'
     # value that we now know from `language_full`.
     info = {
-        'script': language_full.script,
-        'tokenizer': 'regex',
-        'normal_form': 'NFKC',
-        'remove_marks': False,
-        'dotless_i': False,
-        'diacritics_under': None,
-        'transliteration': None,
-        'lookup_transliteration': None
+        "script": language_full.script,
+        "tokenizer": "regex",
+        "normal_form": "NFKC",
+        "remove_marks": False,
+        "dotless_i": False,
+        "diacritics_under": None,
+        "transliteration": None,
+        "lookup_transliteration": None,
     }
 
-    if _language_in_list(language, ['ja', 'ko']):
-        info['tokenizer'] = 'mecab'
-    elif _language_in_list(language, ['zh', 'yue']):
-        info['tokenizer'] = 'jieba'
-    elif info['script'] in SPACELESS_SCRIPTS:
-        info['tokenizer'] = None
+    if _language_in_list(language, ["ja", "ko"]):
+        info["tokenizer"] = "mecab"
+    elif _language_in_list(language, ["zh", "yue"]):
+        info["tokenizer"] = "jieba"
+    elif info["script"] in SPACELESS_SCRIPTS:
+        info["tokenizer"] = None
 
     # Cased alphabetic scripts get NFC normal form
-    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
-        info['normal_form'] = 'NFC'
+    if info["script"] in ["Latn", "Grek", "Cyrl"]:
+        info["normal_form"] = "NFC"
 
-    if info['script'] in ['Arab', 'Hebr']:
-        info['remove_marks'] = True
+    if info["script"] in ["Arab", "Hebr"]:
+        info["remove_marks"] = True
 
-    if _language_in_list(language, ['tr', 'az', 'kk']):
-        info['dotless_i'] = True
-        info['diacritics_under'] = 'cedillas'
-    elif _language_in_list(language, ['ro']):
-        info['diacritics_under'] = 'commas'
+    if _language_in_list(language, ["tr", "az", "kk"]):
+        info["dotless_i"] = True
+        info["diacritics_under"] = "cedillas"
+    elif _language_in_list(language, ["ro"]):
+        info["diacritics_under"] = "commas"
 
-    if _language_in_list(language, ['sr']):
-        info['transliteration'] = 'sr-Latn'
-    elif _language_in_list(language, ['az']):
-        info['transliteration'] = 'az-Latn'
+    if _language_in_list(language, ["sr"]):
+        info["transliteration"] = "sr-Latn"
+    elif _language_in_list(language, ["az"]):
+        info["transliteration"] = "az-Latn"
 
-    if language.language == 'zh' and language.script != 'Hant':
-        info['lookup_transliteration'] = 'zh-Hans'
+    if language.language == "zh" and language.script != "Hant":
+        info["lookup_transliteration"] = "zh-Hans"
 
     return info
diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index fee555c..8dd19f7 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -13,11 +13,13 @@ def make_mecab_analyzer(lang):
     Get a MeCab analyzer object, given the language code of the language to
     analyze.
     """
-    if lang == 'ko':
+    if lang == "ko":
         import mecab_ko_dic
+
         return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
-    elif lang == 'ja':
+    elif lang == "ja":
         import ipadic
+
         return MeCab.Tagger(ipadic.MECAB_ARGS)
     else:
         raise ValueError("Can't run MeCab on language {lang}".format(lang))
@@ -40,10 +42,12 @@ def mecab_tokenize(text, lang):
         MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
-    text = unicodedata.normalize('NFKC', text.strip())
+    text = unicodedata.normalize("NFKC", text.strip())
     analyzed = analyzer.parse(text)
     if not analyzed:
         return []
-    return [line.split('\t')[0]
-            for line in analyzed.split('\n')
-            if line != '' and line != 'EOS']
+    return [
+        line.split("\t")[0]
+        for line in analyzed.split("\n")
+        if line != "" and line != "EOS"
+    ]
diff --git a/wordfreq/preprocess.py b/wordfreq/preprocess.py
index 0d3145b..88342c8 100644
--- a/wordfreq/preprocess.py
+++ b/wordfreq/preprocess.py
@@ -4,10 +4,10 @@ import unicodedata
 from .language_info import get_language_info
 from .transliterate import transliterate
 
-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)
 
-DIGIT_RE = regex.compile(r'\d')
-MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
+DIGIT_RE = regex.compile(r"\d")
+MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
 
 
 def preprocess_text(text, language):
@@ -171,26 +171,26 @@
     """
     # NFC or NFKC normalization, as needed for the language
     info = get_language_info(language)
-    text = unicodedata.normalize(info['normal_form'], text)
+    text = unicodedata.normalize(info["normal_form"], text)
 
     # Transliteration of multi-script languages
-    if info['transliteration'] is not None:
-        text = transliterate(info['transliteration'], text)
+    if info["transliteration"] is not None:
+        text = transliterate(info["transliteration"], text)
 
     # Abjad mark removal
-    if info['remove_marks']:
+    if info["remove_marks"]:
         text = remove_marks(text)
 
     # Case folding
-    if info['dotless_i']:
+    if info["dotless_i"]:
         text = casefold_with_i_dots(text)
     else:
         text = text.casefold()
 
     # Fixing of diacritics
-    if info['diacritics_under'] == 'commas':
+    if info["diacritics_under"] == "commas":
         text = cedillas_to_commas(text)
-    elif info['diacritics_under'] == 'cedillas':
+    elif info["diacritics_under"] == "cedillas":
         text = commas_to_cedillas(text)
 
     return text
@@ -205,7 +205,7 @@ def remove_marks(text):
     - Tatweels, horizontal segments that are used to extend or justify an
       Arabic word.
     """
-    return MARK_RE.sub('', text)
+    return MARK_RE.sub("", text)
 
 
 def casefold_with_i_dots(text):
@@ -214,7 +214,7 @@ def casefold_with_i_dots(text):
     that's appropriate for Turkish and related languages, then case-fold the
     rest of the letters.
     """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
     return text.casefold()
 
 
@@ -227,11 +227,11 @@ def commas_to_cedillas(text):
     text has already been case-folded.
""" return text.replace( - '\N{LATIN SMALL LETTER S WITH COMMA BELOW}', - '\N{LATIN SMALL LETTER S WITH CEDILLA}' + "\N{LATIN SMALL LETTER S WITH COMMA BELOW}", + "\N{LATIN SMALL LETTER S WITH CEDILLA}", ).replace( - '\N{LATIN SMALL LETTER T WITH COMMA BELOW}', - '\N{LATIN SMALL LETTER T WITH CEDILLA}' + "\N{LATIN SMALL LETTER T WITH COMMA BELOW}", + "\N{LATIN SMALL LETTER T WITH CEDILLA}", ) @@ -244,11 +244,11 @@ def cedillas_to_commas(text): text has already been case-folded. """ return text.replace( - '\N{LATIN SMALL LETTER S WITH CEDILLA}', - '\N{LATIN SMALL LETTER S WITH COMMA BELOW}' + "\N{LATIN SMALL LETTER S WITH CEDILLA}", + "\N{LATIN SMALL LETTER S WITH COMMA BELOW}", ).replace( - '\N{LATIN SMALL LETTER T WITH CEDILLA}', - '\N{LATIN SMALL LETTER T WITH COMMA BELOW}' + "\N{LATIN SMALL LETTER T WITH CEDILLA}", + "\N{LATIN SMALL LETTER T WITH COMMA BELOW}", ) @@ -257,7 +257,7 @@ def _sub_zeroes(match): Given a regex match, return what it matched with digits replaced by zeroes. """ - return DIGIT_RE.sub('0', match.group(0)) + return DIGIT_RE.sub("0", match.group(0)) def num_generic_digits(text): diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index 25d945b..238a2df 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -22,17 +22,17 @@ logger = logging.getLogger(__name__) def _make_spaceless_expr(): scripts = sorted(SPACELESS_SCRIPTS) - pieces = [r'\p{IsIdeo}'] + [ - r'\p{Script=%s}' % script_code for script_code in scripts + pieces = [r"\p{IsIdeo}"] + [ + r"\p{Script=%s}" % script_code for script_code in scripts ] - return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS + return "".join(pieces) + EXTRA_JAPANESE_CHARACTERS SPACELESS_EXPR = _make_spaceless_expr() # All vowels that might appear at the start of a word in French or Catalan, # plus 'h' which would be silent and imply a following vowel sound. -INITIAL_VOWEL_EXPR = '[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]' +INITIAL_VOWEL_EXPR = "[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]" TOKEN_RE = regex.compile( r""" @@ -148,9 +148,9 @@ TOKEN_RE = regex.compile( \w\w?' """.replace( - '', SPACELESS_EXPR + "", SPACELESS_EXPR ).replace( - '', INITIAL_VOWEL_EXPR + "", INITIAL_VOWEL_EXPR ), regex.V1 | regex.WORD | regex.VERBOSE, ) @@ -167,9 +167,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile( \X+? (?: @s? (?!w) | \b) | # Case 3 \w\w?' # Case 4 """.replace( - '', SPACELESS_EXPR + "", SPACELESS_EXPR ).replace( - '', INITIAL_VOWEL_EXPR + "", INITIAL_VOWEL_EXPR ), regex.V1 | regex.WORD | regex.VERBOSE, ) @@ -207,12 +207,9 @@ def simple_tokenize(text, include_punctuation=False): tokens that are much too long, but the alternative is that every grapheme would end up in its own token, which is worse. 
""" - text = unicodedata.normalize('NFC', text) + text = unicodedata.normalize("NFC", text) if include_punctuation: - return [ - token.casefold() - for token in TOKEN_RE_WITH_PUNCTUATION.findall(text) - ] + return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)] else: return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)] @@ -257,7 +254,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): info = get_language_info(language) text = preprocess_text(text, language) - if info['tokenizer'] == 'mecab': + if info["tokenizer"] == "mecab": from wordfreq.mecab import mecab_tokenize as _mecab_tokenize # Get just the language code out of the Language object, so we can @@ -265,7 +262,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): tokens = _mecab_tokenize(text, language.language) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] - elif info['tokenizer'] == 'jieba': + elif info["tokenizer"] == "jieba": from wordfreq.chinese import jieba_tokenize as _jieba_tokenize tokens = _jieba_tokenize(text, external_wordlist=external_wordlist) @@ -275,11 +272,11 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): # This is the default case where we use the regex tokenizer. First # let's complain a bit if we ended up here because we don't have an # appropriate tokenizer. - if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES: + if info["tokenizer"] != "regex" and lang not in _WARNED_LANGUAGES: logger.warning( "The language '{}' is in the '{}' script, which we don't " "have a tokenizer for. The results will be bad.".format( - lang, info['script'] + lang, info["script"] ) ) _WARNED_LANGUAGES.add(lang) @@ -288,9 +285,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): return tokens -def lossy_tokenize( - text, lang, include_punctuation=False, external_wordlist=False -): +def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False): """ Get a list of tokens for this text, with largely the same results and options as `tokenize`, but aggressively normalize some text in a lossy way @@ -316,7 +311,7 @@ def lossy_tokenize( info = get_language_info(lang) tokens = tokenize(text, lang, include_punctuation, external_wordlist) - if info['lookup_transliteration'] == 'zh-Hans': + if info["lookup_transliteration"] == "zh-Hans": from wordfreq.chinese import simplify_chinese as _simplify_chinese tokens = [_simplify_chinese(token) for token in tokens]