diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index c58de33..9b5e724 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 
 CACHE_SIZE = 100000
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
+DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))
 
 # We'll divide the frequency by 10 for each token boundary that was inferred.
 # (We determined the factor of 10 empirically by looking at words in the
@@ -75,44 +75,43 @@ def read_cBpack(filename):
         ['blue', 'red']
     ]
     """
-    with gzip.open(filename, 'rb') as infile:
+    with gzip.open(filename, "rb") as infile:
         data = msgpack.load(infile, raw=False)
     header = data[0]
     if (
-        not isinstance(header, dict) or header.get('format') != 'cB'
-        or header.get('version') != 1
+        not isinstance(header, dict)
+        or header.get("format") != "cB"
+        or header.get("version") != 1
     ):
         raise ValueError("Unexpected header: %r" % header)
     return data[1:]
 
 
-def available_languages(wordlist='best'):
+def available_languages(wordlist="best"):
     """
     Given a wordlist name, return a dictionary of language codes to filenames,
     representing all the languages in which that wordlist is available.
     """
-    if wordlist == 'best':
-        available = available_languages('small')
-        available.update(available_languages('large'))
+    if wordlist == "best":
+        available = available_languages("small")
+        available.update(available_languages("large"))
         return available
-    elif wordlist == 'combined':
-        logger.warning(
-            "The 'combined' wordlists have been renamed to 'small'."
-        )
-        wordlist = 'small'
+    elif wordlist == "combined":
+        logger.warning("The 'combined' wordlists have been renamed to 'small'.")
+        wordlist = "small"
 
     available = {}
-    for path in DATA_PATH.glob('*.msgpack.gz'):
-        if not path.name.startswith('_'):
-            list_name = path.name.split('.')[0]
-            name, lang = list_name.split('_')
+    for path in DATA_PATH.glob("*.msgpack.gz"):
+        if not path.name.startswith("_"):
+            list_name = path.name.split(".")[0]
+            name, lang = list_name.split("_")
             if name == wordlist:
                 available[lang] = str(path)
     return available
 
 
 @lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='best', match_cutoff=None):
+def get_frequency_list(lang, wordlist="best", match_cutoff=None):
     """
     Read the raw data from a wordlist file, returning it as a list of lists.
     (See `read_cBpack` for what this represents.)
@@ -123,27 +122,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=None):
     Looking up the alternate code 'por' will also get the same list.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     available = available_languages(wordlist)
 
     # TODO: decrease the maximum distance. This distance is so high just
     # because it allows a test where 'yue' matches 'zh', and maybe the
     # distance between those is high because they shouldn't match.
-    best, _distance = langcodes.closest_match(
-        lang, list(available), max_distance=70
-    )
-    if best == 'und':
-        raise LookupError("No wordlist %r available for language %r"
-                          % (wordlist, lang))
+    best, _distance = langcodes.closest_match(lang, list(available), max_distance=70)
+    if best == "und":
+        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))
 
     if best != lang:
         logger.warning(
             "You asked for word frequencies in language %r. Using the "
-            "nearest match, which is %r."
-            % (lang, best)
+            "nearest match, which is %r." % (lang, best)
         )
 
     return read_cBpack(available[best])
@@ -161,9 +153,7 @@ def cB_to_freq(cB):
     In general, x cB represents a frequency of 10 ** (x/100).
     """
     if cB > 0:
-        raise ValueError(
-            "A frequency cannot be a positive number of centibels."
-        )
+        raise ValueError("A frequency cannot be a positive number of centibels.")
     return 10 ** (cB / 100)
 
 
@@ -192,7 +182,7 @@ def zipf_to_freq(zipf):
     words. For example, a word that occurs once per million words is at 3.0 on
     the Zipf scale.
     """
-    return 10 ** zipf / 1e9
+    return 10**zipf / 1e9
 
 
 def freq_to_zipf(freq):
@@ -204,16 +194,13 @@
 
 
 @lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
+def get_frequency_dict(lang, wordlist="best", match_cutoff=None):
     """
     Get a word frequency list as a dictionary, mapping tokens to frequencies
     as floating-point probabilities.
     """
     if match_cutoff is not None:
-        warnings.warn(
-            "The `match_cutoff` parameter is deprecated",
-            DeprecationWarning
-        )
+        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
     freqs = {}
     pack = get_frequency_list(lang, wordlist)
     for index, bucket in enumerate(pack):
@@ -223,7 +210,7 @@ def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
     return freqs
 
 
-def iter_wordlist(lang, wordlist='best'):
+def iter_wordlist(lang, wordlist="best"):
     """
     Yield the words in a wordlist in approximate descending order of
     frequency.
@@ -258,12 +245,12 @@ def _word_frequency(word, lang, wordlist, minimum):
             # If any word is missing, just return the default value
             return minimum
         # spread the frequency of digits over all digit combinations
-        freq = freqs[token] / (10. ** digits)
+        freq = freqs[token] / (10.0**digits)
         one_over_result += 1.0 / freq
 
     freq = 1.0 / one_over_result
 
-    if get_language_info(lang)['tokenizer'] == 'jieba':
+    if get_language_info(lang)["tokenizer"] == "jieba":
         # If we used the Jieba tokenizer, we could tokenize anything to match
         # our wordlist, even nonsense. To counteract this, we multiply by a
         # probability for each word break that was inferred.
@@ -272,14 +259,14 @@ def _word_frequency(word, lang, wordlist, minimum):
     # All our frequency data is only precise to within 1% anyway, so round
     # it to 3 significant digits
     unrounded = max(freq, minimum)
-    if unrounded == 0.:
-        return 0.
+    if unrounded == 0.0:
+        return 0.0
     else:
         leading_zeroes = math.floor(-math.log(unrounded, 10))
         return round(unrounded, leading_zeroes + 3)
 
 
-def word_frequency(word, lang, wordlist='best', minimum=0.):
+def word_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word` in the language with code `lang`, from the
     specified `wordlist`.
@@ -306,7 +293,7 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
     return _wf_cache[args]
 
 
-def zipf_frequency(word, lang, wordlist='best', minimum=0.):
+def zipf_frequency(word, lang, wordlist="best", minimum=0.0):
     """
     Get the frequency of `word`, in the language with code `lang`, on the Zipf
     scale.
@@ -334,7 +321,7 @@
 
 
 @lru_cache(maxsize=100)
-def top_n_list(lang, n, wordlist='best', ascii_only=False):
+def top_n_list(lang, n, wordlist="best", ascii_only=False):
     """
     Return a frequency list of length `n` in descending order of frequency.
     This list contains words from `wordlist`, of the given language.
@@ -342,15 +329,16 @@ def top_n_list(lang, n, wordlist='best', ascii_only=False):
     """
     results = []
     for word in iter_wordlist(lang, wordlist):
-        if (not ascii_only) or max(word) <= '~':
+        if (not ascii_only) or max(word) <= "~":
            results.append(word)
            if len(results) >= n:
                break
     return results
 
 
-def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
-                 ascii_only=False):
+def random_words(
+    lang="en", wordlist="best", nwords=5, bits_per_word=12, ascii_only=False
+):
     """
     Returns a string of random, space separated words.
 
@@ -364,18 +352,17 @@ def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
     You can restrict the selection of words to those written in ASCII
     characters by setting `ascii_only` to True.
     """
-    n_choices = 2 ** bits_per_word
+    n_choices = 2**bits_per_word
     choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
     if len(choices) < n_choices:
         raise ValueError(
             "There aren't enough words in the wordlist to provide %d bits of "
             "entropy per word." % bits_per_word
         )
-    return ' '.join([random.choice(choices) for i in range(nwords)])
+    return " ".join([random.choice(choices) for i in range(nwords)])
 
 
-def random_ascii_words(lang='en', wordlist='best', nwords=5,
-                       bits_per_word=12):
+def random_ascii_words(lang="en", wordlist="best", nwords=5, bits_per_word=12):
     """
     Returns a string of random, space separated, ASCII words.
 
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index 95b6b5d..8b30095 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -3,11 +3,13 @@ import jieba
 import msgpack
 import gzip
 
-DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
-ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
-SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
+DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh.txt")
+ORIG_DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh_orig.txt")
+SIMP_MAP_FILENAME = resource_filename("wordfreq", "data/_chinese_mapping.msgpack.gz")
 try:
-    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+    SIMPLIFIED_MAP = msgpack.load(
+        gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
+    )
 except TypeError:
     # work around incompatibility between pure-Python msgpack and C msgpack
     SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
@@ -58,6 +60,8 @@ def jieba_tokenize(text, external_wordlist=False):
     # those spans from the original text, even if it's in Traditional
     # Chinese
     tokens = []
-    for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
+    for _token, start, end in jieba_tokenizer.tokenize(
+        simplify_chinese(text), HMM=False
+    ):
         tokens.append(text[start:end])
     return tokens
diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py
index 73a7b69..3e48e1f 100644
--- a/wordfreq/language_info.py
+++ b/wordfreq/language_info.py
@@ -12,20 +12,19 @@ SPACELESS_SCRIPTS = [
     # characters, are covered by the \p{IsIdeo} check. Checking for
     # Script=Hani and IsIdeo slows down our regexes with huge, redundant
     # classes of characters. Instead, we'll list the exceptions below.
-
-    'Hira',  # Hiragana
-    'Kana',  # Katakana
-    'Thai',  # Thai script
-    'Khmr',  # Khmer script
-    'Laoo',  # Lao script
-    'Mymr',  # Burmese script
-    'Tale',  # Tai Le script
-    'Talu',  # Tai Lü script
-    'Lana',  # Lanna script
+    "Hira",  # Hiragana
+    "Kana",  # Katakana
+    "Thai",  # Thai script
+    "Khmr",  # Khmer script
+    "Laoo",  # Lao script
+    "Mymr",  # Burmese script
+    "Tale",  # Tai Le script
+    "Talu",  # Tai Lü script
+    "Lana",  # Lanna script
 ]
 
-EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+EXTRA_JAPANESE_CHARACTERS = "ー々〻〆"
 
 # ー is a lengthening mark that's both hiragana and katakana. Unicode
 # segmentation handles it as a special case, but we're overriding standard
@@ -54,7 +53,7 @@ def _language_in_list(language, targets, max_distance=10):
     objects. `targets` can be any iterable of such languages.
     """
     matched = closest_match(language, targets, max_distance=max_distance)
-    return matched[0] != 'und'
+    return matched[0] != "und"
 
 
 @lru_cache(maxsize=None)
@@ -131,42 +130,42 @@ def get_language_info(language):
     # Start the `info` dictionary with default values, including the 'script'
     # value that we now know from `language_full`.
     info = {
-        'script': language_full.script,
-        'tokenizer': 'regex',
-        'normal_form': 'NFKC',
-        'remove_marks': False,
-        'dotless_i': False,
-        'diacritics_under': None,
-        'transliteration': None,
-        'lookup_transliteration': None
+        "script": language_full.script,
+        "tokenizer": "regex",
+        "normal_form": "NFKC",
+        "remove_marks": False,
+        "dotless_i": False,
+        "diacritics_under": None,
+        "transliteration": None,
+        "lookup_transliteration": None,
     }
 
-    if _language_in_list(language, ['ja', 'ko']):
-        info['tokenizer'] = 'mecab'
-    elif _language_in_list(language, ['zh', 'yue']):
-        info['tokenizer'] = 'jieba'
-    elif info['script'] in SPACELESS_SCRIPTS:
-        info['tokenizer'] = None
+    if _language_in_list(language, ["ja", "ko"]):
+        info["tokenizer"] = "mecab"
+    elif _language_in_list(language, ["zh", "yue"]):
+        info["tokenizer"] = "jieba"
+    elif info["script"] in SPACELESS_SCRIPTS:
+        info["tokenizer"] = None
 
     # Cased alphabetic scripts get NFC normal form
-    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
-        info['normal_form'] = 'NFC'
+    if info["script"] in ["Latn", "Grek", "Cyrl"]:
+        info["normal_form"] = "NFC"
 
-    if info['script'] in ['Arab', 'Hebr']:
-        info['remove_marks'] = True
+    if info["script"] in ["Arab", "Hebr"]:
+        info["remove_marks"] = True
 
-    if _language_in_list(language, ['tr', 'az', 'kk']):
-        info['dotless_i'] = True
-        info['diacritics_under'] = 'cedillas'
-    elif _language_in_list(language, ['ro']):
-        info['diacritics_under'] = 'commas'
+    if _language_in_list(language, ["tr", "az", "kk"]):
+        info["dotless_i"] = True
+        info["diacritics_under"] = "cedillas"
+    elif _language_in_list(language, ["ro"]):
+        info["diacritics_under"] = "commas"
 
-    if _language_in_list(language, ['sr']):
-        info['transliteration'] = 'sr-Latn'
-    elif _language_in_list(language, ['az']):
-        info['transliteration'] = 'az-Latn'
+    if _language_in_list(language, ["sr"]):
+        info["transliteration"] = "sr-Latn"
+    elif _language_in_list(language, ["az"]):
+        info["transliteration"] = "az-Latn"
 
-    if language.language == 'zh' and language.script != 'Hant':
-        info['lookup_transliteration'] = 'zh-Hans'
+    if language.language == "zh" and language.script != "Hant":
+        info["lookup_transliteration"] = "zh-Hans"
 
     return info
diff --git a/wordfreq/mecab.py b/wordfreq/mecab.py
index fee555c..8dd19f7 100644
--- a/wordfreq/mecab.py
+++ b/wordfreq/mecab.py
@@ -13,11 +13,13 @@ def make_mecab_analyzer(lang):
     Get a MeCab analyzer object, given the language code of the language to
     analyze.
     """
-    if lang == 'ko':
+    if lang == "ko":
         import mecab_ko_dic
+
         return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
-    elif lang == 'ja':
+    elif lang == "ja":
         import ipadic
+
         return MeCab.Tagger(ipadic.MECAB_ARGS)
     else:
         raise ValueError("Can't run MeCab on language {lang}".format(lang))
@@ -40,10 +42,12 @@ def mecab_tokenize(text, lang):
         MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)
 
     analyzer = MECAB_ANALYZERS[lang]
-    text = unicodedata.normalize('NFKC', text.strip())
+    text = unicodedata.normalize("NFKC", text.strip())
     analyzed = analyzer.parse(text)
     if not analyzed:
         return []
-    return [line.split('\t')[0]
-            for line in analyzed.split('\n')
-            if line != '' and line != 'EOS']
+    return [
+        line.split("\t")[0]
+        for line in analyzed.split("\n")
+        if line != "" and line != "EOS"
+    ]
diff --git a/wordfreq/preprocess.py b/wordfreq/preprocess.py
index 0d3145b..88342c8 100644
--- a/wordfreq/preprocess.py
+++ b/wordfreq/preprocess.py
@@ -4,10 +4,10 @@ import unicodedata
 from .language_info import get_language_info
 from .transliterate import transliterate
 
-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)
 
-DIGIT_RE = regex.compile(r'\d')
-MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
+DIGIT_RE = regex.compile(r"\d")
+MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
 
 
 def preprocess_text(text, language):
@@ -171,26 +171,26 @@
     """
     # NFC or NFKC normalization, as needed for the language
     info = get_language_info(language)
-    text = unicodedata.normalize(info['normal_form'], text)
+    text = unicodedata.normalize(info["normal_form"], text)
 
     # Transliteration of multi-script languages
-    if info['transliteration'] is not None:
-        text = transliterate(info['transliteration'], text)
+    if info["transliteration"] is not None:
+        text = transliterate(info["transliteration"], text)
 
     # Abjad mark removal
-    if info['remove_marks']:
+    if info["remove_marks"]:
         text = remove_marks(text)
 
     # Case folding
-    if info['dotless_i']:
+    if info["dotless_i"]:
         text = casefold_with_i_dots(text)
     else:
         text = text.casefold()
 
     # Fixing of diacritics
-    if info['diacritics_under'] == 'commas':
+    if info["diacritics_under"] == "commas":
         text = cedillas_to_commas(text)
-    elif info['diacritics_under'] == 'cedillas':
+    elif info["diacritics_under"] == "cedillas":
         text = commas_to_cedillas(text)
 
     return text
@@ -205,7 +205,7 @@ def remove_marks(text):
     - Tatweels, horizontal segments that are used to extend or justify an
       Arabic word.
     """
-    return MARK_RE.sub('', text)
+    return MARK_RE.sub("", text)
 
 
 def casefold_with_i_dots(text):
@@ -214,7 +214,7 @@ def casefold_with_i_dots(text):
     that's appropriate for Turkish and related languages, then case-fold the
     rest of the letters.
     """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    text = unicodedata.normalize("NFC", text).replace("İ", "i").replace("I", "ı")
     return text.casefold()
 
 
@@ -227,11 +227,11 @@ def commas_to_cedillas(text):
     text has already been case-folded.
""" return text.replace( - '\N{LATIN SMALL LETTER S WITH COMMA BELOW}', - '\N{LATIN SMALL LETTER S WITH CEDILLA}' + "\N{LATIN SMALL LETTER S WITH COMMA BELOW}", + "\N{LATIN SMALL LETTER S WITH CEDILLA}", ).replace( - '\N{LATIN SMALL LETTER T WITH COMMA BELOW}', - '\N{LATIN SMALL LETTER T WITH CEDILLA}' + "\N{LATIN SMALL LETTER T WITH COMMA BELOW}", + "\N{LATIN SMALL LETTER T WITH CEDILLA}", ) @@ -244,11 +244,11 @@ def cedillas_to_commas(text): text has already been case-folded. """ return text.replace( - '\N{LATIN SMALL LETTER S WITH CEDILLA}', - '\N{LATIN SMALL LETTER S WITH COMMA BELOW}' + "\N{LATIN SMALL LETTER S WITH CEDILLA}", + "\N{LATIN SMALL LETTER S WITH COMMA BELOW}", ).replace( - '\N{LATIN SMALL LETTER T WITH CEDILLA}', - '\N{LATIN SMALL LETTER T WITH COMMA BELOW}' + "\N{LATIN SMALL LETTER T WITH CEDILLA}", + "\N{LATIN SMALL LETTER T WITH COMMA BELOW}", ) @@ -257,7 +257,7 @@ def _sub_zeroes(match): Given a regex match, return what it matched with digits replaced by zeroes. """ - return DIGIT_RE.sub('0', match.group(0)) + return DIGIT_RE.sub("0", match.group(0)) def num_generic_digits(text): diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index 25d945b..238a2df 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -22,17 +22,17 @@ logger = logging.getLogger(__name__) def _make_spaceless_expr(): scripts = sorted(SPACELESS_SCRIPTS) - pieces = [r'\p{IsIdeo}'] + [ - r'\p{Script=%s}' % script_code for script_code in scripts + pieces = [r"\p{IsIdeo}"] + [ + r"\p{Script=%s}" % script_code for script_code in scripts ] - return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS + return "".join(pieces) + EXTRA_JAPANESE_CHARACTERS SPACELESS_EXPR = _make_spaceless_expr() # All vowels that might appear at the start of a word in French or Catalan, # plus 'h' which would be silent and imply a following vowel sound. -INITIAL_VOWEL_EXPR = '[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]' +INITIAL_VOWEL_EXPR = "[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]" TOKEN_RE = regex.compile( r""" @@ -148,9 +148,9 @@ TOKEN_RE = regex.compile( \w\w?' """.replace( - '', SPACELESS_EXPR + "", SPACELESS_EXPR ).replace( - '', INITIAL_VOWEL_EXPR + "", INITIAL_VOWEL_EXPR ), regex.V1 | regex.WORD | regex.VERBOSE, ) @@ -167,9 +167,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile( \X+? (?: @s? (?!w) | \b) | # Case 3 \w\w?' # Case 4 """.replace( - '', SPACELESS_EXPR + "", SPACELESS_EXPR ).replace( - '', INITIAL_VOWEL_EXPR + "", INITIAL_VOWEL_EXPR ), regex.V1 | regex.WORD | regex.VERBOSE, ) @@ -207,12 +207,9 @@ def simple_tokenize(text, include_punctuation=False): tokens that are much too long, but the alternative is that every grapheme would end up in its own token, which is worse. 
""" - text = unicodedata.normalize('NFC', text) + text = unicodedata.normalize("NFC", text) if include_punctuation: - return [ - token.casefold() - for token in TOKEN_RE_WITH_PUNCTUATION.findall(text) - ] + return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)] else: return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)] @@ -257,7 +254,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): info = get_language_info(language) text = preprocess_text(text, language) - if info['tokenizer'] == 'mecab': + if info["tokenizer"] == "mecab": from wordfreq.mecab import mecab_tokenize as _mecab_tokenize # Get just the language code out of the Language object, so we can @@ -265,7 +262,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): tokens = _mecab_tokenize(text, language.language) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] - elif info['tokenizer'] == 'jieba': + elif info["tokenizer"] == "jieba": from wordfreq.chinese import jieba_tokenize as _jieba_tokenize tokens = _jieba_tokenize(text, external_wordlist=external_wordlist) @@ -275,11 +272,11 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): # This is the default case where we use the regex tokenizer. First # let's complain a bit if we ended up here because we don't have an # appropriate tokenizer. - if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES: + if info["tokenizer"] != "regex" and lang not in _WARNED_LANGUAGES: logger.warning( "The language '{}' is in the '{}' script, which we don't " "have a tokenizer for. The results will be bad.".format( - lang, info['script'] + lang, info["script"] ) ) _WARNED_LANGUAGES.add(lang) @@ -288,9 +285,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False): return tokens -def lossy_tokenize( - text, lang, include_punctuation=False, external_wordlist=False -): +def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False): """ Get a list of tokens for this text, with largely the same results and options as `tokenize`, but aggressively normalize some text in a lossy way @@ -316,7 +311,7 @@ def lossy_tokenize( info = get_language_info(lang) tokens = tokenize(text, lang, include_punctuation, external_wordlist) - if info['lookup_transliteration'] == 'zh-Hans': + if info["lookup_transliteration"] == "zh-Hans": from wordfreq.chinese import simplify_chinese as _simplify_chinese tokens = [_simplify_chinese(token) for token in tokens]