Separate preprocessing from tokenization

Robyn Speer 2018-03-08 16:25:45 -05:00
parent 72646f16a1
commit 5ab5d2ea55
7 changed files with 555 additions and 252 deletions


@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize
+    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, raises
@@ -164,13 +164,13 @@ def test_casefolding():
 
 def test_number_smashing():
     eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
         ['715', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
+    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
         ['000', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
+    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
         ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
-    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
-    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
+    eq_(lossy_tokenize('1', 'en'), ['1'])
+    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
+    eq_(lossy_tokenize('24601', 'en'), ['00000'])
     eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
@@ -231,6 +231,7 @@ def test_ideographic_fallback():
         ['ひらがな', 'カタカナ', 'romaji']
     )
 
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,
     # we would actually split this into a three-word phrase.


@@ -83,5 +83,3 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
     eq_(tokenize('谢谢谢谢', 'yue'), tokens)

wordfreq/__init__.py

@@ -1,4 +1,3 @@
-from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
@@ -10,6 +9,9 @@ import random
 import logging
 import math
 
+from .tokens import tokenize, simple_tokenize, lossy_tokenize
+from .language_info import get_language_info
+
 logger = logging.getLogger(__name__)
@@ -30,8 +32,9 @@ INFERRED_SPACE_LANGUAGES = {'zh'}
 # frequency.)
 INFERRED_SPACE_FACTOR = 10.0
 
-# simple_tokenize is imported so that other things can import it from here.
-# Suppress the pyflakes warning.
+# tokenize and simple_tokenize are imported so that other things can import
+# them from here. Suppress the pyflakes warning.
+tokenize = tokenize
 simple_tokenize = simple_tokenize
@@ -215,8 +218,9 @@ def iter_wordlist(lang, wordlist='combined'):
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}
 
 def _word_frequency(word, lang, wordlist, minimum):
-    tokens = tokenize(word, lang, combine_numbers=True)
+    tokens = lossy_tokenize(word, lang)
     if not tokens:
         return minimum
@@ -234,7 +238,10 @@ def _word_frequency(word, lang, wordlist, minimum):
     freq = 1.0 / one_over_result
 
-    if lang in INFERRED_SPACE_LANGUAGES:
+    if get_language_info(lang)['tokenizer'] == 'jieba':
+        # If we used the Jieba tokenizer, we could tokenize anything to match
+        # our wordlist, even nonsense. To counteract this, we multiply by a
+        # probability for each word break that was inferred.
+        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
-        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
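A rough sketch of the penalty this hunk applies (editorial, not part of the diff; the token list and starting frequency below are hypothetical, only INFERRED_SPACE_FACTOR and the formula come from the code above):

INFERRED_SPACE_FACTOR = 10.0

# Suppose Jieba split an unknown Chinese phrase into three wordlist entries,
# and combining their frequencies gave an estimate of 1e-5.
tokens = ['水', '果', '汁']     # hypothetical segmentation
combined_freq = 1e-5

# Two word breaks were inferred, so the estimate is divided by 10.0 ** 2,
# making glued-together nonsense look much rarer than real dictionary words.
penalized = combined_freq / INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
print(penalized)               # 1e-07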

wordfreq/language_info.py (new file, 148 lines)

@@ -0,0 +1,148 @@
from langcodes import Language, best_match
# Text in scripts written without spaces has to be handled specially in our
# tokenization regex (see TOKEN_RE in tokens.py). Also, when one of these is
# the script of the language we're analyzing, then we need to either have
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
# Han ideographs are spaceless, but they don't need to appear in this list
# because they have their own cases in get_language_info and TOKEN_RE.
'Hiragana',
# We omit katakana because Unicode regular expressions can already
# tokenize sequences of katakana, and omitting it here means we can also
# recognize a switch between hiragana and katakana as a token boundary.
'Thai', # Thai script
'Khmr', # Khmer script
'Laoo', # Lao script
'Mymr', # Burmese script
'Tale', # Tai Le script
'Talu', # Tai Lü script
'Lana', # Lanna script
]
def _language_in_list(language, targets, min_score=80):
"""
A helper function to determine whether this language matches one of the
target languages, with a match score above a certain threshold.
The languages can be given as strings (language tags) or as Language
objects. `targets` can be any iterable of such languages.
"""
matched = best_match(language, targets, min_score=min_score)
return matched[1] > 0
def get_language_info(language):
"""
Looks up the things we need to know about how to handle text in a given
language. This will return a dictionary with the following fields:
'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...
Indicates the script that tokens in this language should be in,
_after_ our preprocessing. The script for 'zh' is 'Hans', for example,
because even if the input is in Traditional Chinese ('Hant'), we
convert it to Simplified.
'tokenizer': 'regex', 'jieba', 'mecab', or None
Indicates the best way we know to separate tokens in the language.
'regex' is what will be used for most languages, meaning that we can
segment the text with a Unicode-aware regular expression. If a language
generally uses spaces to separate words, the regex will work well.
'jieba' and 'mecab' are tokenizers for specific languages written
without spaces.
A tokenizer of None means we don't have a good way to segment the
language. We'll use the regex anyway, but the results will be pretty
bad.
'normal_form': 'NFC' or 'NFKC'
How "should" Unicode be normalized when comparing text in this
language? This is not a standard, it's just based on experience.
Many languages need NFKC normalization for text comparisons to work
properly, but in many European languages, NFKC normalization is
excessive and loses information.
'remove_marks': True or False
Determines whether marks and decorations, such as vowel points and
tatweels, should be removed. True for languages in abjad scripts.
'dotless_i': True or False
Is "ı" the lowercase of "I" in this language, as in Turkish?
'diacritics_under': 'cedillas', 'commas', or None
Should we convert any diacritics that are under the letters "s" and
"t" in this language? 'cedillas' means we should convert commas to
cedillas, and 'commas' means we should convert cedillas to commas.
'transliteration': 'sr-Latn', 'az-Latn', or None
Indicates a type of transliteration that we should use for normalizing
a multi-script language. 'sr-Latn' means to use Serbian romanization,
and 'az-Latn' means to use Azerbaijani romanization.
'lookup_transliteration': 'zh-Hans' or None
Indicates a lossy transliteration that should not be used for output,
but should be applied when looking up words in a list. 'zh-Hans' means
that we should convert Traditional Chinese characters to Simplified.
"""
# The input is probably a string, so parse it into a Language. If it's
# already a Language, it will pass through.
language = Language.get(language)
# Assume additional things about the language, such as what script it's in,
# using the "likely subtags" table
language_full = language.maximize()
# Start the `info` dictionary with default values, including the 'script'
# value that we now know from `language_full`.
info = {
'script': language_full.script,
'tokenizer': 'regex',
'normal_form': 'NFKC',
'remove_marks': False,
'dotless_i': False,
'diacritics_under': None,
'transliteration': None,
'lookup_transliteration': None
}
if _language_in_list(language, ['ja', 'ko']):
info['tokenizer'] = 'mecab'
elif _language_in_list(language, ['zh', 'yue']):
info['tokenizer'] = 'jieba'
elif info['script'] in SPACELESS_SCRIPTS:
info['tokenizer'] = None
# Cased alphabetic scripts get NFC normal form
if info['script'] in ['Latn', 'Grek', 'Cyrl']:
info['normal_form'] = 'NFC'
if info['script'] in ['Arab', 'Hebr']:
info['remove_marks'] = True
if _language_in_list(language, ['tr', 'az', 'kk']):
info['dotless_i'] = True
info['diacritics_under'] = 'cedillas'
elif _language_in_list(language, ['ro']):
info['diacritics_under'] = 'commas'
if _language_in_list(language, ['sr']):
info['transliteration'] = 'sr-Latn'
elif _language_in_list(language, ['az']):
info['transliteration'] = 'az-Latn'
if language.language == 'zh' and language.script != 'Hant':
info['lookup_transliteration'] = 'zh-Hans'
return info
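For a concrete sense of what these fields come out to, here is a small sketch (editorial, not part of the diff); the values follow directly from the rules in the listing above:

from wordfreq.language_info import get_language_info

# Turkish: Latin script, regex tokenization, Turkish casefolding rules
info = get_language_info('tr')
assert info['script'] == 'Latn'
assert info['normal_form'] == 'NFC'
assert info['dotless_i'] is True
assert info['diacritics_under'] == 'cedillas'

# Chinese: tokenized with Jieba, looked up in Simplified characters
info = get_language_info('zh')
assert info['tokenizer'] == 'jieba'
assert info['lookup_transliteration'] == 'zh-Hans'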

wordfreq/preprocess.py (new file, 265 lines)

@@ -0,0 +1,265 @@
import regex
import unicodedata
from .language_info import get_language_info
from .transliterate import transliterate
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
DIGIT_RE = regex.compile('\d')
MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
def preprocess_text(text, language):
"""
This function applies pre-processing steps that convert forms of words
considered equivalent into one standardized form.
As one straightforward step, it case-folds the text. For the purposes of
wordfreq and related tools, a capitalized word shouldn't have a different
frequency from its lowercase version.
The steps that are applied in order, only some of which apply to each
language, are:
- NFC or NFKC normalization, as needed for the language
- Transliteration of multi-script languages
- Abjad mark removal
- Case folding
- Fixing of diacritics
We'll describe these steps out of order, to start with the more obvious
steps.
Case folding
------------
The most common effect of this function is that it case-folds alphabetic
text to lowercase:
>>> preprocess_text('Word', 'en')
'word'
This is proper Unicode-aware case-folding, so it eliminates distinctions
in lowercase letters that would not appear in uppercase. This accounts for
the German ß and the Greek final sigma:
>>> preprocess_text('groß', 'de')
'gross'
>>> preprocess_text('λέξις', 'el')
'λέξισ'
In Turkish (and Azerbaijani), case-folding is different, because the
uppercase and lowercase I come in two variants, one with a dot and one
without. They are matched in a way that preserves the number of dots, which
the usual pair of "I" and "i" does not.
>>> preprocess_text('HAKKINDA İSTANBUL', 'tr')
'hakkında istanbul'
Fixing of diacritics
--------------------
While we're talking about Turkish: the Turkish alphabet contains letters
with cedillas attached to the bottom. In the case of "ş" and "ţ", these
letters are very similar to two Romanian letters, "ș" and "ț", which have
separate _commas_ below them.
(Did you know that a cedilla is not the same as a comma under a letter? I
didn't until I started dealing with text normalization. My keyboard layout
even inputs a letter with a cedilla when you hit Compose+comma.)
Because these letters look so similar, and because some fonts only include
one pair of letters and not the other, there are many cases where the
letters are confused with each other. Our preprocessing normalizes these
Turkish and Romanian letters to the letters each language prefers.
>>> preprocess_text('kișinin', 'tr') # comma to cedilla
'kişinin'
>>> preprocess_text('ACELAŞI', 'ro') # cedilla to comma
'același'
Unicode normalization
---------------------
Unicode text is NFC normalized in most languages, removing trivial
distinctions between strings that should be considered equivalent in all
cases:
>>> word = preprocess_text('natu\N{COMBINING DIAERESIS}rlich', 'de')
>>> word
'natürlich'
>>> '\N{LATIN SMALL LETTER U WITH DIAERESIS}' in word
True
NFC normalization is sufficient (and NFKC normalization is a bit too strong)
for many languages that are written in cased, alphabetic scripts.
Languages in other scripts tend to need stronger normalization to properly
compare text. So we use NFC normalization when the language's script is
Latin, Greek, or Cyrillic, and we use NFKC normalization for all other
languages.
Here's an example in Japanese, where preprocessing changes the width (and
the case) of a Latin letter that's used as part of a word:
>>> preprocess_text('Uターン', 'ja')
'uターン'
In Korean, NFKC normalization is important because it aligns two different
ways of encoding text -- as individual letters that are grouped together
into square characters, or as the entire syllables that those characters
represent:
>>> word = '\u1102\u1161\u11c0\u1106\u1161\u11af'
>>> word
'낱말'
>>> len(word)
6
>>> word = preprocess_text(word, 'ko')
>>> word
'낱말'
>>> len(word)
2
Abjad mark removal
------------------
There are many abjad languages, such as Arabic, Hebrew, Persian, and Urdu,
where words can be marked with vowel points but rarely are. In languages
that use abjad scripts, we remove all modifiers that are classified by
Unicode as "marks". We also remove an Arabic character called the tatweel,
which is used to visually lengthen a word.
>>> preprocess_text("كَلِمَة", 'ar')
'كلمة'
>>> preprocess_text("الحمــــــد", 'ar')
'الحمد'
Transliteration of multi-script languages
-----------------------------------------
Some languages are written in multiple scripts, and require special care.
These languages include Chinese, Serbian, and Azerbaijani.
In Serbian, there is a well-established mapping from Cyrillic letters to
Latin letters. We apply this mapping so that Serbian is always represented
in Latin letters.
>>> preprocess_text('схваташ', 'sr')
'shvataš'
The transliteration is more complete than it needs to be to cover just
Serbian, so that -- for example -- borrowings from Russian can be
transliterated, instead of coming out in a mixed script.
>>> preprocess_text('культуры', 'sr')
"kul'tury"
Azerbaijani (Azeri) has a similar transliteration step to Serbian,
and then the Latin-alphabet text is handled similarly to Turkish.
We don't transliterate Traditional to Simplified Chinese in this step.
There are some steps where we unify them internally: see chinese.py
for more information.
"""
# NFC or NFKC normalization, as needed for the language
info = get_language_info(language)
text = unicodedata.normalize(info['normal_form'], text)
# Transliteration of multi-script languages
if info['transliteration'] is not None:
text = transliterate(info['transliteration'], text)
# Abjad mark removal
if info['remove_marks']:
text = remove_marks(text)
# Case folding
if info['dotless_i']:
text = casefold_with_i_dots(text)
else:
text = text.casefold()
# Fixing of diacritics
if info['diacritics_under'] == 'commas':
text = cedillas_to_commas(text)
elif info['diacritics_under'] == 'cedillas':
text = commas_to_cedillas(text)
return text
def remove_marks(text):
"""
Remove decorations from words in abjad scripts:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify an
Arabic word.
"""
return MARK_RE.sub('', text)
def casefold_with_i_dots(text):
"""
Convert capital I's and capital dotted İ's to lowercase in the way
that's appropriate for Turkish and related languages, then case-fold
the rest of the letters.
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
return text.casefold()
def commas_to_cedillas(text):
"""
Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
preferred in Turkish.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER S WITH CEDILLA}'
).replace(
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER T WITH CEDILLA}'
)
def cedillas_to_commas(text):
"""
Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
preferred in Romanian.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH CEDILLA}',
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
).replace(
'\N{LATIN SMALL LETTER T WITH CEDILLA}',
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
)
def sub_zeroes(match):
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub('0', match.group(0))
def smash_numbers(text):
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(sub_zeroes, text)
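An editorial sketch of how these pieces behave, based on the doctests above and on MULTI_DIGIT_RE (a lone digit is left alone because the pattern requires at least two characters):

from wordfreq.preprocess import preprocess_text, smash_numbers

preprocess_text('HAKKINDA İSTANBUL', 'tr')   # 'hakkında istanbul'
preprocess_text('Uターン', 'ja')             # 'uターン'

smash_numbers('3')        # '3'     -- single digits are kept as-is
smash_numbers('3.14')     # '0.00'  -- two or more digits get zeroed
smash_numbers('24601')    # '00000'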

wordfreq/tokens.py

@@ -1,30 +1,22 @@
 import regex
 import unicodedata
+import logging
 import langcodes
-from .transliterate import serbian_cyrillic_to_latin
 
-mecab_tokenize = None
-jieba_tokenize = None
+from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .preprocess import preprocess_text, smash_numbers
 
-# See the documentation inside TOKEN_RE for why we have to handle these
-# scripts specially.
-SPACELESS_SCRIPTS = [
-    'Hiragana',
-    'Thai',   # Thai script
-    'Khmr',   # Khmer script
-    'Laoo',   # Lao script
-    'Mymr',   # Burmese script
-    'Tale',   # Tai Le script
-    'Talu',   # Tai Lü script
-    'Lana',   # Lanna script
-]
-
-ABJAD_LANGUAGES = {
-    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
-}
+# Placeholders for CJK functions that we'll import on demand
+_mecab_tokenize = None
+_jieba_tokenize = None
+_simplify_chinese = None
+
+logger = logging.getLogger(__name__)
 
 
 def _make_spaceless_expr():
-    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
+    scripts = sorted(SPACELESS_SCRIPTS)
+    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
     return ''.join(pieces)
@@ -116,10 +108,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
-DIGIT_RE = regex.compile('\d')
-MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+# Just identify punctuation, for cases where the tokenizer is separate
+PUNCT_RE = regex.compile(r"[\p{punct}]+")
 
 
 def simple_tokenize(text, include_punctuation=False):
@@ -162,197 +153,27 @@ def simple_tokenize(text, include_punctuation=False):
         for token in TOKEN_RE.findall(text)
     ]
 
-def tokenize_mecab_language(text, lang, include_punctuation=False):
-    """
-    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
-    """
-    global mecab_tokenize
-    if not (lang == 'ja' or lang == 'ko'):
-        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
-    if mecab_tokenize is None:
-        from wordfreq.mecab import mecab_tokenize
-    tokens = mecab_tokenize(text, lang)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
-
-
-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
-    """
-    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
-    """
-    global jieba_tokenize
-    if jieba_tokenize is None:
-        from wordfreq.chinese import jieba_tokenize
-    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
-
-
-def remove_marks(text):
-    """
-    Remove decorations from words in abjad scripts:
-
-    - Combining marks of class Mn, which tend to represent non-essential
-      vowel markings.
-    - Tatweels, horizontal segments that are used to extend or justify an
-      Arabic word.
-    """
-    return MARK_RE.sub('', text)
-
-
-def commas_to_cedillas(text):
-    """
-    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
-    preferred in Turkish.
-
-    Only the lowercase versions are replaced, because this assumes the
-    text has already been case-folded.
-    """
-    return text.replace(
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
-    ).replace(
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
-    )
-
-
-def cedillas_to_commas(text):
-    """
-    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
-    preferred in Romanian.
-
-    Only the lowercase versions are replaced, because this assumes the
-    text has already been case-folded.
-    """
-    return text.replace(
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
-    ).replace(
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
-    )
-
-
-def preprocess_turkish(text):
-    """
-    Modifies i's so that they case-fold correctly in Turkish, and modifies
-    'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return commas_to_cedillas(text.casefold())
-
-
-def preprocess_romanian(text):
-    """
-    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
-    """
-    return cedillas_to_commas(text.casefold())
-
-
-def preprocess_serbian(text):
-    """
-    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
-    (which is the unambiguous direction).
-    """
-    return serbian_cyrillic_to_latin(text)
-
-
-def sub_zeroes(match):
-    """
-    Given a regex match, return what it matched with digits replaced by
-    zeroes.
-    """
-    return DIGIT_RE.sub('0', match.group(0))
-
-
-def smash_numbers(text):
-    """
-    Replace sequences of multiple digits with zeroes, so we don't need to
-    distinguish the frequencies of thousands of numbers.
-    """
-    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
-
-
-def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
-             combine_numbers=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language. Strings that are looked up in wordfreq will be run through
     this function first, so that they can be expected to match the data.
 
-    Some of the processing steps are specific to one language, such as Chinese,
-    but what broadly happens to the text depends on what general writing system
-    the language uses, out of these categories:
-
-    - Alphabetic scripts: English, Spanish, Russian, etc.
-    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
-    - CJK scripts: Chinese, Japanese, Korean
-    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
-
-    The options `include_punctuation`, `external_wordlist`, and
-    `combine_numbers` are passed on to the appropriate tokenizer:
-
-    - `include_punctuation` preserves punctuation as tokens, instead of
-      removing it.
-    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
-      instead of wordfreq's wordlist.
-    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.
-
-    Alphabetic scripts
-    ------------------
-
-    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
-    European languages, which are relatively straightforward to tokenize.
-
-    Text in these scripts will be normalized to NFC form, then passed
-    through a regular expression that implements the Word Segmentation section
-    of Unicode Annex #29, and then case-folded to lowercase.
-
-    The effect is mostly to split the text on spaces and punctuation. There are
-    some subtleties involving apostrophes inside words, which the regex will
-    only split when they occur before a vowel. ("Hasn't" is one token, but
-    "l'enfant" is two.)
-
-    If the language is Turkish, the case-folding rules will take this into
-    account, so that capital I and İ map to ı and i respectively.
-
-    Abjad scripts
-    -------------
-
-    Languages in the Arabic or Hebrew scripts are written with optional vowel
-    marks, and sometimes other decorative markings and ligatures. In these
-    languages:
-
-    - The text will be NFKC-normalized, which is a stronger and lossier form
-      than NFC. Here its purpose is to reduce ligatures to simpler characters.
-    - Marks will be removed, as well as the Arabic tatweel (an extension of
-      a word that is used for justification or decoration).
-
-    After these steps, the text will go through the same process as the
-    alphabetic scripts above.
+    The text will be run through a number of pre-processing steps that vary
+    by language; see the docstring of `wordfreq.preprocess.preprocess_text`.
+
+    If `include_punctuation` is True, punctuation will be included as separate
+    tokens. Otherwise, punctuation will be omitted in the output.
 
     CJK scripts
     -----------
 
     In the CJK languages, word boundaries can't usually be identified by a
     regular expression. Instead, there needs to be some language-specific
-    handling.
-
-    - Chinese text first gets converted to a canonical representation we call
-      "Oversimplified Chinese", where all characters are replaced by their
-      Simplified Chinese form, no matter what, even when this misspells a word or
-      a name. This representation is then tokenized using the Jieba tokenizer,
-      trained on the list of Chinese words that can be looked up in wordfreq.
-
-    - Japanese and Korean will be NFKC-normalized, then tokenized using the
-      MeCab tokenizer, using dictionary files that are included in this
-      package.
+    handling. In Chinese, we use the Jieba tokenizer, with a custom word list
+    to match the words whose frequencies we can look up. In Japanese and
+    Korean, we use the MeCab tokenizer.
 
     The `external_wordlist` option only affects Chinese tokenization. If it's
     True, then wordfreq will not use its own Chinese wordlist for tokenization.
@@ -364,39 +185,64 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     If you end up seeing tokens that are entire phrases or sentences glued
     together, that probably means you passed in CJK text with the wrong
     language code.
-
-    Brahmic scripts and other languages
-    -----------------------------------
-
-    Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use. This includes the Brahmic scripts
-    used in Hindi, Tamil, and Telugu, for example.
-
-    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
-    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
-    does not support these languages yet. It will split on spaces and
-    punctuation, giving tokens that are far too long.
     """
-    # Reduce whatever language code was passed in to a normal form,
-    # containing just the language subtag.
-    lang = langcodes.get(lang).prefer_macrolanguage().language
-    if lang == 'ja' or lang == 'ko':
-        result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh' or lang == 'yue':
-        result = chinese_tokenize(text, include_punctuation, external_wordlist)
-    elif lang == 'tr':
-        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
-    elif lang == 'ro':
-        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr':
-        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
-    elif lang in ABJAD_LANGUAGES:
-        text = remove_marks(unicodedata.normalize('NFKC', text))
-        result = simple_tokenize(text, include_punctuation)
-    else:
-        result = simple_tokenize(text, include_punctuation)
-
-    if combine_numbers:
-        result = [smash_numbers(token) for token in result]
-
-    return result
+    # Use globals to load CJK tokenizers on demand, so that we can still run
+    # in environments that lack the CJK dependencies
+    global _mecab_tokenize, _jieba_tokenize
+
+    language = langcodes.get(lang)
+    info = get_language_info(language)
+    text = preprocess_text(text, language)
+
+    if info['tokenizer'] == 'mecab':
+        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
+        # Get just the language code out of the Language object, so we can
+        # use it to select a MeCab dictionary
+        tokens = _mecab_tokenize(text, language.language)
+        if not include_punctuation:
+            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
+    elif info['tokenizer'] == 'jieba':
+        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
+        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
+        if not include_punctuation:
+            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
+    else:
+        # This is the default case where we use the regex tokenizer. First
+        # let's complain a bit if we ended up here because we don't have an
+        # appropriate tokenizer.
+        if info['tokenizer'] != 'regex':
+            logger.warning(
+                "The language '{}' is in the '{}' script, which we don't "
+                "have a tokenizer for. The results will be bad."
+                .format(lang, info['script'])
+            )
+        tokens = simple_tokenize(text, include_punctuation=include_punctuation)
+
+    return tokens
+
+
+def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+    """
+    Get a list of tokens for this text, with largely the same results and
+    options as `tokenize`, but aggressively normalize some text in a lossy way
+    that's good for counting word frequencies.
+
+    In particular:
+
+    - If a token has 2 adjacent digits, all its digits will be replaced with
+      the digit '0', so that frequencies for numbers don't have to be counted
+      separately. This is similar to word2vec, which replaces them with '#'.
+
+    - In Chinese, unless Traditional Chinese is specifically requested using
+      'zh-Hant', all characters will be converted to Simplified Chinese.
+    """
+    global _simplify_chinese
+
+    info = get_language_info(lang)
+    tokens = tokenize(text, lang, include_punctuation, external_wordlist)
+
+    if info['lookup_transliteration'] == 'zh-Hans':
+        from wordfreq.chinese import simplify_chinese as _simplify_chinese
+        tokens = [_simplify_chinese(token) for token in tokens]
+
+    return [smash_numbers(token) for token in tokens]
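To illustrate how the two entry points differ (an editorial sketch, not part of the diff; the outputs are the ones asserted in the test hunk at the top of this commit):

from wordfreq.tokens import tokenize, lossy_tokenize

# tokenize() preprocesses and splits, but keeps digits as they are
tokenize('"715 - CRΣΣKS" by Bon Iver', 'en')
# ['715', 'crσσks', 'by', 'bon', 'iver']

# lossy_tokenize() additionally smashes multi-digit numbers to zeroes,
# which is what _word_frequency() now uses for lookups
lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en')
# ['000', 'crσσks', 'by', 'bon', 'iver']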

wordfreq/transliterate.py

@@ -1,6 +1,8 @@
 # This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
-# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
-SR_CYRL_TO_LATN_DICT = {
+# from the 'cyrtranslit' module. We originally had to reimplement it because
+# 'cyrtranslit' didn't work in Python 3; now it does, but we've made the table
+# more robust than the one in cyrtranslit.
+SR_LATN_TABLE = {
     ord('А'): 'A', ord('а'): 'a',
     ord('Б'): 'B', ord('б'): 'b',
     ord('В'): 'V', ord('в'): 'v',
@@ -55,7 +57,7 @@ SR_CYRL_TO_LATN_DICT = {
     # Ukrainian letters
     ord('Є'): 'Je', ord('є'): 'je',
     ord('І'): 'I', ord('і'): 'i',
-    ord('Ї'): 'Ji', ord('ї'): 'ji',
+    ord('Ї'): 'Ï', ord('ї'): 'ï',
     ord('Ґ'): 'G', ord('ґ'): 'g',
 
     # Macedonian letters
@@ -64,7 +66,43 @@ SR_CYRL_TO_LATN_DICT = {
     ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
 }
 
+AZ_LATN_TABLE = SR_LATN_TABLE.copy()
+AZ_LATN_TABLE.update({
+    # Distinct Azerbaijani letters
+    ord('Ҹ'): 'C', ord('ҹ'): 'c',
+    ord('Ә'): 'Ə', ord('ә'): 'ə',
+    ord('Ғ'): 'Ğ', ord('ғ'): 'ğ',
+    ord('Һ'): 'H', ord('һ'): 'h',
+    ord('Ө'): 'Ö', ord('ө'): 'ö',
+    ord('Ҝ'): 'G', ord('ҝ'): 'g',
+    ord('Ү'): 'Ü', ord('ү'): 'ü',
+
+    # Azerbaijani letters with different transliterations
+    ord('Ч'): 'Ç', ord('ч'): 'ç',
+    ord('Х'): 'X', ord('х'): 'x',
+    ord('Ы'): 'I', ord('ы'): 'ı',
+    ord('И'): 'İ', ord('и'): 'i',
+    ord('Ж'): 'J', ord('ж'): 'j',
+    ord('Ј'): 'Y', ord('ј'): 'y',
+    ord('Г'): 'Q', ord('г'): 'q',
+    ord('Ш'): 'Ş', ord('ш'): 'ş',
+})
+
 
-def serbian_cyrillic_to_latin(text):
-    return text.translate(SR_CYRL_TO_LATN_DICT)
+def transliterate(table, text):
+    """
+    Transliterate text according to one of the tables above.
+
+    `table` chooses the table. It looks like a language code but comes from a
+    very restricted set:
+
+    - 'sr-Latn' means to convert Serbian, which may be in Cyrillic, into the
+      Latin alphabet.
+    - 'az-Latn' means the same for Azerbaijani Cyrillic to Latin.
+    """
+    if table == 'sr-Latn':
+        return text.translate(SR_LATN_TABLE)
+    elif table == 'az-Latn':
+        return text.translate(AZ_LATN_TABLE)
+    else:
+        raise ValueError("Unknown transliteration table: {!r}".format(table))
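A quick usage sketch for the new function (editorial, not part of the diff; the Serbian output matches the doctest in the preprocess.py docstring above):

from wordfreq.transliterate import transliterate

transliterate('sr-Latn', 'схваташ')   # 'shvataš'
transliterate('az-Latn', 'Бакы')      # 'Bakı'
transliterate('xx-Latn', 'text')      # raises ValueError: unknown table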