Separate preprocessing from tokenization

This commit is contained in:
Robyn Speer 2018-03-08 16:25:45 -05:00
parent 72646f16a1
commit 5ab5d2ea55
7 changed files with 555 additions and 252 deletions

View File

@ -1,6 +1,6 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises
@ -164,13 +164,13 @@ def test_casefolding():
def test_number_smashing():
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
['715', 'crσσks', 'by', 'bon', 'iver'])
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
['000', 'crσσks', 'by', 'bon', 'iver'])
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
eq_(lossy_tokenize('1', 'en'), ['1'])
eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
eq_(lossy_tokenize('24601', 'en'), ['00000'])
eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
@ -231,6 +231,7 @@ def test_ideographic_fallback():
['ひらがな', 'カタカナ', 'romaji']
)
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.

View File

@ -83,5 +83,3 @@ def test_alternate_codes():
# Separate codes for Mandarin and Cantonese
eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
eq_(tokenize('谢谢谢谢', 'yue'), tokens)

View File

@ -1,4 +1,3 @@
from wordfreq.tokens import tokenize, simple_tokenize
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
@ -10,6 +9,9 @@ import random
import logging
import math
from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
logger = logging.getLogger(__name__)
@ -30,8 +32,9 @@ INFERRED_SPACE_LANGUAGES = {'zh'}
# frequency.)
INFERRED_SPACE_FACTOR = 10.0
# simple_tokenize is imported so that other things can import it from here.
# Suppress the pyflakes warning.
# tokenize and simple_tokenize are imported so that other things can import
# them from here. Suppress the pyflakes warning.
tokenize = tokenize
simple_tokenize = simple_tokenize
@ -215,8 +218,9 @@ def iter_wordlist(lang, wordlist='combined'):
# it takes to look up frequencies from scratch, so something faster is needed.
_wf_cache = {}
def _word_frequency(word, lang, wordlist, minimum):
tokens = tokenize(word, lang, combine_numbers=True)
tokens = lossy_tokenize(word, lang)
if not tokens:
return minimum
@ -234,7 +238,10 @@ def _word_frequency(word, lang, wordlist, minimum):
freq = 1.0 / one_over_result
if lang in INFERRED_SPACE_LANGUAGES:
if get_language_info(lang)['tokenizer'] == 'jieba':
# If we used the Jieba tokenizer, we could tokenize anything to match
# our wordlist, even nonsense. To counteract this, we multiply by a
# probability for each word break that was inferred.
freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
return max(freq, minimum)
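As a rough illustration of that penalty (the starting frequency here is hypothetical, just to show the arithmetic): a lookup that Jieba split into three tokens implies two inferred word breaks, so

freq = 1e-6
freq /= INFERRED_SPACE_FACTOR ** (3 - 1)   # 10.0 ** 2, leaving 1e-8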

wordfreq/language_info.py (new file, 148 lines added)

@ -0,0 +1,148 @@
from langcodes import Language, best_match
# Text in scripts written without spaces has to be handled specially in our
# tokenization regex (see TOKEN_RE in tokens.py). Also, when one of these is
# the script of the language we're analyzing, then we need to either have
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
# Han ideographs are spaceless, but they don't need to appear in this list
# because they have their own cases in get_language_info and TOKEN_RE.
'Hiragana',
# We omit katakana because Unicode regular expressions can already
# tokenize sequences of katakana, and omitting it here means we can also
# recognize a switch between hiragana and katakana as a token boundary.
'Thai', # Thai script
'Khmr', # Khmer script
'Laoo', # Lao script
'Mymr', # Burmese script
'Tale', # Tai Le script
'Talu', # Tai Lü script
'Lana', # Lanna script
]
def _language_in_list(language, targets, min_score=80):
"""
A helper function to determine whether this language matches one of the
target languages, with a match score above a certain threshold.
The languages can be given as strings (language tags) or as Language
objects. `targets` can be any iterable of such languages.
"""
matched = best_match(language, targets, min_score=min_score)
return matched[1] > 0
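As a rough sketch of how this helper behaves (assuming langcodes scores a regional variant well above the threshold, and unrelated languages near zero):

>>> _language_in_list(Language.get('en-GB'), ['en'])
True
>>> _language_in_list(Language.get('ja'), ['zh', 'yue'])
False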
def get_language_info(language):
"""
Looks up the things we need to know about how to handle text in a given
language. This will return a dictionary with the following fields:
'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...
Indicates the script that tokens in this language should be in,
_after_ our preprocessing. The script for 'zh' is 'Hans', for example,
because even if the input is in Traditional Chinese ('Hant'), we
convert it to Simplified.
'tokenizer': 'regex', 'jieba', 'mecab', or None
Indicates the best way we know to separate tokens in the language.
'regex' is what will be used for most languages, meaning that we can
segment the text with a Unicode-aware regular expression. If a language
generally uses spaces to separate words, the regex will work well.
'jieba' and 'mecab' are tokenizers for specific languages written
without spaces.
A tokenizer of None means we don't have a good way to segment the
language. We'll use the regex anyway, but the results will be pretty
bad.
'normal_form': 'NFC' or 'NFKC'
How "should" Unicode be normalized when comparing text in this
language? This is not a standard; it's just based on experience.
Many languages need NFKC normalization for text comparisons to work
properly, but in many European languages, NFKC normalization is
excessive and loses information.
'remove_marks': True or False
Determines whether marks and decorations, such as vowel points and
tatweels, should be removed. True for languages in abjad scripts.
'dotless_i': True or False
Is "ı" the lowercase of "I" in this language, as in Turkish?
'diacritics_under': 'cedillas', 'commas', or None
Should we convert any diacritics that are under the letters "s" and
"t" in this language? 'cedillas' means we should convert commas to
cedillas, and 'commas' means we should convert cedillas to commas.
'transliteration': 'sr-Latn', 'az-Latn', or None
Indicates a type of transliteration that we should use for normalizing
a multi-script language. 'sr-Latn' means to use Serbian romanization,
and 'az-Latn' means to use Azerbaijani romanization.
'lookup_transliteration': 'zh-Hans' or None
Indicates a lossy transliteration that should not be used for output,
but should be applied when looking up words in a list. 'zh-Hans' means
that we should convert Traditional Chinese characters to Simplified.
"""
# The input is probably a string, so parse it into a Language. If it's
# already a Language, it will pass through.
language = Language.get(language)
# Assume additional things about the language, such as what script it's in,
# using the "likely subtags" table
language_full = language.maximize()
# Start the `info` dictionary with default values, including the 'script'
# value that we now know from `language_full`.
info = {
'script': language_full.script,
'tokenizer': 'regex',
'normal_form': 'NFKC',
'remove_marks': False,
'dotless_i': False,
'diacritics_under': None,
'transliteration': None,
'lookup_transliteration': None
}
if _language_in_list(language, ['ja', 'ko']):
info['tokenizer'] = 'mecab'
elif _language_in_list(language, ['zh', 'yue']):
info['tokenizer'] = 'jieba'
elif info['script'] in SPACELESS_SCRIPTS:
info['tokenizer'] = None
# Cased alphabetic scripts get NFC normal form
if info['script'] in ['Latn', 'Grek', 'Cyrl']:
info['normal_form'] = 'NFC'
if info['script'] in ['Arab', 'Hebr']:
info['remove_marks'] = True
if _language_in_list(language, ['tr', 'az', 'kk']):
info['dotless_i'] = True
info['diacritics_under'] = 'cedillas'
elif _language_in_list(language, ['ro']):
info['diacritics_under'] = 'commas'
if _language_in_list(language, ['sr']):
info['transliteration'] = 'sr-Latn'
elif _language_in_list(language, ['az']):
info['transliteration'] = 'az-Latn'
if language.language == 'zh' and language.script != 'Hant':
info['lookup_transliteration'] = 'zh-Hans'
return info
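Following the branches above, Turkish would come out roughly like this:

>>> info = get_language_info('tr')
>>> info['script'], info['tokenizer'], info['normal_form']
('Latn', 'regex', 'NFC')
>>> info['dotless_i'], info['diacritics_under']
(True, 'cedillas')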

wordfreq/preprocess.py (new file, 265 lines added)

@ -0,0 +1,265 @@
import regex
import unicodedata
from .language_info import get_language_info
from .transliterate import transliterate
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
DIGIT_RE = regex.compile(r'\d')
MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')
def preprocess_text(text, language):
"""
This function applies pre-processing steps that convert forms of words
considered equivalent into one standardized form.
As one straightforward step, it case-folds the text. For the purposes of
wordfreq and related tools, a capitalized word shouldn't have a different
frequency from its lowercase version.
The steps that are applied in order, only some of which apply to each
language, are:
- NFC or NFKC normalization, as needed for the language
- Transliteration of multi-script languages
- Abjad mark removal
- Case folding
- Fixing of diacritics
We'll describe these steps out of order, to start with the more obvious
steps.
Case folding
------------
The most common effect of this function is that it case-folds alphabetic
text to lowercase:
>>> preprocess_text('Word', 'en')
'word'
This is proper Unicode-aware case-folding, so it eliminates distinctions
in lowercase letters that would not appear in uppercase. This accounts for
the German ß and the Greek final sigma:
>>> preprocess_text('groß', 'de')
'gross'
>>> preprocess_text('λέξις', 'el')
'λέξισ'
In Turkish (and Azerbaijani), case-folding is different, because the
uppercase and lowercase I come in two variants, one with a dot and one
without. They are matched in a way that preserves the number of dots, which
the usual pairing of "I" and "i" does not.
>>> preprocess_text('HAKKINDA İSTANBUL', 'tr')
'hakkında istanbul'
Fixing of diacritics
--------------------
While we're talking about Turkish: the Turkish alphabet contains letters
with cedillas attached to the bottom. In the case of "ş" and "ţ", these
letters are very similar to two Romanian letters, "ș" and "ț", which have
separate _commas_ below them.
(Did you know that a cedilla is not the same as a comma under a letter? I
didn't until I started dealing with text normalization. My keyboard layout
even inputs a letter with a cedilla when you hit Compose+comma.)
Because these letters look so similar, and because some fonts only include
one pair of letters and not the other, there are many cases where the
letters are confused with each other. Our preprocessing normalizes these
Turkish and Romanian letters to the letters each language prefers.
>>> preprocess_text('kișinin', 'tr') # comma to cedilla
'kişinin'
>>> preprocess_text('ACELAŞI', 'ro') # cedilla to comma
'același'
Unicode normalization
---------------------
Unicode text is NFC normalized in most languages, removing trivial
distinctions between strings that should be considered equivalent in all
cases:
>>> word = preprocess_text('natu\N{COMBINING DIAERESIS}rlich', 'de')
>>> word
'natürlich'
>>> '\N{LATIN SMALL LETTER U WITH DIAERESIS}' in word
True
NFC normalization is sufficient (and NFKC normalization is a bit too strong)
for many languages that are written in cased, alphabetic scripts.
Languages in other scripts tend to need stronger normalization to properly
compare text. So we use NFC normalization when the language's script is
Latin, Greek, or Cyrillic, and we use NFKC normalization for all other
languages.
Here's an example in Japanese, where preprocessing changes the width (and
the case) of a Latin letter that's used as part of a word:
>>> preprocess_text('Uターン', 'ja')
'uターン'
In Korean, NFKC normalization is important because it aligns two different
ways of encoding text -- as individual letters that are grouped together
into square characters, or as the entire syllables that those characters
represent:
>>> word = '\u1102\u1161\u11c0\u1106\u1161\u11af'
>>> word
'낱말'
>>> len(word)
6
>>> word = preprocess_text(word, 'ko')
>>> word
'낱말'
>>> len(word)
2
Abjad mark removal
------------------
There are many abjad languages, such as Arabic, Hebrew, Persian, and Urdu,
where words can be marked with vowel points but rarely are. In languages
that use abjad scripts, we remove all modifiers that are classified by
Unicode as "marks". We also remove an Arabic character called the tatweel,
which is used to visually lengthen a word.
>>> preprocess_text("كَلِمَة", 'ar')
'كلمة'
>>> preprocess_text("الحمــــــد", 'ar')
'الحمد'
Transliteration of multi-script languages
-----------------------------------------
Some languages are written in multiple scripts, and require special care.
These languages include Chinese, Serbian, and Azerbaijani.
In Serbian, there is a well-established mapping from Cyrillic letters to
Latin letters. We apply this mapping so that Serbian is always represented
in Latin letters.
>>> preprocess_text('схваташ', 'sr')
'shvataš'
The transliteration is more complete than it needs to be to cover just
Serbian, so that -- for example -- borrowings from Russian can be
transliterated, instead of coming out in a mixed script.
>>> preprocess_text('культуры', 'sr')
"kul'tury"
Azerbaijani (Azeri) has a similar transliteration step to Serbian,
and then the Latin-alphabet text is handled similarly to Turkish.
We don't transliterate Traditional to Simplified Chinese in this step.
There are some steps where we unify them internally: see chinese.py
for more information.
"""
# NFC or NFKC normalization, as needed for the language
info = get_language_info(language)
text = unicodedata.normalize(info['normal_form'], text)
# Transliteration of multi-script languages
if info['transliteration'] is not None:
text = transliterate(info['transliteration'], text)
# Abjad mark removal
if info['remove_marks']:
text = remove_marks(text)
# Case folding
if info['dotless_i']:
text = casefold_with_i_dots(text)
else:
text = text.casefold()
# Fixing of diacritics
if info['diacritics_under'] == 'commas':
text = cedillas_to_commas(text)
elif info['diacritics_under'] == 'cedillas':
text = commas_to_cedillas(text)
return text
def remove_marks(text):
"""
Remove decorations from words in abjad scripts:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify an
Arabic word.
"""
return MARK_RE.sub('', text)
def casefold_with_i_dots(text):
"""
Convert capital I's and capital dotted İ's to lowercase in the way
that's appropriate for Turkish and related languages, then case-fold
the rest of the letters.
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
return text.casefold()
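For the Turkish example in the module docstring, this helper alone already produces the final result, since no transliteration applies and the input contains no comma-below letters:

>>> casefold_with_i_dots('HAKKINDA İSTANBUL')
'hakkında istanbul'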
def commas_to_cedillas(text):
"""
Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
preferred in Turkish.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER S WITH CEDILLA}'
).replace(
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER T WITH CEDILLA}'
)
def cedillas_to_commas(text):
"""
Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
preferred in Romanian.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH CEDILLA}',
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
).replace(
'\N{LATIN SMALL LETTER T WITH CEDILLA}',
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
)
def sub_zeroes(match):
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub('0', match.group(0))
def smash_numbers(text):
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(sub_zeroes, text)
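These two functions behave the way the lossy_tokenize tests above expect: single digits are left alone, while any run of two or more digits (possibly containing '.' or ',') is zeroed out:

>>> smash_numbers('24601')
'00000'
>>> smash_numbers('3.14')
'0.00'
>>> smash_numbers('1')
'1'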

View File

@ -1,30 +1,22 @@
import regex
import unicodedata
import logging
import langcodes
from .transliterate import serbian_cyrillic_to_latin
mecab_tokenize = None
jieba_tokenize = None
from .language_info import get_language_info, SPACELESS_SCRIPTS
from .preprocess import preprocess_text, smash_numbers
# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
'Hiragana',
'Thai', # Thai script
'Khmr', # Khmer script
'Laoo', # Lao script
'Mymr', # Burmese script
'Tale', # Tai Le script
'Talu', # Tai Lü script
'Lana', # Lanna script
]
# Placeholders for CJK functions that we'll import on demand
_mecab_tokenize = None
_jieba_tokenize = None
_simplify_chinese = None
logger = logging.getLogger(__name__)
ABJAD_LANGUAGES = {
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
}
def _make_spaceless_expr():
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
scripts = sorted(SPACELESS_SCRIPTS)
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
return ''.join(pieces)
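With the sorted script list above, the joined expression comes out roughly as follows, ready to be substituted for the <SPACELESS> placeholder in the token regexes:

\p{IsIdeo}\p{Script=Hiragana}\p{Script=Khmr}\p{Script=Lana}\p{Script=Laoo}\p{Script=Mymr}\p{Script=Tale}\p{Script=Talu}\p{Script=Thai}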
@ -116,10 +108,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
\w'
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
DIGIT_RE = regex.compile('\d')
MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
# Just identify punctuation, for cases where the tokenizer is separate
PUNCT_RE = regex.compile(r"[\p{punct}]+")
def simple_tokenize(text, include_punctuation=False):
@ -162,197 +153,27 @@ def simple_tokenize(text, include_punctuation=False):
for token in TOKEN_RE.findall(text)
]
def tokenize_mecab_language(text, lang, include_punctuation=False):
"""
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
"""
global mecab_tokenize
if not (lang == 'ja' or lang == 'ko'):
raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
tokens = mecab_tokenize(text, lang)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
"""
Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
"""
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_marks(text):
"""
Remove decorations from words in abjad scripts:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify an
Arabic word.
"""
return MARK_RE.sub('', text)
def commas_to_cedillas(text):
"""
Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
preferred in Turkish.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER S WITH CEDILLA}'
).replace(
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER T WITH CEDILLA}'
)
def cedillas_to_commas(text):
"""
Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
preferred in Romanian.
Only the lowercase versions are replaced, because this assumes the
text has already been case-folded.
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH CEDILLA}',
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
).replace(
'\N{LATIN SMALL LETTER T WITH CEDILLA}',
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
)
def preprocess_turkish(text):
"""
Modifies i's so that they case-fold correctly in Turkish, and modifies
'comma-below' characters to use cedillas.
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
return commas_to_cedillas(text.casefold())
def preprocess_romanian(text):
"""
Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
"""
return cedillas_to_commas(text.casefold())
def preprocess_serbian(text):
"""
Serbian is written in two scripts, so transliterate from Cyrillic to Latin
(which is the unambiguous direction).
"""
return serbian_cyrillic_to_latin(text)
def sub_zeroes(match):
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub('0', match.group(0))
def smash_numbers(text):
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(sub_zeroes, text)
def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
combine_numbers=False):
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
"""
Tokenize this text in a way that's relatively simple but appropriate for
the language. Strings that are looked up in wordfreq will be run through
this function first, so that they can be expected to match the data.
Some of the processing steps are specific to one language, such as Chinese,
but what broadly happens to the text depends on what general writing system
the language uses, out of these categories:
- Alphabetic scripts: English, Spanish, Russian, etc.
- Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
- CJK scripts: Chinese, Japanese, Korean
- Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
The options `include_punctuation`, `external_wordlist`, and
`combine_numbers` are passed on to the appropriate tokenizer:
- `include_punctuation` preserves punctuation as tokens, instead of
removing it.
- `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
instead of wordfreq's wordlist.
- `combine_numbers` replaces multi-digit numbers with strings of zeroes.
Alphabetic scripts
------------------
The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
European languages, which are relatively straightforward to tokenize.
Text in these scripts will be normalized to NFC form, then passed
through a regular expression that implements the Word Segmentation section
of Unicode Annex #29, and then case-folded to lowercase.
The effect is mostly to split the text on spaces and punctuation. There are
some subtleties involving apostrophes inside words, which the regex will
only split when they occur before a vowel. ("Hasn't" is one token, but
"l'enfant" is two.)
If the language is Turkish, the case-folding rules will take this into
account, so that capital I and İ map to ı and i respectively.
Abjad scripts
-------------
Languages in the Arabic or Hebrew scripts are written with optional vowel
marks, and sometimes other decorative markings and ligatures. In these
languages:
- The text will be NFKC-normalized, which is a stronger and lossier form
than NFC. Here its purpose is to reduce ligatures to simpler characters.
- Marks will be removed, as well as the Arabic tatweel (an extension of
a word that is used for justification or decoration).
After these steps, the text will go through the same process as the
alphabetic scripts above.
The text will be run through a number of pre-processing steps that vary
by language; see the docstring of `wordfreq.preprocess.preprocess_text`.
If `include_punctuation` is True, punctuation will be included as separate
tokens. Otherwise, punctuation will be omitted in the output.
CJK scripts
-----------
In the CJK languages, word boundaries can't usually be identified by a
regular expression. Instead, there needs to be some language-specific
handling.
- Chinese text first gets converted to a canonical representation we call
"Oversimplified Chinese", where all characters are replaced by their
Simplified Chinese form, no matter what, even when this misspells a word or
a name. This representation is then tokenized using the Jieba tokenizer,
trained on the list of Chinese words that can be looked up in wordfreq.
- Japanese and Korean will be NFKC-normalized, then tokenized using the
MeCab tokenizer, using dictionary files that are included in this
package.
handling. In Chinese, we use the Jieba tokenizer, with a custom word list
to match the words whose frequencies we can look up. In Japanese and
Korean, we use the MeCab tokenizer.
The `external_wordlist` option only affects Chinese tokenization. If it's
True, then wordfreq will not use its own Chinese wordlist for tokenization.
@ -364,39 +185,64 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
If you end up seeing tokens that are entire phrases or sentences glued
together, that probably means you passed in CJK text with the wrong
language code.
Brahmic scripts and other languages
-----------------------------------
Any kind of language not previously mentioned will just go through the same
tokenizer that alphabetic languages use. This includes the Brahmic scripts
used in Hindi, Tamil, and Telugu, for example.
Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
does not support these languages yet. It will split on spaces and
punctuation, giving tokens that are far too long.
"""
# Reduce whatever language code was passed in to a normal form,
# containing just the language subtag.
lang = langcodes.get(lang).prefer_macrolanguage().language
if lang == 'ja' or lang == 'ko':
result = tokenize_mecab_language(text, lang, include_punctuation)
elif lang == 'zh' or lang == 'yue':
result = chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr':
result = simple_tokenize(preprocess_turkish(text), include_punctuation)
elif lang == 'ro':
result = simple_tokenize(preprocess_romanian(text), include_punctuation)
elif lang == 'sr':
result = simple_tokenize(preprocess_serbian(text), include_punctuation)
elif lang in ABJAD_LANGUAGES:
text = remove_marks(unicodedata.normalize('NFKC', text))
result = simple_tokenize(text, include_punctuation)
else:
result = simple_tokenize(text, include_punctuation)
# Use globals to load CJK tokenizers on demand, so that we can still run
# in environments that lack the CJK dependencies
global _mecab_tokenize, _jieba_tokenize
if combine_numbers:
result = [smash_numbers(token) for token in result]
return result
language = langcodes.get(lang)
info = get_language_info(language)
text = preprocess_text(text, language)
if info['tokenizer'] == 'mecab':
from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
# Get just the language code out of the Language object, so we can
# use it to select a MeCab dictionary
tokens = _mecab_tokenize(text, language.language)
if not include_punctuation:
tokens = [token for token in tokens if not PUNCT_RE.match(token)]
elif info['tokenizer'] == 'jieba':
from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
if not include_punctuation:
tokens = [token for token in tokens if not PUNCT_RE.match(token)]
else:
# This is the default case where we use the regex tokenizer. First
# let's complain a bit if we ended up here because we don't have an
# appropriate tokenizer.
if info['tokenizer'] != 'regex':
logger.warning(
"The language '{}' is in the '{}' script, which we don't "
"have a tokenizer for. The results will be bad."
.format(lang, info['script'])
)
tokens = simple_tokenize(text, include_punctuation=include_punctuation)
return tokens
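A rough sketch of the default regex path, showing the apostrophe behavior the docstring describes (expected values, assuming the rule applies as stated):

>>> tokenize("Hasn't", 'en')
["hasn't"]
>>> tokenize("l'enfant", 'fr')
["l'", 'enfant']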
def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
"""
Get a list of tokens for this text, with largely the same results and
options as `tokenize`, but aggressively normalize some text in a lossy way
that's good for counting word frequencies.
In particular:
- If a token has 2 adjacent digits, all its digits will be replaced with
the digit '0', so that frequencies for numbers don't have to be counted
separately. This is similar to word2vec, which replaces them with '#'.
- In Chinese, unless Traditional Chinese is specifically requested using
'zh-Hant', all characters will be converted to Simplified Chinese.
"""
global _simplify_chinese
info = get_language_info(lang)
tokens = tokenize(text, lang, include_punctuation, external_wordlist)
if info['lookup_transliteration'] == 'zh-Hans':
from wordfreq.chinese import simplify_chinese as _simplify_chinese
tokens = [_simplify_chinese(token) for token in tokens]
return [smash_numbers(token) for token in tokens]
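Mirroring the tests at the top of this commit:

>>> lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en')
['000', 'crσσks', 'by', 'bon', 'iver']
>>> lossy_tokenize('3.14', 'en')
['0.00']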

View File

@ -1,6 +1,8 @@
# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
SR_CYRL_TO_LATN_DICT = {
# from the 'cyrtranslit' module. We originally had to reimplement it because
# 'cyrtranslit' didn't work in Python 3; now it does, but we've made the table
# more robust than the one in cyrtranslit.
SR_LATN_TABLE = {
ord('А'): 'A', ord('а'): 'a',
ord('Б'): 'B', ord('б'): 'b',
ord('В'): 'V', ord('в'): 'v',
@ -55,7 +57,7 @@ SR_CYRL_TO_LATN_DICT = {
# Ukrainian letters
ord('Є'): 'Je', ord('є'): 'je',
ord('І'): 'I', ord('і'): 'i',
ord('Ї'): 'Ji', ord('ї'): 'ji',
ord('Ї'): 'Ï', ord('ї'): 'ï',
ord('Ґ'): 'G', ord('ґ'): 'g',
# Macedonian letters
@ -64,7 +66,43 @@ SR_CYRL_TO_LATN_DICT = {
ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
}
AZ_LATN_TABLE = SR_LATN_TABLE.copy()
AZ_LATN_TABLE.update({
# Distinct Azerbaijani letters
ord('Ҹ'): 'C', ord('ҹ'): 'c',
ord('Ә'): 'Ə', ord('ә'): 'ə',
ord('Ғ'): 'Ğ', ord('ғ'): 'ğ',
ord('Һ'): 'H', ord('һ'): 'h',
ord('Ө'): 'Ö', ord('ө'): 'ö',
ord('Ҝ'): 'G', ord('ҝ'): 'g',
ord('Ү'): 'Ü', ord('ү'): 'ü',
# Azerbaijani letters with different transliterations
ord('Ч'): 'Ç', ord('ч'): 'ç',
ord('Х'): 'X', ord('х'): 'x',
ord('Ы'): 'I', ord('ы'): 'ı',
ord('И'): 'İ', ord('и'): 'i',
ord('Ж'): 'J', ord('ж'): 'j',
ord('Ј'): 'Y', ord('ј'): 'y',
ord('Г'): 'Q', ord('г'): 'q',
ord('Ш'): 'Ş', ord('ш'): 'ş',
})
def serbian_cyrillic_to_latin(text):
return text.translate(SR_CYRL_TO_LATN_DICT)
def transliterate(table, text):
"""
Transliterate text according to one of the tables above.
`table` chooses the table. It looks like a language code but comes from a
very restricted set:
- 'sr-Latn' means to convert Serbian, which may be in Cyrillic, into the
Latin alphabet.
- 'az-Latn' means the same for Azerbaijani: Cyrillic text is converted to the Latin alphabet.
"""
if table == 'sr-Latn':
return text.translate(SR_LATN_TABLE)
elif table == 'az-Latn':
return text.translate(AZ_LATN_TABLE)
else:
raise ValueError("Unknown transliteration table: {!r}".format(table))
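For example, consistent with the Serbian doctest in preprocess.py:

>>> transliterate('sr-Latn', 'схваташ')
'shvataš'

Passing any other table name raises ValueError.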