wordfreq/wordfreq/tokens.py
import regex
import unicodedata


mecab_tokenize = None
jieba_tokenize = None

# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
    'Hiragana',
    'Thai',  # Thai script
    'Khmr',  # Khmer script
    'Laoo',  # Lao script
    'Mymr',  # Burmese script
    'Tale',  # Tai Le script
    'Talu',  # Tai Lü script
    'Lana',  # Lanna script
]

ABJAD_LANGUAGES = {
    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
}


def _make_spaceless_expr():
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
    return ''.join(pieces)


SPACELESS_EXPR = _make_spaceless_expr()
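
# For reference, the expression built above is just a concatenation of
# Unicode property escapes, roughly:
#
#     \p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}...\p{Script=Lana}
#
# It is dropped into a character class via the <SPACELESS> placeholder in
# the token expressions below.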

TOKEN_RE = regex.compile(r"""
    # Case 1: a special case for non-spaced languages
    # -----------------------------------------------

    # Some scripts are written without spaces, and the Unicode algorithm
    # seems to overreact and insert word breaks between all their letters.
    # When we see sequences of characters in these scripts, we make sure not
    # to break them up. Such scripts include Han ideographs (\p{IsIdeo}),
    # hiragana (\p{Script=Hiragana}), and many Southeast Asian scripts such
    # as Thai and Khmer.
    #
    # Without this case, the standard rule (case 2) would make each character
    # a separate token. This would be the correct behavior for word-wrapping,
    # but a messy failure mode for NLP tokenization.
    #
    # If you have Chinese or Japanese text, it's certainly better to use a
    # tokenizer that's designed for it. Elsewhere in this file, we have
    # specific tokenizers that can handle Chinese and Japanese. With this
    # rule, though, at least this general tokenizer will fail less badly
    # on those languages.
    #
    # This rule is listed first so that it takes precedence. The placeholder
    # <SPACELESS> will be replaced by the complex range expression made by
    # _make_spaceless_expr().

    [<SPACELESS>]+ |

    # Case 2: standard Unicode segmentation
    # -------------------------------------

    # The start of the token must be 'word-like', not punctuation or whitespace
    # or various other things. However, we allow characters of category So
    # (Symbol - Other) because many of these are emoji, which can convey
    # meaning.

    [\w\p{So}]

    # The rest of the token matches characters that are not any sort of space
    # (\S) and do not cause word breaks according to the Unicode word
    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).

    (?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    [<SPACELESS>]+ |
    [\p{punct}]+ |
    \S(?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.

    The expression mostly implements the rules of Unicode Annex #29 that
    are contained in the `regex` module's word boundary matching, including
    the refinement that splits words between apostrophes and vowels in order
    to separate tokens such as the French article «l'». Our customizations
    to the expression are:

    - It leaves sequences of Chinese or Japanese characters (specifically, Han
      ideograms and hiragana) relatively untokenized, instead of splitting each
      character into its own token.

    - If `include_punctuation` is False (the default), it outputs only the
      tokens that start with a word-like character, or miscellaneous symbols
      such as emoji. If `include_punctuation` is True, it outputs all non-space
      tokens.

    - It breaks on all spaces, even the "non-breaking" ones.

    - It aims to keep marks together with words, so that they aren't erroneously
      split off as punctuation in languages such as Hindi.

    - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
      tokens that are much too long, but the alternative is that every character
      would end up in its own token, which is worse.
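
    Example (illustrative; the exact output depends on the version of the
    `regex` module's Unicode data, so treat these results as a sketch):

        simple_tokenize("l'enfant est là")
        # -> ['l', 'enfant', 'est', 'là']

        simple_tokenize("Hasn't it?")
        # -> ["hasn't", 'it']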
"""
text = unicodedata.normalize('NFC', text)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.strip("'").casefold() for token in token_expr.findall(text)]
def turkish_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
    in Turkish, and modifies 'comma-below' characters to use cedillas.
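
    For example (illustrative, assuming standard Unicode case folding):

        turkish_tokenize('İSTANBUL, Iğdır')
        # -> ['istanbul', 'ığdır']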
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [
commas_to_cedillas(token.strip("'").casefold())
for token in token_expr.findall(text)
]
def romanian_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
    to use commas-below instead.
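
    For example:

        romanian_tokenize('aceşti studenţi')
        # -> ['acești', 'studenți']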
"""
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [
cedillas_to_commas(token.strip("'").casefold())
for token in token_expr.findall(text)
]
def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
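
    Requires the optional MeCab dependency. As a rough illustration (the
    exact segmentation depends on the MeCab dictionary in use):

        tokenize_mecab_language('これはテストです', 'ja')
        # -> ['これ', 'は', 'テスト', 'です']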
"""
global mecab_tokenize
if not (lang == 'ja' or lang == 'ko'):
raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
tokens = mecab_tokenize(text, lang)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
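
    Requires the optional Jieba dependency. As a rough illustration (the
    segmentation depends on the wordlist in use, so treat this as a sketch):

        chinese_tokenize('谢谢你')
        # -> ['谢谢', '你']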
"""
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_marks(text):
    """
    Remove decorations from words in abjad scripts:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.

    - Tatweels, horizontal segments that are used to extend or justify an
      Arabic word.
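
    For example (illustrative), a decorated spelling of the Arabic word for
    'book' loses its short vowel marks and tatweels:

        remove_marks('كِتَــاب')
        # -> 'كتاب'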
"""
return MARK_RE.sub('', text)
def commas_to_cedillas(text):
    """
    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
    preferred in Turkish.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
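
    For example:

        commas_to_cedillas('știu')
        # -> 'ştiu'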
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER S WITH CEDILLA}'
).replace(
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
'\N{LATIN SMALL LETTER T WITH CEDILLA}'
)
def cedillas_to_commas(text):
    """
    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
    preferred in Romanian.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
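
    For example:

        cedillas_to_commas('aşteaptă')
        # -> 'așteaptă'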
"""
return text.replace(
'\N{LATIN SMALL LETTER S WITH CEDILLA}',
'\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
).replace(
'\N{LATIN SMALL LETTER T WITH CEDILLA}',
'\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
)
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Some of the processing steps are specific to one language, such as Chinese,
    but what broadly happens to the text depends on what general writing system
    the language uses, out of these categories:

    - Alphabetic scripts: English, Spanish, Russian, etc.
    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
    - CJK scripts: Chinese, Japanese, Korean
    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

    Alphabetic scripts
    ------------------

    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
    European languages, which are relatively straightforward to tokenize.

    Text in these scripts will be normalized to NFC form, then passed
    through a regular expression that implements the Word Segmentation section
    of Unicode Annex #29, and then case-folded to lowercase.

    The effect is mostly to split the text on spaces and punctuation. There are
    some subtleties involving apostrophes inside words, which the regex will
    only split when they occur before a vowel. ("Hasn't" is one token, but
    "l'enfant" is two.)

    If the language is Turkish, the case-folding rules will take this into
    account, so that capital I and İ map to ı and i respectively.

    Abjad scripts
    -------------

    Languages in the Arabic or Hebrew scripts are written with optional vowel
    marks, and sometimes other decorative markings and ligatures. In these
    languages:

    - The text will be NFKC-normalized, which is a stronger and lossier form
      than NFC. Here its purpose is to reduce ligatures to simpler characters.

    - Marks will be removed, as well as the Arabic tatweel (an extension of
      a word that is used for justification or decoration).

    After these steps, the text will go through the same process as the
    alphabetic scripts above.

    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling.

    - Chinese text first gets converted to a canonical representation we call
      "Oversimplified Chinese", where all characters are replaced by their
      Simplified Chinese form, no matter what, even when this misspells a word
      or a name. This representation is then tokenized using the Jieba
      tokenizer, trained on the list of Chinese words that can be looked up in
      wordfreq.

    - Japanese and Korean will be NFKC-normalized, then tokenized using the
      MeCab tokenizer, using dictionary files that are included in this
      package.

    The `external_wordlist` option only affects Chinese tokenization. If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.

    Brahmic scripts and other languages
    -----------------------------------

    Any kind of language not previously mentioned will just go through the same
    tokenizer that alphabetic languages use.

    We've tweaked this tokenizer for the case of Indic languages in Brahmic
    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
    languages where the default Unicode algorithm wouldn't quite work.

    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
    does not support these languages yet. It will split on spaces and
    punctuation, giving tokens that are far too long.
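
    A couple of illustrative examples (outputs are a sketch; 'zh', 'ja', and
    'ko' additionally require the optional Jieba or MeCab dependencies):

        tokenize("l'enfant est là", 'fr')
        # -> ['l', 'enfant', 'est', 'là']

        tokenize('İSTANBUL', 'tr')
        # -> ['istanbul']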
"""
if lang == 'ja' or lang == 'ko':
return tokenize_mecab_language(text, lang, include_punctuation)
elif lang == 'zh':
return chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr':
return turkish_tokenize(text, include_punctuation)
elif lang == 'ro':
return romanian_tokenize(text, include_punctuation)
elif lang in ABJAD_LANGUAGES:
text = remove_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text, include_punctuation)
else:
return simple_tokenize(text, include_punctuation)