import regex
import unicodedata


mecab_tokenize = None
jieba_tokenize = None

# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
    'Hiragana',
    'Thai',   # Thai script
    'Khmr',   # Khmer script
    'Laoo',   # Lao script
    'Mymr',   # Burmese script
    'Tale',   # Tai Le script
    'Talu',   # Tai Lü script
    'Lana',   # Lanna script
]

ABJAD_LANGUAGES = {
    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
}


def _make_spaceless_expr():
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
    return ''.join(pieces)


SPACELESS_EXPR = _make_spaceless_expr()
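# SPACELESS_EXPR is a plain concatenation of property escapes, roughly
# r'\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}...', intended to be dropped
# into a character class such as [<SPACELESS>]+ in TOKEN_RE below.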


TOKEN_RE = regex.compile(r"""
    # Case 1: a special case for non-spaced languages
    # -----------------------------------------------

    # Some scripts are written without spaces, and the Unicode algorithm
    # seems to overreact and insert word breaks between all their letters.
    # When we see sequences of characters in these scripts, we make sure not
    # to break them up. Such scripts include Han ideographs (\p{IsIdeo}),
    # hiragana (\p{Script=Hiragana}), and many Southeast Asian scripts such
    # as Thai and Khmer.
    #
    # Without this case, the standard rule (case 2) would make each character
    # a separate token. This would be the correct behavior for word-wrapping,
    # but a messy failure mode for NLP tokenization.
    #
    # If you have Chinese or Japanese text, it's certainly better to use a
    # tokenizer that's designed for it. Elsewhere in this file, we have
    # specific tokenizers that can handle Chinese and Japanese. With this
    # rule, though, at least this general tokenizer will fail less badly
    # on those languages.
    #
    # This rule is listed first so that it takes precedence. The placeholder
    # <SPACELESS> will be replaced by the complex range expression made by
    # _make_spaceless_expr().

    [<SPACELESS>]+ |

    # Case 2: standard Unicode segmentation
    # -------------------------------------

    # The start of the token must be 'word-like', not punctuation or whitespace
    # or various other things. However, we allow characters of category So
    # (Symbol - Other) because many of these are emoji, which can convey
    # meaning.

    [\w\p{So}]

    # The rest of the token matches characters that are not any sort of space
    # (\S) and do not cause word breaks according to the Unicode word
    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).

    (?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    [<SPACELESS>]+ |
    [\p{punct}]+ |
    \S(?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
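# A quick sketch of how the two token expressions differ (expected behavior
# inferred from the patterns above, not captured output):
#
#     TOKEN_RE.findall("Hello, world!")
#     # -> ['Hello', 'world']              (punctuation is skipped)
#     TOKEN_RE_WITH_PUNCTUATION.findall("Hello, world!")
#     # -> ['Hello', ',', 'world', '!']    (punctuation kept as tokens)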


def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.

    The expression mostly implements the rules of Unicode Annex #29 that
    are contained in the `regex` module's word boundary matching, including
    the refinement that splits words between apostrophes and vowels in order
    to separate tokens such as the French article «l'». Our customizations
    to the expression are:

    - It leaves sequences of Chinese or Japanese characters (specifically, Han
      ideograms and hiragana) relatively untokenized, instead of splitting each
      character into its own token.

    - If `include_punctuation` is False (the default), it outputs only the
      tokens that start with a word-like character, or miscellaneous symbols
      such as emoji. If `include_punctuation` is True, it outputs all non-space
      tokens.

    - It breaks on all spaces, even the "non-breaking" ones.

    - It aims to keep marks together with words, so that they aren't
      erroneously split off as punctuation in languages such as Hindi.

    - It keeps Southeast Asian scripts, such as Thai, glued together. This
      yields tokens that are much too long, but the alternative is that every
      character would end up in its own token, which is worse.
    """
    text = unicodedata.normalize('NFC', text)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.strip("'").casefold() for token in token_expr.findall(text)]
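# Example for simple_tokenize (illustrative; the expected output is inferred
# from the docstring above rather than captured from a run):
#
#     simple_tokenize("Hasn't l'enfant arrived? 😀")
#     # -> ["hasn't", 'l', 'enfant', 'arrived', '😀']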


def turkish_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
    in Turkish, and modifies 'comma-below' characters to use cedillas.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [
        commas_to_cedillas(token.strip("'").casefold())
        for token in token_expr.findall(text)
    ]
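# Example for turkish_tokenize (illustrative, based on the replacements above;
# not captured output): dotted and dotless i case-fold the Turkish way, so
# 'İstanbul' becomes 'istanbul' and 'ISPARTA' becomes 'ısparta'.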


def romanian_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
    to use commas-below instead.
    """
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [
        cedillas_to_commas(token.strip("'").casefold())
        for token in token_expr.findall(text)
    ]
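# Example for romanian_tokenize (illustrative; expected behavior, not captured
# output): legacy cedilla spellings are normalized to the preferred
# comma-below letters, so 'aţi' (with ţ-cedilla) comes out as 'ați'.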


def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if
    necessary.
    """
    global mecab_tokenize
    if not (lang == 'ja' or lang == 'ko'):
        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
    if mecab_tokenize is None:
        from wordfreq.mecab import mecab_tokenize
    tokens = mecab_tokenize(text, lang)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]
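# Example for tokenize_mecab_language (illustrative; the exact segmentation
# depends on the MeCab dictionary in use):
#
#     tokenize_mecab_language('これはテストです', 'ja')
#     # -> something like ['これ', 'は', 'テスト', 'です']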


def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
    """
    global jieba_tokenize
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize
    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]
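# Example for chinese_tokenize (illustrative; the exact segmentation depends
# on the wordlist Jieba is given):
#
#     chinese_tokenize('谢谢你')
#     # -> something like ['谢谢', '你']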


def remove_marks(text):
    """
    Remove decorations from words in abjad scripts:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify an
      Arabic word.
    """
    return MARK_RE.sub('', text)
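# Example for remove_marks (illustrative, based on MARK_RE above): a tatweel
# (U+0640) is dropped, so remove_marks('كـتاب') gives 'كتاب'; Arabic
# short-vowel marks (category Mn) are removed the same way.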


def commas_to_cedillas(text):
    """
    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
    preferred in Turkish.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
    )


def cedillas_to_commas(text):
    """
    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
    preferred in Romanian.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
    )
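# The two helpers above are inverses on lowercase text: commas_to_cedillas('ș')
# gives 'ş' and cedillas_to_commas('ş') gives 'ș', and likewise for ț/ţ.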


def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Some of the processing steps are specific to one language, such as Chinese,
    but what broadly happens to the text depends on what general writing system
    the language uses, out of these categories:

    - Alphabetic scripts: English, Spanish, Russian, etc.
    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
    - CJK scripts: Chinese, Japanese, Korean
    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.


    Alphabetic scripts
    ------------------

    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
    European languages, which are relatively straightforward to tokenize.

    Text in these scripts will be normalized to NFC form, then passed
    through a regular expression that implements the Word Segmentation section
    of Unicode Annex #29, and then case-folded to lowercase.

    The effect is mostly to split the text on spaces and punctuation. There are
    some subtleties involving apostrophes inside words, which the regex will
    only split when they occur before a vowel. ("Hasn't" is one token, but
    "l'enfant" is two.)

    If the language is Turkish, the case-folding rules will take this into
    account, so that capital I and İ map to ı and i respectively.


    Abjad scripts
    -------------

    Languages in the Arabic or Hebrew scripts are written with optional vowel
    marks, and sometimes other decorative markings and ligatures. In these
    languages:

    - The text will be NFKC-normalized, which is a stronger and lossier form
      than NFC. Here its purpose is to reduce ligatures to simpler characters.

    - Marks will be removed, as well as the Arabic tatweel (an extension of
      a word that is used for justification or decoration).

    After these steps, the text will go through the same process as the
    alphabetic scripts above.


    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling.

    - Chinese text first gets converted to a canonical representation we call
      "Oversimplified Chinese", where all characters are replaced by their
      Simplified Chinese form, no matter what, even when this misspells a word
      or a name. This representation is then tokenized using the Jieba
      tokenizer, trained on the list of Chinese words that can be looked up in
      wordfreq.

    - Japanese and Korean will be NFKC-normalized, then tokenized using the
      MeCab tokenizer, using dictionary files that are included in this
      package.

    The `external_wordlist` option only affects Chinese tokenization. If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.


    Brahmic scripts and other languages
    -----------------------------------

    Any kind of language not previously mentioned will just go through the same
    tokenizer that alphabetic languages use.

    We've tweaked this tokenizer for the case of Indic languages in Brahmic
    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
    languages where the default Unicode algorithm wouldn't quite work.

    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
    does not support these languages yet. It will split on spaces and
    punctuation, giving tokens that are far too long.
    """
    if lang == 'ja' or lang == 'ko':
        return tokenize_mecab_language(text, lang, include_punctuation)
    elif lang == 'zh':
        return chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        return turkish_tokenize(text, include_punctuation)
    elif lang == 'ro':
        return romanian_tokenize(text, include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        return simple_tokenize(text, include_punctuation)
    else:
        return simple_tokenize(text, include_punctuation)
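# Usage sketch for tokenize (illustrative; the tokens shown are expectations
# based on the docstring above, not captured output):
#
#     tokenize("l'enfant terrible", 'fr')
#     # -> ['l', 'enfant', 'terrible']
#     tokenize('THIS IS A TEST', 'en')
#     # -> ['this', 'is', 'a', 'test']
#     tokenize('これはテストです', 'ja')   # dispatches to the MeCab tokenizer
#     tokenize('谢谢你', 'zh')             # dispatches to the Jieba tokenizer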