mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent 3a140ee02f
commit 7fa449729b
@@ -2,51 +2,64 @@ import regex
 import unicodedata
 
 
-# Here's what the following regular expression is looking for:
-#
-# At the start, it looks for a character in the set \S -- the set of
-# non-punctuation -- with various characters subtracted out, including
-# punctuation and most of the 'symbol' categories. (We leave So, "Symbol -
-# Other", because it contains things like emoji that have interesting
-# frequencies. This is why we don't just insist on the token starting with a
-# "word" character, \w.)
-#
-# WB=Extend is a Unicode property that says, for the purpose of word breaking,
-# that this character should get the word-breaking properties of the previous
-# character. It's used for combining marks and stuff. If it shows up at the
-# beginning of the token, something has gone wrong, so exclude it as a token.
-#
-# After it has found a starting character, the rest of the token matches
-# (?:\B\S)*, which continues to consume characters as long as the next
-# character does not cause a word break (\B) and is not a space (\S). The
-# individual characters in this portion can be punctuation, allowing tokens
-# such as "can't" or "google.com".
-#
-# As a complication, the rest of the token can match a glob of Han ideographs
-# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
-# Han ideographs (but we don't know where the breaks between them are).
-# Similarly, Japanese words are either made of Han ideographs and hiragana
-# (which will be matched by this expression), or katakana (which will be
-# matched by the standard Unicode rule).
-#
-# Without this special case for ideographs and hiragana, the standard Unicode
-# rule would put each character in its own token. This actually would be the
-# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
-# tokenization.
-TOKEN_RE = regex.compile(
-    r'[\S--[\p{punct}\p{Sm}\p{Sc}\p{Sk}\p{WB=Extend}]]'
-    r'(?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
+TOKEN_RE = regex.compile(r"""
+    # Case 1: a special case for Chinese and Japanese
+    # -----------------------------------------------
+
+    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
+    # (\p{Script=Hiragana}), we allow a sequence of those characters to be glued
+    # together as a single token. Without this case, the standard rule (case 2)
+    # would make each character a separate token. This would be the correct
+    # behavior for word-wrapping, but a messy failure mode for NLP
+    # tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese
+    # or Japanese text. This is effectively a fallback for when the wrong
+    # tokenizer is used.
+    #
+    # This rule is listed first so that it takes precedence.
+
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+
+    # Case 2: standard Unicode segmentation
+    # -------------------------------------
+
+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # because many of these are emoji, which can convey meaning.
+
+    [\w\p{So}]
+
+    # The rest of the token matches characters that are not any sort of space
+    # (\S) and do not cause word breaks according to the Unicode word
+    # segmentation heuristic (\B).
+
+    (?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
 
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
 
 def simple_tokenize(text):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
-    expression. It returns non-whitespace tokens that are split at the
-    word boundaries defined by Unicode Tech Report #29, as implemented
-    by the regex package, except that it leaves Chinese and Japanese
-    relatively untokenized.
+    expression.
+
+    The expression mostly implements the rules of Unicode Annex #29 that
+    are contained in the `regex` module's word boundary matching, including
+    the refinement that splits words between apostrophes and vowels in order
+    to separate tokens such as the French article «l'». Our customizations
+    to the expression are:
+
+    - It leaves sequences of Chinese or Japanese characters (specifically, Han
+      ideograms and hiragana) relatively untokenized, instead of splitting each
+      character into its own token.
+
+    - It excludes punctuation, many classes of symbols, and "extenders" with
+      nothing to extend, from being tokens, but it allows miscellaneous symbols
+      such as emoji.
+
+    - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
     return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
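To make the behavior of the rewritten TOKEN_RE concrete, here is a minimal usage sketch. It assumes the `regex` package is installed and that `simple_tokenize` is importable from the top-level `wordfreq` package, as in released versions; the expected outputs in the comments are illustrative, not taken from this commit.

from wordfreq import simple_tokenize

# Case 2 (standard Unicode segmentation) with the regex.WORD refinement:
# "can't" keeps its internal apostrophe, while the French article «l'» is
# split off and its trailing apostrophe is removed by token.strip("'").
print(simple_tokenize("Can't we parse l'heure?"))
# expected roughly: ["can't", 'we', 'parse', 'l', 'heure']

# Case 1 (the Chinese/Japanese fallback): runs of Han ideographs and hiragana
# are glued into single tokens, while the katakana word テスト is handled by
# the standard rule.
print(simple_tokenize('これはテストです'))
# expected roughly: ['これは', 'テスト', 'です']

# Symbols of category So, such as emoji, are allowed to be tokens.
print(simple_tokenize('I 💖 Unicode'))
# expected roughly: ['i', '💖', 'unicode']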
@@ -77,7 +90,9 @@ def tokenize(text, lang):
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
-    - All other languages will be tokenized according to UTR #29.
+    - All other languages will be tokenized using a regex that mostly
+      implements the Word Segmentation section of Unicode Annex #29.
+      See `simple_tokenize` for details.
 
     Additionally, the text will be case-folded to lowercase, and text marked
     as Arabic will be normalized more strongly and have combining marks and
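As a hedged sketch of how the updated `tokenize(text, lang)` docstring plays out in practice, the calls below assume `tokenize` is exported from the top-level `wordfreq` package as in released versions, and that MeCab (an optional dependency) is available for Japanese; the outputs are illustrative.

from wordfreq import tokenize

# Japanese labeled as Japanese is handed to a real Japanese tokenizer (MeCab),
# assuming that optional dependency is installed.
print(tokenize('おはようございます', 'ja'))

# The same text labeled as English falls back to simple_tokenize, so the
# hiragana run comes back as a single untokenized glob.
print(tokenize('おはようございます', 'en'))
# expected roughly: ['おはようございます']

# Other languages go through the Annex #29-based regex described above.
print(tokenize("l'esprit de l'escalier", 'fr'))
# expected roughly: ['l', 'esprit', 'de', 'l', 'escalier']

# Arabic gets extra normalization: combining marks and tatweel characters are
# expected to be stripped (see ARABIC_MARK_RE above).
print(tokenize('العـــربية', 'ar'))
# expected roughly: ['العربية']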