Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex
Robyn Speer 2017-08-08 11:35:28 -04:00 committed by Andrew Lin
parent 6c118c0b6a
commit 9dac967ca3
3 changed files with 58 additions and 26 deletions


@@ -27,7 +27,9 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
+dependencies = [
+    'ftfy >= 5', 'msgpack-python', 'langcodes >= 1.4', 'regex == 2017.07.28'
+]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')


@@ -137,6 +137,20 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])

+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
+    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
+        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
+        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
+
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])


@@ -60,6 +60,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------

+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+    (?=[\w\p{So}])
+
     # The start of the token must not be a letter followed by «'h». If it is,
     # we should use Case 3 to match up to the apostrophe, then match a new token
     # starting with «h». This rule lets us break «l'heure» into two tokens, just
@@ -67,18 +74,28 @@ TOKEN_RE = regex.compile(r"""
     (?!\w'[Hh])

-    # The start of the token must be 'word-like', not punctuation or whitespace
-    # or various other things. However, we allow characters of category So
-    # (Symbol - Other) because many of these are emoji, which can convey
-    # meaning.
-
-    [\w\p{So}]
-
-    # The rest of the token matches characters that are not any sort of space
-    # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-
-    (?:\B\S|\p{M})* |
+    # The entire token is made of graphemes (\X). Matching by graphemes means
+    # that we don't have to specially account for marks or ZWJ sequences.
+    #
+    # The token ends as soon as it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
+    \X+? \b |
+
+    # If we were matching by codepoints (.) instead of graphemes (\X), then
+    # detecting boundaries would be more difficult. Here's a fact that's subtle
+    # and poorly documented: a position that's between codepoints, but in the
+    # middle of a grapheme, does not match as a word break (\b), but also does
+    # not match as not-a-word-break (\B). The word boundary algorithm simply
+    # doesn't apply in such a position.
+    #
+    # We used to match the rest of the token using \S, which matches non-space
+    # *codepoints*, and this caused us to incompletely work around cases where
+    # it left off in the middle of a grapheme.
+    #
+    # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
+    # here. That's surprising, but it's also what we want, because we don't want
+    # any kind of spaces in the middle of our tokens.

     # Case 3: Fix French
     # ------------------
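To see what the new Case 2 clause does on its own, here is a minimal sketch using the regex module. The pattern is a simplified, hypothetical extract of Case 2 only (it leaves out the spaceless-script and French clauses of the real TOKEN_RE), and the expected output is assumed to mirror the new emoji test above.

import regex

# Simplified stand-in for Case 2: start at a word-like codepoint or an emoji
# (category So), then consume whole graphemes (\X) up to the first Unicode
# word break (\b). Not the full wordfreq pattern.
CASE2_SKETCH = regex.compile(r"(?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b",
                             regex.V1 | regex.WORD | regex.VERBOSE)

# The skin-tone modifier stays glued to the emoji because \X matches the
# entire grapheme cluster.
print(CASE2_SKETCH.findall('emoji test 🧕🏽'))
# expected, per the test above: ['emoji', 'test', '🧕🏽']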
@@ -90,9 +107,12 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    # This expression is similar to the expression above, but also matches any
+    # sequence of punctuation characters.
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
@@ -110,8 +130,12 @@ def simple_tokenize(text, include_punctuation=False):
     The expression mostly implements the rules of Unicode Annex #29 that
     are contained in the `regex` module's word boundary matching, including
     the refinement that splits words between apostrophes and vowels in order
-    to separate tokens such as the French article «l'». Our customizations
-    to the expression are:
+    to separate tokens such as the French article «l'».
+
+    It makes sure not to split in the middle of a grapheme, so that zero-width
+    joiners and marks on Devanagari words work correctly.
+
+    Our customizations to the expression are:

     - It leaves sequences of Chinese or Japanese characters (specifically, Han
       ideograms and hiragana) relatively untokenized, instead of splitting each
@@ -122,13 +146,8 @@ def simple_tokenize(text, include_punctuation=False):
       such as emoji. If `include_punctuation` is True, it outputs all non-space
       tokens.

-    - It breaks on all spaces, even the "non-breaking" ones.
-
-    - It aims to keep marks together with words, so that they aren't erroneously
-      split off as punctuation in languages such as Hindi.
-
     - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
-      tokens that are much too long, but the alternative is that every character
+      tokens that are much too long, but the alternative is that every grapheme
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
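As a usage-level illustration of the grapheme-aware behavior the docstring now describes, the sketch below calls wordfreq's public tokenize function on a Devanagari phrase; the example text and its expected output are assumptions consistent with the remark about marks and zero-width joiners, not tests from this commit.

from wordfreq import tokenize

# Devanagari vowel signs and viramas are combining marks. Because matching is
# now done by graphemes, they stay attached to their words instead of being
# split off as separate tokens.
print(tokenize('नमस्ते दुनिया', 'hi'))
# assumed output: ['नमस्ते', 'दुनिया']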
@@ -351,11 +370,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     -----------------------------------

     Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use.
-
-    We've tweaked this tokenizer for the case of Indic languages in Brahmic
-    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
-    languages where the default Unicode algorithm wouldn't quite work.
+    tokenizer that alphabetic languages use. This includes the Brahmic scripts
+    used in Hindi, Tamil, and Telugu, for example.

     Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
     written in Brahmic-derived scripts, but usually *without spaces*. wordfreq