Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
Tokenize by graphemes, not codepoints (#50)
* Tokenize by graphemes, not codepoints
* Add more documentation to TOKEN_RE
* Remove extra line break
* Update docstring - Brahmic scripts are no longer an exception
* approve using version 2017.07.28 of regex
This commit is contained in:
parent baf6771e97
commit dcef5813b3

setup.py (4 changed lines)
@@ -27,7 +27,9 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
+dependencies = [
+    'ftfy >= 5', 'msgpack-python', 'langcodes >= 1.4', 'regex == 2017.07.28'
+]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
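The exact `regex` pin matters here: grapheme matching with `\X` and the Unicode word-boundary data that the new tokenizer relies on are bundled with the `regex` module itself. A minimal sanity check, assuming an environment with the pinned release installed (the emoji string is only an illustration):

    import regex

    # \X relies on the regex module's bundled Unicode tables; with a recent enough
    # release, an emoji plus its skin-tone modifier counts as a single grapheme.
    print(regex.__version__)             # e.g. '2017.07.28'
    print(regex.findall(r'\X', '🧕🏽'))  # expected: ['🧕🏽'] (one grapheme, two codepoints)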
@@ -137,6 +137,20 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])

+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
+
+    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
+        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    eq_(tokenize("Surf's up 🌊🏄🏴'",'en'),
+        ["surf's", "up", "🌊", "🏄", "🏴"])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
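The new test cases hinge on the difference between codepoints and graphemes: the headscarf emoji is two codepoints (the emoji plus a skin-tone modifier) but one grapheme, so it should come back as a single token. A rough usage sketch mirroring the tests, assuming a wordfreq install that includes this change:

    from wordfreq import tokenize

    print(list('🧕🏽'))                       # two codepoints: the emoji and its skin-tone modifier
    print(tokenize('emoji test 🧕🏽', 'en'))  # expected: ['emoji', 'test', '🧕🏽']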
@@ -60,6 +60,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------

+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+
+    (?=[\w\p{So}])
+
     # The start of the token must not be a letter followed by «'h». If it is,
     # we should use Case 3 to match up to the apostrophe, then match a new token
     # starting with «h». This rule lets us break «l'heure» into two tokens, just
@@ -67,18 +74,28 @@ TOKEN_RE = regex.compile(r"""

     (?!\w'[Hh])

-    # The start of the token must be 'word-like', not punctuation or whitespace
-    # or various other things. However, we allow characters of category So
-    # (Symbol - Other) because many of these are emoji, which can convey
-    # meaning.
+    # The entire token is made of graphemes (\X). Matching by graphemes means
+    # that we don't have to specially account for marks or ZWJ sequences.
+    #
+    # The token ends as soon as it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
+    \X+? \b |

-    [\w\p{So}]
-
-    # The rest of the token matches characters that are not any sort of space
-    # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-
-    (?:\B\S|\p{M})* |
+    # If we were matching by codepoints (.) instead of graphemes (\X), then
+    # detecting boundaries would be more difficult. Here's a fact that's subtle
+    # and poorly documented: a position that's between codepoints, but in the
+    # middle of a grapheme, does not match as a word break (\b), but also does
+    # not match as not-a-word-break (\B). The word boundary algorithm simply
+    # doesn't apply in such a position.
+    #
+    # We used to match the rest of the token using \S, which matches non-space
+    # *codepoints*, and this caused us to incompletely work around cases where
+    # it left off in the middle of a grapheme.
+    #
+    # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
+    # here. That's surprising, but it's also what we want, because we don't want
+    # any kind of spaces in the middle of our tokens.

     # Case 3: Fix French
     # ------------------
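To see how `\X+? \b` behaves on its own, here is a cut-down, stand-alone sketch of the Case 2 and Case 3 alternatives (the name CASE2_DEMO is made up for this illustration; the real TOKEN_RE also handles spaceless scripts and strips and casefolds tokens afterwards):

    import regex

    CASE2_DEMO = regex.compile(r"""
        (?=[\w\p{So}])    # the token starts with something word-like or Symbol-Other (emoji)
        (?!\w'[Hh])       # ... but not a letter followed by an apostrophe and 'h'
        \X+? \b |         # then whole graphemes, non-greedily, up to the first word break
        \w'               # Case 3: French elisions such as «l'»
    """, regex.V1 | regex.WORD | regex.VERBOSE)

    print(CASE2_DEMO.findall("l'heure"))          # expected: ["l'", 'heure']
    print(CASE2_DEMO.findall('emoji test 🧕🏽'))  # expected: ['emoji', 'test', '🧕🏽']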
@@ -90,9 +107,12 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    # This expression is similar to the expression above, but also matches any
+    # sequence of punctuation characters.
+
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
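In the punctuation-aware expression, the `[\p{punct}]+` alternative is what keeps runs of punctuation such as '...' and ':)' together as their own tokens, matching the include_punctuation test above. Isolating just that alternative, as an illustration:

    import regex

    # Only the punctuation alternative, compiled by itself.
    PUNCT_RUN = regex.compile(r'[\p{punct}]+', regex.V1)

    print(PUNCT_RUN.findall('this text has... punctuation :)'))  # expected: ['...', ':)']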
@@ -110,8 +130,12 @@ def simple_tokenize(text, include_punctuation=False):
     The expression mostly implements the rules of Unicode Annex #29 that
     are contained in the `regex` module's word boundary matching, including
     the refinement that splits words between apostrophes and vowels in order
-    to separate tokens such as the French article «l'». Our customizations
-    to the expression are:
+    to separate tokens such as the French article «l'».
+
+    It makes sure not to split in the middle of a grapheme, so that zero-width
+    joiners and marks on Devanagari words work correctly.
+
+    Our customizations to the expression are:

     - It leaves sequences of Chinese or Japanese characters (specifically, Han
       ideograms and hiragana) relatively untokenized, instead of splitting each
@@ -122,13 +146,8 @@ def simple_tokenize(text, include_punctuation=False):
       such as emoji. If `include_punctuation` is True, it outputs all non-space
       tokens.

-    - It breaks on all spaces, even the "non-breaking" ones.
-
-    - It aims to keep marks together with words, so that they aren't erroneously
-      split off as punctuation in languages such as Hindi.
-
     - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
-      tokens that are much too long, but the alternative is that every character
+      tokens that are much too long, but the alternative is that every grapheme
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
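As a usage sketch of what the updated docstring promises (made-up example strings; assumes a checkout that includes this commit): Devanagari marks stay attached to their words, and Thai text stays glued into one long token.

    from wordfreq.tokens import simple_tokenize

    print(simple_tokenize('नमस्ते दुनिया'))  # expected: ['नमस्ते', 'दुनिया'] (marks stay attached)
    print(simple_tokenize('สวัสดีครับ'))      # expected: ['สวัสดีครับ'] (Thai stays glued together)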
@@ -351,11 +370,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     -----------------------------------

     Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use.
-
-    We've tweaked this tokenizer for the case of Indic languages in Brahmic
-    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
-    languages where the default Unicode algorithm wouldn't quite work.
+    tokenizer that alphabetic languages use. This includes the Brahmic scripts
+    used in Hindi, Tamil, and Telugu, for example.

     Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
     written in Brahmic-derived scripts, but usually *without spaces*. wordfreq