Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex
Robyn Speer 2017-08-08 11:35:28 -04:00 committed by Andrew Lin
parent 6c118c0b6a
commit 9dac967ca3
3 changed files with 58 additions and 26 deletions


@@ -27,7 +27,9 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
+dependencies = [
+    'ftfy >= 5', 'msgpack-python', 'langcodes >= 1.4', 'regex == 2017.07.28'
+]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')


@@ -137,6 +137,20 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])

+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
+    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
+        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
+        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
+
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])


@@ -60,6 +60,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------

+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+    (?=[\w\p{So}])
+
     # The start of the token must not be a letter followed by «'h». If it is,
     # we should use Case 3 to match up to the apostrophe, then match a new token
     # starting with «h». This rule lets us break «l'heure» into two tokens, just
@@ -67,18 +74,28 @@ TOKEN_RE = regex.compile(r"""
     (?!\w'[Hh])

-    # The start of the token must be 'word-like', not punctuation or whitespace
-    # or various other things. However, we allow characters of category So
-    # (Symbol - Other) because many of these are emoji, which can convey
-    # meaning.
-
-    [\w\p{So}]
-
-    # The rest of the token matches characters that are not any sort of space
-    # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-
-    (?:\B\S|\p{M})* |
+    # The entire token is made of graphemes (\X). Matching by graphemes means
+    # that we don't have to specially account for marks or ZWJ sequences.
+    #
+    # The token ends as soon as it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
+    \X+? \b |
+
+    # If we were matching by codepoints (.) instead of graphemes (\X), then
+    # detecting boundaries would be more difficult. Here's a fact that's subtle
+    # and poorly documented: a position that's between codepoints, but in the
+    # middle of a grapheme, does not match as a word break (\b), but also does
+    # not match as not-a-word-break (\B). The word boundary algorithm simply
+    # doesn't apply in such a position.
+    #
+    # We used to match the rest of the token using \S, which matches non-space
+    # *codepoints*, and this caused us to incompletely work around cases where
+    # it left off in the middle of a grapheme.
+    #
+    # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
+    # here. That's surprising, but it's also what we want, because we don't want
+    # any kind of spaces in the middle of our tokens.

     # Case 3: Fix French
     # ------------------
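To see what the new Case 2 clause does on its own, here is a minimal sketch using the regex module. The pattern is a simplified, hypothetical extract of Case 2 only (it leaves out the spaceless-script and French clauses of the real TOKEN_RE), and the expected output is assumed to mirror the new emoji test above.

import regex

# Simplified stand-in for Case 2: start at a word-like codepoint or an emoji
# (category So), then consume whole graphemes (\X) up to the first Unicode
# word break (\b). Not the full wordfreq pattern.
CASE2_SKETCH = regex.compile(r"(?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b",
                             regex.V1 | regex.WORD | regex.VERBOSE)

# The skin-tone modifier stays glued to the emoji because \X matches the
# entire grapheme cluster.
print(CASE2_SKETCH.findall('emoji test 🧕🏽'))
# expected, per the test above: ['emoji', 'test', '🧕🏽']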
@@ -90,9 +107,12 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    # This expression is similar to the expression above, but also matches any
+    # sequence of punctuation characters.
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
@@ -110,8 +130,12 @@ def simple_tokenize(text, include_punctuation=False):
     The expression mostly implements the rules of Unicode Annex #29 that
     are contained in the `regex` module's word boundary matching, including
     the refinement that splits words between apostrophes and vowels in order
-    to separate tokens such as the French article «l'». Our customizations
-    to the expression are:
+    to separate tokens such as the French article «l'».
+
+    It makes sure not to split in the middle of a grapheme, so that zero-width
+    joiners and marks on Devanagari words work correctly.
+
+    Our customizations to the expression are:

     - It leaves sequences of Chinese or Japanese characters (specifically, Han
       ideograms and hiragana) relatively untokenized, instead of splitting each
@@ -122,13 +146,8 @@ def simple_tokenize(text, include_punctuation=False):
       such as emoji. If `include_punctuation` is True, it outputs all non-space
       tokens.

-    - It breaks on all spaces, even the "non-breaking" ones.
-
-    - It aims to keep marks together with words, so that they aren't erroneously
-      split off as punctuation in languages such as Hindi.
-
     - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
-      tokens that are much too long, but the alternative is that every character
+      tokens that are much too long, but the alternative is that every grapheme
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
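As a usage-level illustration of the grapheme-aware behavior the docstring now describes, the sketch below calls wordfreq's public tokenize function on a Devanagari phrase; the example text and its expected output are assumptions consistent with the remark about marks and zero-width joiners, not tests from this commit.

from wordfreq import tokenize

# Devanagari vowel signs and viramas are combining marks. Because matching is
# now done by graphemes, they stay attached to their words instead of being
# split off as separate tokens.
print(tokenize('नमस्ते दुनिया', 'hi'))
# assumed output: ['नमस्ते', 'दुनिया']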
@@ -351,11 +370,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     -----------------------------------

     Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use.
-
-    We've tweaked this tokenizer for the case of Indic languages in Brahmic
-    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
-    languages where the default Unicode algorithm wouldn't quite work.
+    tokenizer that alphabetic languages use. This includes the Brahmic scripts
+    used in Hindi, Tamil, and Telugu, for example.

     Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
     written in Brahmic-derived scripts, but usually *without spaces*. wordfreq