From 9dac967ca39e29483478a2039bf95e4587d3f40e Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 8 Aug 2017 11:35:28 -0400
Subject: [PATCH] Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex
---
 setup.py           |  4 ++-
 tests/test.py      | 14 ++++++++++
 wordfreq/tokens.py | 66 ++++++++++++++++++++++++++++------------------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/setup.py b/setup.py
index f3af099..c3727cb 100755
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,9 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
+dependencies = [
+    'ftfy >= 5', 'msgpack-python', 'langcodes >= 1.4', 'regex == 2017.07.28'
+]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
diff --git a/tests/test.py b/tests/test.py
index b441a3a..595bc76 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -137,6 +137,20 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])
 
+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
+
+    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
+        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
+        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 2f08de6..6201c91 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -60,6 +60,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------
 
+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+
+    (?=[\w\p{So}])
+
     # The start of the token must not be a letter followed by «'h». If it is,
     # we should use Case 3 to match up to the apostrophe, then match a new token
     # starting with «h». This rule lets us break «l'heure» into two tokens, just
@@ -67,18 +74,28 @@ TOKEN_RE = regex.compile(r"""
 
     (?!\w'[Hh])
 
-    # The start of the token must be 'word-like', not punctuation or whitespace
-    # or various other things. However, we allow characters of category So
-    # (Symbol - Other) because many of these are emoji, which can convey
-    # meaning.
+    # The entire token is made of graphemes (\X). Matching by graphemes means
+    # that we don't have to specially account for marks or ZWJ sequences.
+    #
+    # The token ends as soon as it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
+    \X+? \b |
 
-    [\w\p{So}]
-
-    # The rest of the token matches characters that are not any sort of space
-    # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-
-    (?:\B\S|\p{M})* |
+    # If we were matching by codepoints (.) instead of graphemes (\X), then
+    # detecting boundaries would be more difficult. Here's a fact that's subtle
+    # and poorly documented: a position that's between codepoints, but in the
+    # middle of a grapheme, does not match as a word break (\b), but also does
+    # not match as not-a-word-break (\B). The word boundary algorithm simply
+    # doesn't apply in such a position.
+    #
+    # We used to match the rest of the token using \S, which matches non-space
+    # *codepoints*, and this caused us to incompletely work around cases where
+    # it left off in the middle of a grapheme.
+    #
+    # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
+    # here. That's surprising, but it's also what we want, because we don't want
+    # any kind of spaces in the middle of our tokens.
 
     # Case 3: Fix French
     # ------------------
@@ -90,9 +107,12 @@
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    # This expression is similar to the expression above, but also matches any
+    # sequence of punctuation characters.
+
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
@@ -110,8 +130,12 @@ def simple_tokenize(text, include_punctuation=False):
     The expression mostly implements the rules of Unicode Annex #29 that
     are contained in the `regex` module's word boundary matching, including
     the refinement that splits words between apostrophes and vowels in order
-    to separate tokens such as the French article «l'». Our customizations
-    to the expression are:
+    to separate tokens such as the French article «l'».
+
+    It makes sure not to split in the middle of a grapheme, so that zero-width
+    joiners and marks on Devanagari words work correctly.
+
+    Our customizations to the expression are:
 
     - It leaves sequences of Chinese or Japanese characters (specifically, Han
       ideograms and hiragana) relatively untokenized, instead of splitting each
@@ -122,13 +146,8 @@
     such as emoji. If `include_punctuation` is True, it outputs all non-space
     tokens.
 
-    - It breaks on all spaces, even the "non-breaking" ones.
-
-    - It aims to keep marks together with words, so that they aren't erroneously
-      split off as punctuation in languages such as Hindi.
-
     - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
-      tokens that are much too long, but the alternative is that every character
+      tokens that are much too long, but the alternative is that every grapheme
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
@@ -351,11 +370,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     -----------------------------------
 
     Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use.
-
-    We've tweaked this tokenizer for the case of Indic languages in Brahmic
-    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
-    languages where the default Unicode algorithm wouldn't quite work.
+    tokenizer that alphabetic languages use. This includes the Brahmic scripts
+    used in Hindi, Tamil, and Telugu, for example.
 
     Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
     written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
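
Not part of the patch: a small sketch, for readers of this patch, of the behavior the
grapheme-based TOKEN_RE is meant to produce. It assumes a wordfreq install that
includes this change plus the pinned regex==2017.07.28 release, and it contrasts
matching by graphemes (\X) with matching by codepoints, which is the distinction
the comments in the patched TOKEN_RE draw. The variable names are illustrative.

    # Illustrative only; assumes wordfreq (with this patch) and
    # regex == 2017.07.28 are installed.
    import regex
    from wordfreq import tokenize

    # 'David Bowie' emoji: MAN + ZERO WIDTH JOINER + MICROPHONE
    emoji = '\U0001F468\u200D\U0001F3A4'

    # \X matches one grapheme cluster, so the ZWJ sequence stays in one piece...
    print(regex.findall(r'\X', emoji))    # ['👨‍🎤']

    # ...while matching codepoint by codepoint splits it into three pieces.
    print(regex.findall(r'.', emoji))     # ['👨', '\u200d', '🎤']

    # With the grapheme-based TOKEN_RE, the sequence survives as a single token,
    # matching the new test cases above.
    print(tokenize('emoji test ' + emoji, 'en'))   # ['emoji', 'test', '👨‍🎤']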