Tokenize by graphemes, not codepoints (#50)
* Tokenize by graphemes, not codepoints
* Add more documentation to TOKEN_RE
* Remove extra line break
* Update docstring - Brahmic scripts are no longer an exception
* approve using version 2017.07.28 of regex
parent 6c118c0b6a
commit 9dac967ca3
setup.py
@@ -27,7 +27,9 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
+dependencies = [
+    'ftfy >= 5', 'msgpack-python', 'langcodes >= 1.4', 'regex == 2017.07.28'
+]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
tests/test.py
@@ -137,6 +137,20 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])
 
+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
+
+    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
+        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    eq_(tokenize("Surf's up 🌊🏄🏴'",'en'),
+        ["surf's", "up", "🌊", "🏄", "🏴"])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
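For context (an illustrative sketch, not part of the diff): the headscarf emoji in the new test is two codepoints but a single grapheme cluster, which is exactly what the `regex` module's `\X` matches, provided the installed regex release knows the Unicode 10 / Emoji 5.0 data that this commit pins.

import regex   # third-party 'regex' module, pinned to 2017.07.28 by this commit

emoji = '\U0001F9D5\U0001F3FD'        # person with headscarf + medium skin tone modifier

print(len(emoji))                     # 2 -- two codepoints
print(regex.findall(r'.', emoji))     # ['🧕', '🏽'] -- matching by codepoints splits it
print(regex.findall(r'\X', emoji))    # ['🧕🏽'] -- \X keeps the grapheme cluster whole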
wordfreq/tokens.py
@@ -60,6 +60,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------
 
+    # The start of the token must be 'word-like', not punctuation or whitespace
+    # or various other things. However, we allow characters of category So
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
+
+    (?=[\w\p{So}])
+
     # The start of the token must not be a letter followed by «'h». If it is,
     # we should use Case 3 to match up to the apostrophe, then match a new token
     # starting with «h». This rule lets us break «l'heure» into two tokens, just
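As a side note (a sketch, not part of the commit), the new `(?=[\w\p{So}])` lookahead only lets a token start on a word character or a Symbol-Other codepoint; emoji fall into the latter category. A minimal check of that condition:

import unicodedata
import regex

starts_token = regex.compile(r'(?=[\w\p{So}])', regex.V1)

print(unicodedata.category('🌎'))             # 'So' -- emoji are "Symbol, other"
print(bool(starts_token.match('🌎 blue')))    # True  -- a token may start on an emoji
print(bool(starts_token.match('... dots')))   # False -- punctuation cannot start a token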
@@ -67,18 +74,28 @@ TOKEN_RE = regex.compile(r"""
 
     (?!\w'[Hh])
 
-    # The start of the token must be 'word-like', not punctuation or whitespace
-    # or various other things. However, we allow characters of category So
-    # (Symbol - Other) because many of these are emoji, which can convey
-    # meaning.
-
-    [\w\p{So}]
-
-    # The rest of the token matches characters that are not any sort of space
-    # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-
-    (?:\B\S|\p{M})* |
+    # The entire token is made of graphemes (\X). Matching by graphemes means
+    # that we don't have to specially account for marks or ZWJ sequences.
+    #
+    # The token ends as soon as it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
+    \X+? \b |
+
+    # If we were matching by codepoints (.) instead of graphemes (\X), then
+    # detecting boundaries would be more difficult. Here's a fact that's subtle
+    # and poorly documented: a position that's between codepoints, but in the
+    # middle of a grapheme, does not match as a word break (\b), but also does
+    # not match as not-a-word-break (\B). The word boundary algorithm simply
+    # doesn't apply in such a position.
+    #
+    # We used to match the rest of the token using \S, which matches non-space
+    # *codepoints*, and this caused us to incompletely work around cases where
+    # it left off in the middle of a grapheme.
+    #
+    # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
+    # here. That's surprising, but it's also what we want, because we don't want
+    # any kind of spaces in the middle of our tokens.
 
     # Case 3: Fix French
     # ------------------
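To illustrate the `\X+? \b` branch added above (a sketch, not part of the commit; the sample words and the expected output are my own), matching whole graphemes keeps combining marks and emoji modifier sequences inside their tokens:

import regex

# A stripped-down version of Case 2: start on a word-like character or emoji,
# then take whole graphemes up to the first Unicode word break.
core = regex.compile(r'(?=[\w\p{So}]) \X+? \b',
                     regex.V1 | regex.WORD | regex.VERBOSE)

print(core.findall('नमस्ते friend 🧕🏽'))
# expected, with an up-to-date regex: ['नमस्ते', 'friend', '🧕🏽']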
@@ -90,9 +107,12 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    # This expression is similar to the expression above, but also matches any
+    # sequence of punctuation characters.
+
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
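A usage sketch (not part of the diff) that mirrors the tests earlier in this commit, exercising the punctuation branch through the public `tokenize` function:

from wordfreq import tokenize

print(tokenize('this text has... punctuation :)', 'en', include_punctuation=True))
# ['this', 'text', 'has', '...', 'punctuation', ':)']

print(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'))
# ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
#  'nothing', 'i', 'can', 'do', '🌎', '🚀']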
@@ -110,8 +130,12 @@ def simple_tokenize(text, include_punctuation=False):
     The expression mostly implements the rules of Unicode Annex #29 that
     are contained in the `regex` module's word boundary matching, including
     the refinement that splits words between apostrophes and vowels in order
-    to separate tokens such as the French article «l'». Our customizations
-    to the expression are:
+    to separate tokens such as the French article «l'».
+
+    It makes sure not to split in the middle of a grapheme, so that zero-width
+    joiners and marks on Devanagari words work correctly.
+
+    Our customizations to the expression are:
 
     - It leaves sequences of Chinese or Japanese characters (specifically, Han
       ideograms and hiragana) relatively untokenized, instead of splitting each
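A small sketch (not part of the commit) of the docstring's new claim that grapheme matching keeps Devanagari marks attached to their words; the sample text is my own and the output shown is the expected behaviour, not a test from this commit:

from wordfreq.tokens import simple_tokenize

print(simple_tokenize('नमस्ते दुनिया'))
# expected: ['नमस्ते', 'दुनिया'] -- the combining marks are not split off as punctuation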
@@ -122,13 +146,8 @@ def simple_tokenize(text, include_punctuation=False):
       such as emoji. If `include_punctuation` is True, it outputs all non-space
       tokens.
 
-    - It breaks on all spaces, even the "non-breaking" ones.
-
-    - It aims to keep marks together with words, so that they aren't erroneously
-      split off as punctuation in languages such as Hindi.
-
     - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
-      tokens that are much too long, but the alternative is that every character
+      tokens that are much too long, but the alternative is that every grapheme
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
@@ -351,11 +370,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     -----------------------------------
 
     Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use.
-
-    We've tweaked this tokenizer for the case of Indic languages in Brahmic
-    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
-    languages where the default Unicode algorithm wouldn't quite work.
+    tokenizer that alphabetic languages use. This includes the Brahmic scripts
+    used in Hindi, Tamil, and Telugu, for example.
 
     Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
     written in Brahmic-derived scripts, but usually *without spaces*. wordfreq