From 2c688b823847b13ddd6f4cb546d32f8a398beabd Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Wed, 26 Aug 2015 17:04:56 -0400
Subject: [PATCH] copyedit regex comments

Former-commit-id: d5fcf4407e7e27eab7feea748670983fa9153bcc
---
 wordfreq/tokens.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index d867d9a..eb2c631 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -7,10 +7,10 @@ TOKEN_RE = regex.compile(r"""
     # -----------------------------------------------
 
     # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # \p{Script=Hiragana}, we allow a sequence of those characters to be glued
-    # together as a single token. Without this case, the standard rule (case 2)
-    # would make each character a separate token. This would be the correct
-    # behavior for word-wrapping, but a messy failure mode for NLP
+    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
+    # glued together as a single token. Without this case, the standard rule
+    # (case 2) would make each character a separate token. This would be the
+    # correct behavior for word-wrapping, but a messy failure mode for NLP
     # tokenization.
     #
     # It is, of course, better to use a tokenizer that is designed for Chinese
@@ -26,7 +26,8 @@ TOKEN_RE = regex.compile(r"""
 
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
-    # because many of these are emoji, which can convey meaning.
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
 
     [\w\p{So}]
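
For illustration, not part of the patch itself: a minimal sketch of the two
tokenization cases these comments describe, using the third-party `regex`
module that TOKEN_RE is compiled with. DEMO_TOKEN_RE below is a hypothetical,
simplified stand-in for the real TOKEN_RE (which has more branches and flags),
kept just close enough to show the behavior the comments are talking about.

    import regex

    # Simplified stand-in for wordfreq's TOKEN_RE, covering only the two
    # cases discussed in the comments above.
    DEMO_TOKEN_RE = regex.compile(r"""
        [\p{IsIdeo}\p{Script=Hiragana}]+  # case 1: glue ideographs and hiragana
        |
        [\w\p{So}]\w*                     # case 2: word-like start; So allows emoji
    """, regex.VERBOSE)

    # A mixed Han/hiragana run comes out as one glued token,
    # not one token per character.
    print(DEMO_TOKEN_RE.findall('これは日本語です'))

    # An emoji (category So) survives as a token instead of being
    # skipped like punctuation.
    print(DEMO_TOKEN_RE.findall('I like 🌮'))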