Mirror of https://github.com/rspeer/wordfreq.git, synced 2025-01-14 13:15:59 +00:00
parent 0b5d2cdca9
commit 2c688b8238
@@ -7,10 +7,10 @@ TOKEN_RE = regex.compile(r"""
     # -----------------------------------------------
     # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # \p{Script=Hiragana}, we allow a sequence of those characters to be glued
-    # together as a single token. Without this case, the standard rule (case 2)
-    # would make each character a separate token. This would be the correct
-    # behavior for word-wrapping, but a messy failure mode for NLP
+    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
+    # glued together as a single token. Without this case, the standard rule
+    # (case 2) would make each character a separate token. This would be the
+    # correct behavior for word-wrapping, but a messy failure mode for NLP
     # tokenization.
     #
     # It is, of course, better to use a tokenizer that is designed for Chinese
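To see the behavior these comments describe in action, here is a minimal sketch using the `regex` module. SKETCH_RE and the test string are illustrative stand-ins, not wordfreq's full TOKEN_RE; only the two cases named in the comments are reproduced. With the regex.WORD flag, \b follows Unicode word-break rules, which place a break between consecutive Han ideographs, so case 2 alone would emit one token per character.

import regex  # the third-party 'regex' module, which supports \p{IsIdeo}

# Simplified two-case sketch of the structure the comments describe
# (an assumption for illustration, not wordfreq's exact pattern).
SKETCH_RE = regex.compile(r"""
    [\p{IsIdeo}\p{Script=Hiragana}]+   # case 1: glue ideograph/hiragana runs
    | [\w\p{So}] \S*? (?=\b|\s|\Z)     # case 2: standard word-like token
""", regex.V1 | regex.WORD | regex.VERBOSE)

print(SKETCH_RE.findall("日本語の tokenization"))
# -> ['日本語の', 'tokenization']   (the CJK run stays glued as one token)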
@@ -26,7 +26,8 @@ TOKEN_RE = regex.compile(r"""
 
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
-    # because many of these are emoji, which can convey meaning.
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
 
     [\w\p{So}]
 
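A quick sketch of why category So matters here (the pattern and test string are illustrative, assuming nothing beyond the character classes shown in the diff): emoji such as 👍 are in category So, which \w does not cover, so dropping \p{So} from the leading class silently discards them.

import regex

with_so = regex.compile(r'[\w\p{So}]+')
without_so = regex.compile(r'\w+')

text = "nice work 👍"
print(with_so.findall(text))     # ['nice', 'work', '👍']
print(without_so.findall(text))  # ['nice', 'work']  -- the emoji is lost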