From 2c688b823847b13ddd6f4cb546d32f8a398beabd Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Wed, 26 Aug 2015 17:04:56 -0400
Subject: [PATCH] copyedit regex comments

Former-commit-id: d5fcf4407e7e27eab7feea748670983fa9153bcc
---
 wordfreq/tokens.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index d867d9a..eb2c631 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -7,10 +7,10 @@ TOKEN_RE = regex.compile(r"""
     # -----------------------------------------------
 
     # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # \p{Script=Hiragana}, we allow a sequence of those characters to be glued
-    # together as a single token. Without this case, the standard rule (case 2)
-    # would make each character a separate token. This would be the correct
-    # behavior for word-wrapping, but a messy failure mode for NLP
+    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
+    # glued together as a single token. Without this case, the standard rule
+    # (case 2) would make each character a separate token. This would be the
+    # correct behavior for word-wrapping, but a messy failure mode for NLP
     # tokenization.
     #
     # It is, of course, better to use a tokenizer that is designed for Chinese
@@ -26,7 +26,8 @@ TOKEN_RE = regex.compile(r"""
 
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
-    # because many of these are emoji, which can convey meaning.
+    # (Symbol - Other) because many of these are emoji, which can convey
+    # meaning.
 
     [\w\p{So}]
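
For illustration, not part of the patch itself: a minimal sketch of the two
tokenization cases these comments describe, using the third-party `regex`
module that TOKEN_RE is compiled with. DEMO_TOKEN_RE below is a hypothetical,
simplified stand-in for the real TOKEN_RE (which has more branches and flags),
kept just close enough to show the behavior the comments are talking about.

    import regex

    # Simplified stand-in for wordfreq's TOKEN_RE, covering only the two
    # cases discussed in the comments above.
    DEMO_TOKEN_RE = regex.compile(r"""
        [\p{IsIdeo}\p{Script=Hiragana}]+  # case 1: glue ideographs and hiragana
        |
        [\w\p{So}]\w*                     # case 2: word-like start; So allows emoji
    """, regex.VERBOSE)

    # A mixed Han/hiragana run comes out as one glued token,
    # not one token per character.
    print(DEMO_TOKEN_RE.findall('これは日本語です'))

    # An emoji (category So) survives as a token instead of being
    # skipped like punctuation.
    print(DEMO_TOKEN_RE.findall('I like 🌮'))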