cleaned up BAD_CHAR_RANGE

2024-12-23 17:31:41 +00:00 · 2015-07-17 15:00:59 -04:00 · 2015-07-17 15:00:59 -04:00 · 3b368b66dd
commit 3b368b66dd
parent c2d1cdcb31
1 changed file with 11 additions and 10 deletions
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -3,16 +3,17 @@ from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
 import re
 import pycld2

-CLD2_BAD_CHAR_RANGE = "".join([
-    '[',
-    '\x00-\x08',
-    '\x0b',
-    '\x0e-\x1f',
-    '\x7f-\x9f',
-    '\ud800-\udfff',
-    '\ufdd0-\ufdef'] +
-    [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
-    [']'])
+CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
+    [
+        '\x00-\x08',
+        '\x0b',
+        '\x0e-\x1f',
+        '\x7f-\x9f',
+        '\ud800-\udfff',
+        '\ufdd0-\ufdef'
+    ] +
+    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
+)
 CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)

 TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))