diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 6b9107d..4bca6d4 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -4,13 +4,15 @@ import re import pycld2 CLD2_BAD_CHAR_RANGE = "".join([ + '[', '\x00-\x08', '\x0b', '\x0e-\x1f', '\x7f-\x9f', '\ud800-\udfff', '\ufdd0-\ufdef'] + - [chr(65534+65536*x+y) for x in range(17) for y in range(2)]) + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] + + [']']) CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE) TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))