From 8637aaef9e805e5ae49fc56e0abfed55715b44ec Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Mon, 24 Aug 2015 19:05:46 -0400
Subject: [PATCH] use better regexes in wordfreq_builder tokenizer

Former-commit-id: de73888a761238cc4147f1e0ceeb0ad4469a93e7
---
 wordfreq_builder/wordfreq_builder/tokenizers.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 8e9f192..997c0c8 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -1,7 +1,7 @@
 from html.entities import name2codepoint
-from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
+from wordfreq import tokenize
 from ftfy.fixes import unescape_html
-import re
+import regex
 import pycld2
 
 CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
@@ -18,10 +18,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
     ] +
     [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
-CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
+CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 
-TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
-TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
+TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
 
 
 def cld2_surface_tokenizer(text):
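
Note (not part of the patch): the sketch below only illustrates what the new patterns match.
It assumes the third-party regex module; regex.V1 is passed explicitly here because set
subtraction inside a character class (as in [\S--\p{punct}]) is a Version 1 feature of that
module, and the sample tweet and the pre-cleaning step are made up for illustration rather
than taken from wordfreq_builder itself.

    import regex

    # Same patterns as in the patch; the V1 flag is an assumption added for this sketch.
    TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+', regex.V1)
    TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')

    tweet = "thanks @example for the link https://t.co/Abc123"

    # A handle is '@' followed by characters that are non-whitespace and not punctuation.
    print(TWITTER_HANDLE_RE.findall(tweet))   # ['@example']

    # t.co short links are matched so they can be stripped before counting words.
    print(TCO_RE.findall(tweet))              # ['https://t.co/Abc123']

    # Hypothetical pre-cleaning step: drop handles and links before tokenizing.
    cleaned = TCO_RE.sub('', TWITTER_HANDLE_RE.sub('', tweet))
    print(cleaned)                            # 'thanks  for the link '

Compared with the old '@{0}+'.format(NON_PUNCT_RANGE) pattern, the new expression no longer
depends on wordfreq's internal NON_PUNCT_RANGE constant, which is why the import of TOKEN_RE
and NON_PUNCT_RANGE can be removed in the first hunk.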