updated imports

This commit is contained in:
Joshua Chin 2015-07-07 14:46:42 -04:00
parent b1cd2e01d3
commit f2b615b0f0
2 changed files with 18 additions and 0 deletions

View File

@ -1,5 +1,22 @@
import argparse
import unicodedata
import chardata
def _emoji_char_class():
    """
    Build a regex character set (like "[a-cv-z]") matching the characters we
    consider emoji.

    A codepoint is included when its entry in chardata.CHAR_CLASS_STRING is
    '3', it is at or above U+2600, and it is not U+FFFD. Runs of consecutive
    codepoints are merged into a single "first-last" range.

    Only the bracketed character set is returned; any larger pattern (for
    example, one also matching trailing spaces and repeated characters) has
    to be assembled by the caller.
    """
    spans = []
    for codepoint, char_class in enumerate(chardata.CHAR_CLASS_STRING):
        # Skip anything that is not class '3', is below U+2600, or is U+FFFD.
        if char_class != '3' or codepoint < 0x2600 or codepoint == 0xfffd:
            continue
        if spans and spans[-1][1] + 1 == codepoint:
            # Immediately follows the previous span, so extend that span.
            spans[-1][1] = codepoint
        else:
            spans.append([codepoint, codepoint])
    pieces = ['%s-%s' % (chr(first), chr(last)) for first, last in spans]
    return '[' + ''.join(pieces) + ']'
def func_to_regex(accept):
"""

View File

@ -87,6 +87,7 @@ def tokenize(text, lang):
"""
if lang == 'ja':
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
if lang == 'ar':
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))