mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
parent
c5135edd88
commit
3d221f0605
@ -1,5 +1,22 @@
|
||||
import argparse
|
||||
import unicodedata
|
||||
import chardata
|
||||
|
||||
def _emoji_char_class():
    """
    Build the regex character set (like "[a-cv-z]") that matches the
    characters we consider emoji.

    A codepoint is included when its entry in chardata.CHAR_CLASS_STRING is
    '3', it is at or above U+2600, and it is not U+FFFD. Runs of consecutive
    codepoints are collapsed into a single "first-last" range.

    This set is the building block for the emoji substitution regex, which
    matches one such character followed by any number of spaces and identical
    characters.
    """
    spans = []
    for codepoint, char_class in enumerate(chardata.CHAR_CLASS_STRING):
        # Skip everything that is not class '3', is below U+2600, or is the
        # excluded codepoint U+FFFD.
        if char_class != '3' or codepoint < 0x2600 or codepoint == 0xfffd:
            continue
        if spans and spans[-1][1] + 1 == codepoint:
            # Directly follows the previous span: extend it by one.
            spans[-1][1] = codepoint
        else:
            spans.append([codepoint, codepoint])
    pieces = [chr(first) + '-' + chr(last) for first, last in spans]
    return '[%s]' % ''.join(pieces)
|
||||
|
||||
def func_to_regex(accept):
|
||||
"""
|
||||
|
@ -87,6 +87,7 @@ def tokenize(text, lang):
|
||||
"""
|
||||
if lang == 'ja':
|
||||
from wordfreq.mecab import mecab_tokenize
|
||||
return mecab_tokenize(text)
|
||||
|
||||
if lang == 'ar':
|
||||
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||
|
Loading…
Reference in New Issue
Block a user