mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)
parent 91a81c1bde
commit 5fedd71a66
@@ -84,6 +84,7 @@ def turkish_tokenize(text, include_punctuation=False):
     return [token.strip("'").casefold() for token in token_expr.findall(text)]
 
 
+mecab_tokenize = None
 def japanese_tokenize(text, include_punctuation=False):
     global mecab_tokenize
     if mecab_tokenize is None:
@@ -93,6 +94,7 @@ def japanese_tokenize(text, include_punctuation=False):
     return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
+jieba_tokenize = None
 def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     global jieba_tokenize
     if jieba_tokenize is None:
@@ -114,8 +116,6 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)
 
 
-mecab_tokenize = None
-jieba_tokenize = None
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
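
The change itself is small: the mecab_tokenize = None and jieba_tokenize = None sentinels move from just above tokenize (removed in the last hunk) to directly above the functions that fill them in (added in the first two hunks). Those functions lazy-load their backends: the global declaration plus the "if ... is None" guard means MeCab and jieba are only imported the first time Japanese or Chinese text is tokenized. A minimal, self-contained sketch of that pattern follows; the body of the guard is an assumption for illustration (a regex stand-in rather than the real MeCab import, which is not visible in the diff context):

import re

mecab_tokenize = None  # lazy-loading sentinel, placed next to its function as in the diff


def japanese_tokenize(text, include_punctuation=False):
    # include_punctuation switches token regexes in the real code; ignored in this sketch.
    global mecab_tokenize
    if mecab_tokenize is None:
        # Stand-in for the deferred MeCab import: build the tokenizer once and
        # cache it at module level so every later call skips this branch.
        mecab_tokenize = re.compile(r'\w+', re.UNICODE).findall
    tokens = mecab_tokenize(text)
    return [token.casefold() for token in tokens]


print(japanese_tokenize('MeCab Lazy Loading'))  # ['mecab', 'lazy', 'loading']

Keeping the sentinel, the global statement, and the deferred import in one place is the readability payoff of the move: the lazy-loading state and the code that manages it now sit together.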
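
For callers, the entry point is the dispatcher shown at the bottom of the last hunk. A hedged usage sketch, assuming a package-level re-export (from wordfreq import tokenize) that the diff itself does not show:

from wordfreq import tokenize

print(tokenize('Diffs move code, not behavior.', 'en'))
# Expected: a list of casefolded word tokens, roughly
# ['diffs', 'move', 'code', 'not', 'behavior']; punctuation is dropped
# unless include_punctuation=True.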