Use langcodes when tokenizing again (it no longer connects to a DB)

Robyn Speer 2017-04-27 15:09:59 -04:00
parent ae7bc5764b
commit 71a0ad6abb
4 changed files with 42 additions and 7 deletions


@@ -66,3 +66,22 @@ def test_combination():
         word_frequency('谢谢谢谢', 'zh'),
         xiexie_freq / 20
     )
+
+
+def test_alternate_codes():
+    # Tokenization of Chinese works when you use other language codes
+    # that are not equal to 'zh'.
+    tokens = ['谢谢', '谢谢']
+
+    # Code with a region attached
+    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+
+    # Over-long codes for Chinese
+    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
+    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+
+    # Separate codes for Mandarin and Cantonese
+    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
+    eq_(tokenize('谢谢谢谢', 'yue'), tokens)


@@ -29,3 +29,12 @@ def test_catastrophes():
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
         ["m'", 'acabo', "d'", 'instal·lar', '.'])
+
+
+def test_alternate_codes():
+    # Try over-long language codes for French and Catalan
+    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
+    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
+        ['m', 'acabo', 'd', 'instal·lar'])


@@ -23,3 +23,11 @@ def test_actually_russian():
         ['sto', 'iz', 'sta', 'pacany'])
     eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+
+
+def test_alternate_codes():
+    # Try language codes for Serbo-Croatian that have been split, and now
+    # are canonically mapped to Serbian
+    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
+    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])


@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+import langcodes
 from .transliterate import serbian_cyrillic_to_latin

 mecab_tokenize = None
@@ -361,20 +362,18 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
-    # A really simple way to handle language codes with more than just the
-    # language
-    lang = lang.split('-')[0]
+    # Reduce whatever language code was passed in to a normal form,
+    # containing just the language subtag.
+    lang = langcodes.get(lang).prefer_macrolanguage().language
     if lang == 'ja' or lang == 'ko':
         result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh':
+    elif lang == 'zh' or lang == 'yue':
         result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
         result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
         result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
-        # These are the three language codes that could include Serbian text,
-        # which could be in Cyrillic.
+    elif lang == 'sr':
         result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
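
For context (not part of the diff): a minimal sketch of what the new normalization line does, using the same langcodes calls shown above. The helper name normalize_lang is hypothetical, and the expected outputs are assumptions inferred from the tests added in this commit rather than from langcodes documentation.

import langcodes


def normalize_lang(lang):
    # Hypothetical helper mirroring the new step in tokenize(): reduce any
    # language tag to its bare language subtag, preferring the macrolanguage.
    return langcodes.get(lang).prefer_macrolanguage().language


# Expected results, inferred from the new tests:
print(normalize_lang('zh-CN'))  # 'zh'  -- region subtag is dropped
print(normalize_lang('cmn'))    # 'zh'  -- Mandarin folds into the macrolanguage
print(normalize_lang('yue'))    # 'yue' -- Cantonese keeps its own code, hence
                                #          the new "or lang == 'yue'" branch
print(normalize_lang('fra'))    # 'fr'  -- over-long ISO 639 code
print(normalize_lang('sh'))     # 'sr'  -- Serbo-Croatian maps to Serbian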