# wordfreq/wordfreq_builder/tests/test_tokenizer.py
#
# Tests for the tokenizer and language-detection helpers imported from
# wordfreq_builder.tokenizers.

from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
from nose.tools import eq_
def test_tokenizer_1():
    # Quoted English dialogue with contractions and a dollar amount.
    # The expected tokens are lowercased, keep the apostrophes inside
    # contractions, and split "$3.50" into the two digit runs '3' and '50'.
    sentence = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    expected_tokens = [
        'this',
        'is',
        'a',
        'test',
        'she',
        'said',
        'and',
        "i'll",
        'bet',
        "y'all",
        '3',
        '50',
        'that',
        'it',
        "won't",
        'fail',
    ]

    outcome = cld2_surface_tokenizer(sentence)

    # Index 1 holds the token list, index 0 the detected language code.
    eq_(outcome[1], expected_tokens)
    eq_(outcome[0], 'en')
def test_tokenizer_2():
    # Informal punctuation with no surrounding spaces ("...", "?") must
    # still separate the neighbouring words into distinct tokens.
    sentence = "i use punctuation informally...see?like this."
    expected_tokens = [
        'i',
        'use',
        'punctuation',
        'informally',
        'see',
        'like',
        'this',
    ]

    outcome = cld2_surface_tokenizer(sentence)

    # Index 1 holds the token list, index 0 the detected language code.
    eq_(outcome[1], expected_tokens)
    eq_(outcome[0], 'en')
def test_tokenizer_3():
    # A leading "@ExampleHandle" mention is expected to be absent from the
    # token list; the remaining words come back lowercased.
    sentence = "@ExampleHandle This parser removes twitter handles!"
    expected_tokens = [
        'this',
        'parser',
        'removes',
        'twitter',
        'handles',
    ]

    outcome = cld2_surface_tokenizer(sentence)

    # Index 1 holds the token list, index 0 the detected language code.
    eq_(outcome[1], expected_tokens)
    eq_(outcome[0], 'en')
def test_tokenizer_4():
    # The trailing http://t.co/... link is expected to contribute no tokens,
    # while the plain word "tco" that precedes it is kept.
    sentence = "This is a really boring example tco http://t.co/n15ASlkase"
    expected_tokens = [
        'this',
        'is',
        'a',
        'really',
        'boring',
        'example',
        'tco',
    ]

    outcome = cld2_surface_tokenizer(sentence)

    # Index 1 holds the token list, index 0 the detected language code.
    eq_(outcome[1], expected_tokens)
    eq_(outcome[0], 'en')
def test_language_recognizer_1():
    # A short French sentence should be reported with the code 'fr'.
    sample = "Il est le meilleur livre que je ai jamais lu"
    eq_(cld2_detect_language(sample), 'fr')
def test_language_recognizer_2():
    # A multi-line Portuguese passage (with accented characters) should be
    # reported with the code 'pt'.
    sample = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
é uma nuvem esférica de planetesimais voláteis que se acredita
localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""
    eq_(cld2_detect_language(sample), 'pt')