wordfreq/wordfreq_builder/tests/test_tokenizer.py
Last commit 5a1fc00aaa by Rob Speer, 2015-08-25 12:41:48 -04:00: Strip apostrophes from edges of tokens

The issue: French text containing an elision apostrophe, such as "d'un", was split into "d'" and "un", but if "d'" were re-tokenized it would come out as "d". Stripping apostrophes from the edges of tokens makes tokenization closer to idempotent. (An illustrative test for this behavior is sketched after the existing tests below.)

from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
from nose.tools import eq_


def test_tokenizer_1():
    # Apostrophes inside words ("i'll", "won't") are kept; the dollar sign
    # and the surrounding quotation marks are dropped.
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        'this', 'is', 'a', 'test', 'she', 'said',
        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
        'it', "won't", 'fail',
    ]
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')


def test_tokenizer_2():
    # Punctuation run together with words still acts as a separator.
    text = "i use punctuation informally...see?like this."
    tokens = [
        'i', 'use', 'punctuation', 'informally', 'see',
        'like', 'this'
    ]
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')


def test_tokenizer_3():
    # Twitter @-handles are removed before tokenizing.
    text = "@ExampleHandle This parser removes twitter handles!"
    tokens = ['this', 'parser', 'removes', 'twitter', 'handles']
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')


def test_tokenizer_4():
    # URLs are removed before tokenizing.
    text = "This is a really boring example tco http://t.co/n15ASlkase"
    tokens = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco']
    result = cld2_surface_tokenizer(text)
    eq_(result[1], tokens)
    eq_(result[0], 'en')


def test_language_recognizer_1():
    text = "Il est le meilleur livre que je ai jamais lu"
    result = cld2_detect_language(text)
    eq_(result, 'fr')


def test_language_recognizer_2():
    text = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
    é uma nuvem esférica de planetesimais voláteis que se acredita
    localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""
    result = cld2_detect_language(text)
    eq_(result, 'pt')
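

# Illustrative sketch (not in the original file): a possible regression test
# for the apostrophe-stripping change described in the commit message above.
# It assumes cld2_surface_tokenizer now strips apostrophes from the edges of
# tokens, so the French elision "d'un" yields 'd' and 'un' rather than "d'",
# and that a second pass over its output leaves the tokens unchanged. The
# sentence and expected tokens are assumptions, not verified output.
def test_tokenizer_apostrophe_stripping():
    text = "Voici le texte d'un exemple assez long"
    lang, tokens = cld2_surface_tokenizer(text)
    # The elided article should come out without its trailing apostrophe.
    assert 'd' in tokens
    assert 'un' in tokens
    assert "d'" not in tokens
    # Re-tokenizing the joined output should not change the tokens
    # (the idempotence the commit message aims for).
    eq_(cld2_surface_tokenizer(' '.join(tokens))[1], tokens)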