mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-26 10:28:52 +00:00
5a1fc00aaa
The issue here is that if you had French text with an apostrophe, such as "d'un", it would split it into "d'" and "un", but if "d'" were re-tokenized it would come out as "d". Stripping apostrophes makes the process more idempotent.
52 lines
1.7 KiB
Python
52 lines
1.7 KiB
Python
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
|
|
from nose.tools import eq_
|
|
|
|
|
|
def test_tokenizer_1():
    """Tokenize an English sentence with quotes, contractions and a price.

    The expected tokens are all lowercase, with the quotation marks, commas,
    final period and dollar sign gone.  "I'll" and "won't" keep their inner
    apostrophe, whereas "y'all" is expected to come out as the two separate
    tokens "y" and "all".
    """
    sentence = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    expected = [
        'this', 'is', 'a', 'test',
        'she', 'said', 'and', "i'll",
        'bet', "y", "all", '3.50',
        'that', 'it', "won't", 'fail',
    ]

    outcome = cld2_surface_tokenizer(sentence)

    # The test reads the token list from index 1 and the language code from
    # index 0 of the returned value; tokens are checked first.
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
|
|
|
|
def test_tokenizer_2():
    """Tokenize text where punctuation, not whitespace, separates words.

    "informally...see?like" contains no spaces, yet the expected result
    lists "informally", "see" and "like" as separate tokens.
    """
    sloppy_text = "i use punctuation informally...see?like this."
    expected = ['i', 'use', 'punctuation', 'informally', 'see', 'like', 'this']

    outcome = cld2_surface_tokenizer(sloppy_text)

    # Tokens (index 1) are verified before the language code (index 0).
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
|
|
|
|
def test_tokenizer_3():
    """Check that a leading Twitter handle does not appear in the tokens.

    The input starts with "@ExampleHandle"; the expected token list contains
    only the lowercased words of the sentence that follows it.
    """
    tweet = "@ExampleHandle This parser removes twitter handles!"
    expected = [
        'this',
        'parser',
        'removes',
        'twitter',
        'handles',
    ]

    outcome = cld2_surface_tokenizer(tweet)

    # Tokens (index 1) are verified before the language code (index 0).
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
|
|
|
|
def test_tokenizer_4():
    """Check that a t.co link is left out while the plain word "tco" stays.

    The input ends with "tco http://t.co/n15ASlkase"; the expected tokens end
    with "tco" and contain nothing derived from the URL.
    """
    tweet = "This is a really boring example tco http://t.co/n15ASlkase"
    expected = [
        'this', 'is', 'a',
        'really', 'boring', 'example',
        'tco',
    ]

    outcome = cld2_surface_tokenizer(tweet)

    # Tokens (index 1) are verified before the language code (index 0).
    eq_(outcome[1], expected)
    eq_(outcome[0], 'en')
|
|
|
|
|
|
def test_language_recognizer_1():
    """Expect a short French sentence to be detected as 'fr'."""
    eq_(
        cld2_detect_language("Il est le meilleur livre que je ai jamais lu"),
        'fr',
    )
|
|
|
|
def test_language_recognizer_2():
    """Expect a multi-line Portuguese passage to be detected as 'pt'.

    The sample spans three lines and includes accented characters
    (e.g. "também", "Öpik", "é", "voláteis").
    """
    # The continuation lines are part of the string literal, so they are left
    # exactly as written rather than re-indented.
    passage = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
é uma nuvem esférica de planetesimais voláteis que se acredita
localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""

    detected = cld2_detect_language(passage)
    eq_(detected, 'pt')
|