# wordfreq/wordfreq_builder/tests/test_tokenizer.py

from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language
from nose.tools import eq_


def test_tokenizer_1():
    """Tokenize quoted English speech with contractions and a price.

    The expected tokens are lowercase, keep the apostrophes inside
    "i'll", "y'all" and "won't", contain no other punctuation, and have
    "$3.50" split into '3' and '50'. The reported language must be 'en'.
    """
    sentence = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    # Whitespace-split so the expectation reads like the sentence itself.
    expected_tokens = (
        "this is a test she said "
        "and i'll bet y'all 3 50 that "
        "it won't fail"
    ).split()

    outcome = cld2_surface_tokenizer(sentence)

    # Index 1 holds the token list, index 0 the language code.
    found_tokens = outcome[1]
    eq_(found_tokens, expected_tokens)
    found_language = outcome[0]
    eq_(found_language, 'en')

def test_tokenizer_2():
    """Tokenize text whose punctuation is not followed by spaces.

    "informally...see?like" is expected to come back as three separate
    tokens, and the reported language must be 'en'.
    """
    sentence = "i use punctuation informally...see?like this."
    expected_tokens = "i use punctuation informally see like this".split()

    outcome = cld2_surface_tokenizer(sentence)

    # Tokens are checked before the language, in that order.
    found_tokens = outcome[1]
    eq_(found_tokens, expected_tokens)
    found_language = outcome[0]
    eq_(found_language, 'en')

def test_tokenizer_3():
    """Tokenize text that starts with an @-handle.

    The expected tokens do not include "@ExampleHandle"; only the words
    after it remain, lowercased. The reported language must be 'en'.
    """
    tweet = "@ExampleHandle This parser removes twitter handles!"
    expected_tokens = [
        'this',
        'parser',
        'removes',
        'twitter',
        'handles',
    ]

    outcome = cld2_surface_tokenizer(tweet)

    eq_(outcome[1], expected_tokens)  # token list
    eq_(outcome[0], 'en')             # language code

def test_tokenizer_4():
    """Tokenize text that ends with a t.co URL.

    The expected tokens keep the plain word "tco" but contain nothing
    from "http://t.co/n15ASlkase". The reported language must be 'en'.
    """
    tweet = "This is a really boring example tco http://t.co/n15ASlkase"
    expected_tokens = [
        'this', 'is', 'a',
        'really', 'boring', 'example',
        'tco',
    ]

    outcome = cld2_surface_tokenizer(tweet)

    found_tokens = outcome[1]
    eq_(found_tokens, expected_tokens)
    found_language = outcome[0]
    eq_(found_language, 'en')


def test_language_recognizer_1():
    """Detect the language of a single short sentence; 'fr' is expected."""
    sentence = "Il est le meilleur livre que je ai jamais lu"
    detected = cld2_detect_language(sentence)
    eq_(detected, 'fr')

def test_language_recognizer_2():
    """Detect the language of a multi-line passage; 'pt' is expected.

    The passage contains accented characters and embedded newlines with
    leading indentation, all of which are passed to the detector as-is.
    """
    # Keep this literal exactly as written: its line breaks and leading
    # spaces are part of the input handed to the detector.
    passage = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,
    é uma nuvem esférica de planetesimais voláteis que se acredita
    localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""
    detected = cld2_detect_language(passage)
    eq_(detected, 'pt')
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language`
Initial commit 2015-02-05 01:19:36 +00:00			`from nose.tools import eq_`


			`def test_tokenizer_1():`
updated test to check number parsing 2015-06-17 15:30:25 +00:00			`text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'`
Initial commit 2015-02-05 01:19:36 +00:00			`tokens = [`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`'this', 'is', 'a', 'test', 'she', 'said',`
updated test to check number parsing 2015-06-17 15:30:25 +00:00			`'and', "i'll", 'bet', "y'all", '3', '50', 'that',`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`'it', "won't", 'fail',`
Initial commit 2015-02-05 01:19:36 +00:00			`]`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`result = cld2_surface_tokenizer(text)`
			`eq_(result[1], tokens)`
			`eq_(result[0], 'en')`
Initial commit 2015-02-05 01:19:36 +00:00
			`def test_tokenizer_2():`
			`text = "i use punctuation informally...see?like this."`
			`tokens = [`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`'i', 'use', 'punctuation', 'informally', 'see',`
			`'like', 'this'`
Initial commit 2015-02-05 01:19:36 +00:00			`]`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`result = cld2_surface_tokenizer(text)`
			`eq_(result[1], tokens)`
			`eq_(result[0], 'en')`

			`def test_tokenizer_3():`
			`text = "@ExampleHandle This parser removes twitter handles!"`
			`tokens = ['this', 'parser', 'removes', 'twitter', 'handles']`
			`result = cld2_surface_tokenizer(text)`
			`eq_(result[1], tokens)`
			`eq_(result[0], 'en')`

			`def test_tokenizer_4():`
changed tokenizer to only strip t.co urls 2015-06-16 20:11:31 +00:00			`text = "This is a really boring example tco http://t.co/n15ASlkase"`
			`tokens = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco']`
added tests for the tokenizer and language recognizer 2015-06-16 20:00:14 +00:00			`result = cld2_surface_tokenizer(text)`
			`eq_(result[1], tokens)`
			`eq_(result[0], 'en')`


			`def test_language_recognizer_1():`
			`text = "Il est le meilleur livre que je ai jamais lu"`
			`result = cld2_detect_language(text)`
			`eq_(result, 'fr')`

			`def test_language_recognizer_2():`
			`text = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort,`
			`é uma nuvem esférica de planetesimais voláteis que se acredita`
			`localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol."""`
			`result = cld2_detect_language(text)`
			`eq_(result, 'pt')`