# wordfreq/tests/test.py

from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
    top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
)


def test_freq_examples():
    # A stopword should be more frequent in its own language than a stopword
    # borrowed from the other one: 'the' wins in English, 'de' in Spanish.
    for lang, native, foreign in (('en', 'the', 'de'), ('es', 'de', 'the')):
        native_freq = word_frequency(native, lang)
        foreign_freq = word_frequency(foreign, lang)
        assert_greater(native_freq, foreign_freq)


def test_languages():
    # Guard against the number of available languages shrinking.
    languages = available_languages()
    assert_greater(len(languages), 15)

    # Laughter is the universal language: '笑' is looked up for Chinese and
    # Japanese, 'lol' for everything else, and the frequency must be non-zero.
    for code in languages:
        laugh = '笑' if code in {'zh', 'ja'} else 'lol'
        assert_greater(word_frequency(laugh, code), 0)

        # An upper-cased code padded with extra subtags must still yield a
        # non-zero frequency for the same word.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        assert_greater(word_frequency(laugh, verbose_code), 0)


def test_twitter():
    # Guard against the number of languages available for 'twitter' shrinking.
    twitter_langs = available_languages('twitter')
    assert_greater(len(twitter_langs), 14)

    # 'rt' should be more frequent in the 'twitter' list than in the
    # 'combined' list, for every such language.
    for code in twitter_langs:
        in_twitter = word_frequency('rt', code, 'twitter')
        in_combined = word_frequency('rt', code, 'combined')
        assert_greater(in_twitter, in_combined)


def test_minimums():
    # 'esquivalience' is expected to have a frequency of 0 in English, and to
    # come back as `minimum` when one is supplied.
    rare_word = 'esquivalience'
    eq_(word_frequency(rare_word, 'en'), 0)

    floor = 1e-6
    eq_(word_frequency(rare_word, 'en', minimum=floor), floor)

    # With minimum=1, even 'the' is expected to come back as exactly 1.
    eq_(word_frequency('the', 'en', minimum=1), 1)

def test_most_common_words():
    # Pin the single most common word of each well-supported language, so
    # that anything which changes the top of a wordlist gets noticed.
    expected_top_words = [
        ('ar', 'في'),
        ('de', 'die'),
        ('en', 'the'),
        ('es', 'de'),
        ('fr', 'de'),
        ('it', 'di'),
        ('ja', 'の'),
        ('nl', 'de'),
        ('pt', 'de'),
        ('ru', 'в'),
        ('tr', 'bir'),
        ('zh', '的'),
    ]

    for lang, expected in expected_top_words:
        # Ask for a top-1 list and compare its only requested entry.
        most_common = top_n_list(lang, 1)[0]
        eq_(most_common, expected)


def test_language_matching():
    # Each of these regional, script and related language codes is expected
    # to give '的' exactly the frequency it has under plain 'zh'.
    baseline = word_frequency('的', 'zh')
    for code in ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn'):
        eq_(word_frequency('的', code), baseline)


def test_cB_conversion():
    # 0 cB must convert to a frequency of exactly 1.
    eq_(cB_to_freq(0), 1.0)

    # -100 cB and -600 cB are expected to be (approximately) 0.1 and 1e-6.
    for centibels, expected in ((-100, 0.1), (-600, 1e-6)):
        assert_almost_equal(cB_to_freq(centibels), expected)


@raises(ValueError)
def test_failed_cB_conversion():
    # Converting a positive centibel value is expected to raise ValueError;
    # the @raises decorator fails the test if it does not.
    cB_to_freq(1)


def test_tokenization():
    # Apostrophes within a word are preserved, so "don't" is a single token.
    # Punctuation tokens only appear when include_punctuation is set.
    sentence = "I don't split at apostrophes, you see."
    eq_(
        tokenize(sentence, 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
    )
    eq_(
        tokenize(sentence, 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
    )

    # Certain punctuation, such as the period in a domain name, does not
    # inherently split a word.
    eq_(
        tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com']
    )

    # Splits occur after symbols (here an emoji) and at splitting
    # punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    # With punctuation included, '...' and ':)' each come back as one token.
    eq_(
        tokenize('this text has... punctuation :)', 'en',
                 include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)']
    )


def test_casefolding():
    # Each case is (input text, language, expected single token).
    cases = [
        # German: upper-case 'WEISS' and 'weiß' both end up as 'weiss'.
        ('WEISS', 'de', 'weiss'),
        ('weiß', 'de', 'weiss'),
        # Turkish: dotted capital 'İ' becomes 'i', while plain capital 'I'
        # becomes dotless 'ı'.
        ('İstanbul', 'tr', 'istanbul'),
        ('SIKISINCA', 'tr', 'sıkısınca'),
    ]
    for text, lang, folded in cases:
        eq_(tokenize(text, lang), [folded])


def test_phrase_freq():
    # The hyphenated phrase must have a non-zero frequency of its own.
    phrase_freq = word_frequency("flip-flop", 'en')
    assert_greater(phrase_freq, 0)

    # The reciprocal of the phrase frequency is expected to equal the sum of
    # the reciprocals of the frequencies of 'flip' and 'flop'.
    inverse_of_phrase = 1.0 / phrase_freq
    inverse_of_parts = (
        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )
    assert_almost_equal(inverse_of_phrase, inverse_of_parts)


def test_not_really_random():
    # With bits_per_word=0 the English result is expected to be the same
    # word four times over -- not a password anyone should use.
    english_words = random_words(nwords=4, lang='en', bits_per_word=0)
    eq_(english_words, 'the the the the')

    # This exercises random_ascii_words and also makes sure 'eos' did not
    # end up as a very common Japanese word.
    japanese_ascii_words = random_ascii_words(
        nwords=4, lang='ja', bits_per_word=0
    )
    eq_(japanese_ascii_words, 'rt rt rt rt')


@raises(ValueError)
def test_not_enough_ascii():
    # Asking for random ASCII words in Chinese is expected to raise
    # ValueError; the @raises decorator fails the test if it does not.
    random_ascii_words(lang='zh')


def test_ar():
    # Tatweels are removed from the token.
    with_tatweels = 'متــــــــعب'
    eq_(tokenize(with_tatweels, 'ar'), ['متعب'])

    # Combining marks are removed from the token.
    with_marks = 'حَرَكَات'
    eq_(tokenize(with_marks, 'ar'), ['حركات'])

    # U+FEFB is an Arabic ligature that is affected by NFKC normalization:
    # it is expected to come back as U+0644 followed by U+0627.
    ligature = '\ufefb'
    eq_(tokenize(ligature, 'ar'), ['\u0644\u0627'])


def test_ideographic_fallback():
    # Chinese text tokenized as English should remain stuck together as a
    # single token.
    chinese_tokens = tokenize('中国文字', 'en')
    eq_(chinese_tokens, ['中国文字'])

    # Japanese tagged with the wrong language is expected to be split at
    # script boundaries: hiragana, then katakana, then Latin letters.
    mislabeled_japanese = 'ひらがなカタカナromaji'
    expected_runs = ['ひらがな', 'カタカナ', 'romaji']
    eq_(tokenize(mislabeled_japanese, 'en'), expected_runs)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								from wordfreq import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    word_frequency, available_languages, cB_to_freq,
-												Express the combining of word frequencies in an explicitly associative and commutative way.


Former-commit-id: 32b4033d6399f10e10dd3f1c9194847a7f01f302
											
										
										
											2015-07-09 19:26:54 +00:00
+								    top_n_list, random_words, random_ascii_words, tokenize
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								from nose.tools import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    eq_, assert_almost_equal, assert_greater, raises
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								def test_freq_examples():
 								    # Stopwords are most common in the correct language
 								    assert_greater(word_frequency('the', 'en'),
 								                   word_frequency('de', 'en'))
 								    assert_greater(word_frequency('de', 'es'),
 								                   word_frequency('the', 'es'))
 								def test_languages():
 								    # Make sure the number of available languages doesn't decrease
 								    avail = available_languages()
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    assert_greater(len(avail), 15)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    # Laughter is the universal language. Look up either 'lol' or '笑' in each
 								    # language and make sure it has a non-zero frequency.
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    for lang in avail:
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        if lang in {'zh', 'ja'}:
 								            text = '笑'
 								        else:
 								            text = 'lol'
 								        assert_greater(word_frequency(text, lang), 0)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        # Make up a weirdly verbose language code and make sure
 								        # we still get it
 								        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
 								        assert_greater(word_frequency(text, new_lang_code), 0)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
+								def test_twitter():
 								    avail = available_languages('twitter')
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    assert_greater(len(avail), 14)
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
 								    for lang in avail:
 								        assert_greater(word_frequency('rt', lang, 'twitter'),
 								                       word_frequency('rt', lang, 'combined'))
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								def test_minimums():
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(word_frequency('esquivalience', 'en'), 0)
-												changed default to minimum for word_frequency


Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e
											
										
										
											2015-07-07 19:03:26 +00:00
+								    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								    eq_(word_frequency('the', 'en', minimum=1), 1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_most_common_words():
 								    # If something causes the most common words in well-supported languages to
 								    # change, we should know.
 								    def get_most_common(lang):
 								        """
 								        Return the single most common word in the language.
 								        """
 								        return top_n_list(lang, 1)[0]
 								    eq_(get_most_common('ar'), 'في')
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								    eq_(get_most_common('de'), 'die')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('en'), 'the')
 								    eq_(get_most_common('es'), 'de')
 								    eq_(get_most_common('fr'), 'de')
 								    eq_(get_most_common('it'), 'di')
 								    eq_(get_most_common('ja'), 'の')
 								    eq_(get_most_common('nl'), 'de')
 								    eq_(get_most_common('pt'), 'de')
 								    eq_(get_most_common('ru'), 'в')
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(get_most_common('tr'), 'bir')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('zh'), '的')
 								def test_language_matching():
 								    freq = word_frequency('的', 'zh')
 								    eq_(word_frequency('的', 'zh-TW'), freq)
 								    eq_(word_frequency('的', 'zh-CN'), freq)
 								    eq_(word_frequency('的', 'zh-Hant'), freq)
 								    eq_(word_frequency('的', 'zh-Hans'), freq)
 								    eq_(word_frequency('的', 'yue-HK'), freq)
 								    eq_(word_frequency('的', 'cmn'), freq)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_cB_conversion():
 								    eq_(cB_to_freq(0), 1.)
 								    assert_almost_equal(cB_to_freq(-100), 0.1)
 								    assert_almost_equal(cB_to_freq(-600), 1e-6)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_failed_cB_conversion():
 								    cB_to_freq(1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_tokenization():
 								    # We preserve apostrophes within words, so "can't" is a single word in the
-												updated comments


Former-commit-id: 131b916c579b8c45db0444a6eaffe51ef419039b
											
										
										
											2015-07-17 18:50:12 +00:00
+								    # data
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
 								        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
 								    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
 								        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    # Certain punctuation does not inherently split a word.
 								    eq_(tokenize("Anything is possible at zombo.com", 'en'),
 								        ['anything', 'is', 'possible', 'at', 'zombo.com'])
 								    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
 								        ['this', 'text', 'has', '...', 'punctuation', ':)'])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
 								def test_casefolding():
 								    eq_(tokenize('WEISS', 'de'), ['weiss'])
 								    eq_(tokenize('weiß', 'de'), ['weiss'])
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
 								    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								def test_phrase_freq():
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    ff = word_frequency("flip-flop", 'en')
 								    assert_greater(ff, 0)
-												updated tests


Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69
											
										
										
											2015-07-07 18:13:28 +00:00
+								    assert_almost_equal(
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+.0 / ff,
 .0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-												Express the combining of word frequencies in an explicitly associative and commutative way.


Former-commit-id: 32b4033d6399f10e10dd3f1c9194847a7f01f302
											
										
										
											2015-07-09 19:26:54 +00:00
+								    )
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_not_really_random():
 								    # If your xkcd-style password comes out like this, maybe you shouldn't
 								    # use it
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								        'the the the the')
 								    # This not only tests random_ascii_words, it makes sure we didn't end
 								    # up with 'eos' as a very common Japanese word
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-												update data using new build


Former-commit-id: f9a9ee7a82fb122124aec58a4fbf14cccaf27c35
											
										
										
											2015-07-01 15:18:39 +00:00
+								        'rt rt rt rt')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
 								def test_not_enough_ascii():
 								    random_ascii_words(lang='zh')
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								def test_ar():
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove tatweels
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('متــــــــعب', 'ar'),
 								        ['متعب']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove combining marks
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('حَرَكَات', 'ar'),
 								        ['حركات']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
 								    eq_(
-												Document the NFKC-normalized ligature in the Arabic test.


Former-commit-id: 41e1dd41d82358fd44f972e501c8586d0bbd64a2
											
										
										
											2015-08-03 15:09:44 +00:00
+								        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
 								        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    )
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								def test_ideographic_fallback():
-												tokenize Chinese using jieba and our own frequencies


Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628
											
										
										
											2015-09-05 07:16:56 +00:00
+								    # Try tokenizing Chinese text as English -- it should remain stuck together.
 								    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								    # When Japanese is tagged with the wrong language, it will be split
 								    # at script boundaries.
 								    ja_text = 'ひらがなカタカナromaji'
 								    eq_(
 								        tokenize(ja_text, 'en'),
 								        ['ひらがな', 'カタカナ', 'romaji']
 								    )