Merge pull request #57 from LuminosoInsight/version2.1

Version 2.1
Lance Nathan 2018-06-18 12:06:47 -04:00 committed by GitHub
commit a95b360563
61 changed files with 36320 additions and 25967 deletions

View File

@ -14,7 +14,7 @@ those cases we want to detect only the most obvious token boundaries.
In this situation, we no longer try to detect script changes, such as between
kanji and katakana, as token boundaries. This particularly allows us to keep
-together Japanese words where ヶ appears betwen kanji, as well as words that
+together Japanese words where ヶ appears between kanji, as well as words that
use the iteration mark 々.
This change does not affect any word frequencies. (The Japanese word list uses
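The tests added in this commit exercise exactly this behavior. As a quick illustration (a sketch using wordfreq's public simple_tokenize function, not part of the diff itself):

from wordfreq import simple_tokenize

# The possessive marker ヶ and the iteration mark 々 no longer split words:
print(simple_tokenize('犬ヶ島'))        # ['犬ヶ島']
print(simple_tokenize('晴々しい'))      # ['晴々しい']

# Explicit word separators such as the middle dot are still token boundaries:
print(simple_tokenize('トナー・カートリッジ'))   # ['トナー', 'カートリッジ']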

View File

@ -174,13 +174,13 @@ least 3 different sources of word frequencies:
Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
Bulgarian bg 3 - │ Yes Yes - - - Yes - -
Catalan ca 4 - │ Yes Yes Yes - - Yes - -
-Chinese zh [3] 6 Yes │ Yes - Yes Yes Yes Yes - Jieba
+Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
Croatian hr [1] 3 - │ Yes Yes - - - Yes - -
-Czech cs 3 - │ Yes Yes - - - Yes - -
+Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
Danish da 3 - │ Yes Yes - - - Yes - -
-Dutch nl 4 Yes │ Yes Yes Yes - - Yes - -
+Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
-Finnish fi 5 Yes │ Yes Yes Yes - - Yes Yes -
+Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Greek el 3 - │ Yes Yes - - Yes - - -
@ -191,13 +191,14 @@ least 3 different sources of word frequencies:
Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
Korean ko 4 - │ Yes Yes - - - Yes Yes -
Latvian lv 4 - │ Yes Yes - - Yes Yes - -
Macedonian mk 3 - │ Yes Yes Yes - - - - -
Malay ms 3 - │ Yes Yes - - - Yes - -
Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
Persian fa 3 - │ Yes Yes - - - Yes - -
-Polish pl 5 Yes │ Yes Yes Yes - - Yes Yes -
+Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
-Romanian ro 3 - │ Yes Yes - - - Yes - -
+Romanian ro 4 - │ Yes Yes - - Yes Yes - -
Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
@ -219,7 +220,7 @@ Chinese, with primarily Mandarin Chinese vocabulary. See "Multi-script
languages" below.
Some languages provide 'large' wordlists, including words with a Zipf frequency
-between 1.0 and 3.0. These are available in 13 languages that are covered by
+between 1.0 and 3.0. These are available in 14 languages that are covered by
enough data sources.
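As a rough check of the table and the note above (a sketch against wordfreq's public API; available_languages, word_frequency, and zipf_frequency are all exported by the package, and the expected values follow the README and the tests in this commit):

from wordfreq import available_languages, word_frequency, zipf_frequency

# 14 languages now ship a 'large' wordlist.
print(len(available_languages('large')))          # 14

# Rare words such as 'infrequency' get a nonzero frequency from the
# 'large' English list, in the Zipf 1.0-3.0 range described above.
print(word_frequency('infrequency', 'en') > 0)    # True
print(zipf_frequency('infrequency', 'en'))        # expected to fall between 1.0 and 3.0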

pytest.ini Normal file
View File

@ -0,0 +1,2 @@
[pytest]
addopts = --doctest-modules
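The --doctest-modules option tells pytest to collect doctests from the package's Python modules in addition to the regular test functions. A minimal sketch of the kind of docstring example it would pick up (the function name here is hypothetical, not taken from the wordfreq source):

def frequency_doctest_example():
    """
    Doctests like this one are collected and run by `pytest --doctest-modules`.

    >>> from wordfreq import word_frequency
    >>> word_frequency('the', 'en') > word_frequency('infrequency', 'en')
    True
    """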

View File

@ -1,5 +1,2 @@
-[nosetests]
-verbosity=2
-with-doctest=1
-with-coverage=0
-cover-package=wordfreq
+[aliases]
+test=pytest
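The new [aliases] section routes the setuptools test command to pytest instead of nose. A rough Python-level equivalent, for illustration only (in practice you would simply run pytest from the project root):

import sys
import pytest

# Run the test suite with the doctest behaviour configured in pytest.ini.
sys.exit(pytest.main(['--doctest-modules']))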

View File

@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
-version='2.0.1',
+version='2.1.0',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@ -60,5 +60,5 @@ setup(
'mecab': 'mecab-python3',
'jieba': 'jieba'
},
-tests_require=['mecab-python3', 'jieba'],
+tests_require=['pytest', 'mecab-python3', 'jieba'],
)
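The 'mecab' and 'jieba' extras install the optional tokenizers for Japanese and Chinese, and pytest is now required to run the tests. A sketch of what the extras enable, using calls that appear in the test files of this commit (mecab-python3 and jieba must be installed for these calls to work):

from wordfreq import tokenize, word_frequency

# Japanese tokenization goes through MeCab (the 'mecab' extra):
print(tokenize('おはようございます', 'ja'))    # ['おはよう', 'ござい', 'ます']

# Chinese tokenization uses Jieba (the 'jieba' extra):
print(tokenize('谢谢谢谢', 'zh'))              # ['谢谢', '谢谢']
print(word_frequency('谢谢', 'zh') > 0)        # True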

View File

@ -1,235 +0,0 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
)
def test_freq_examples():
# Stopwords are most common in the correct language
assert_greater(word_frequency('the', 'en'),
word_frequency('de', 'en'))
assert_greater(word_frequency('de', 'es'),
word_frequency('the', 'es'))
# We get word frequencies from the 'large' list when available
assert_greater(word_frequency('infrequency', 'en'), 0.)
def test_languages():
# Make sure we get all the languages when looking for the default
# 'best' wordlist
avail = available_languages()
assert_greater(len(avail), 32)
# 'small' covers the same languages, but with some different lists
avail_small = available_languages('small')
eq_(len(avail_small), len(avail))
assert_not_equal(avail_small, avail)
# 'combined' is the same as 'small'
avail_old_name = available_languages('combined')
eq_(avail_old_name, avail_small)
# 'large' covers fewer languages
avail_large = available_languages('large')
assert_greater(len(avail_large), 12)
assert_greater(len(avail), len(avail_large))
# Look up the digit '2' in the main word list for each language
for lang in avail:
assert_greater(word_frequency('2', lang), 0, lang)
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
def test_minimums():
eq_(word_frequency('esquivalience', 'en'), 0)
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
eq_(word_frequency('the', 'en', minimum=1), 1)
def test_most_common_words():
# If something causes the most common words in well-supported languages to
# change, we should know.
def get_most_common(lang):
"""
Return the single most common word in the language.
"""
return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'في')
eq_(get_most_common('de'), 'die')
eq_(get_most_common('en'), 'the')
eq_(get_most_common('es'), 'de')
eq_(get_most_common('fr'), 'de')
eq_(get_most_common('it'), 'di')
eq_(get_most_common('ja'), 'の')
eq_(get_most_common('nl'), 'de')
eq_(get_most_common('pl'), 'w')
eq_(get_most_common('pt'), 'de')
eq_(get_most_common('ru'), 'в')
eq_(get_most_common('tr'), 'bir')
eq_(get_most_common('zh'), '的')
def test_language_matching():
freq = word_frequency('谢谢', 'zh')
eq_(word_frequency('谢谢', 'zh-TW'), freq)
eq_(word_frequency('谢谢', 'zh-CN'), freq)
eq_(word_frequency('谢谢', 'zh-Hant'), freq)
eq_(word_frequency('谢谢', 'zh-Hans'), freq)
eq_(word_frequency('谢谢', 'yue-HK'), freq)
eq_(word_frequency('谢谢', 'cmn'), freq)
def test_cB_conversion():
eq_(cB_to_freq(0), 1.)
assert_almost_equal(cB_to_freq(-100), 0.1)
assert_almost_equal(cB_to_freq(-600), 1e-6)
@raises(ValueError)
def test_failed_cB_conversion():
cB_to_freq(1)
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
# Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'),
['anything', 'is', 'possible', 'at', 'zombo.com'])
# Splits occur after symbols, and at splitting punctuation such as hyphens.
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
['this', 'text', 'has', '...', 'punctuation', ':)'])
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date
eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀'])
# Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0)
eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
def test_casefolding():
eq_(tokenize('WEISS', 'de'), ['weiss'])
eq_(tokenize('weiß', 'de'), ['weiss'])
eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
def test_number_smashing():
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
['715', 'crσσks', 'by', 'bon', 'iver'])
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
['000', 'crσσks', 'by', 'bon', 'iver'])
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
eq_(lossy_tokenize('1', 'en'), ['1'])
eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
eq_(lossy_tokenize('24601', 'en'), ['00000'])
eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0)
assert_almost_equal(
1.0 / ff,
1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
)
def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't
# use it
eq_(random_words(nwords=4, lang='en', bits_per_word=0),
'the the the the')
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'1 1 1 1')
@raises(ValueError)
def test_not_enough_ascii():
random_ascii_words(lang='zh', bits_per_word=14)
def test_arabic():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
['متعب']
)
# Remove combining marks
eq_(
tokenize('حَرَكَات', 'ar'),
['حركات']
)
eq_(
tokenize('\ufefb', 'ar'), # An Arabic ligature...
['\u0644\u0627'] # ...that is affected by NFKC normalization
)
def test_ideographic_fallback():
# Try tokenizing Chinese text as English -- it should remain stuck together.
#
# More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py.
eq_(tokenize('中国文字', 'en'), ['中国文字'])
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
['การเล่นดนตรี', 'means', 'playing', 'music'])
# Test Khmer, a script similar to Thai
eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
# Test Hindi -- tokens split where there are spaces, and not where there aren't
eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
# Remove vowel points in Hebrew
eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
# Deal with commas, cedillas, and I's in Turkish
eq_(tokenize('kișinin', 'tr'), ['kişinin'])
eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
# Deal with cedillas that should be commas-below in Romanian
eq_(tokenize('acelaşi', 'ro'), ['același'])
eq_(tokenize('ACELAŞI', 'ro'), ['același'])

View File

@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq import tokenize, word_frequency
import pytest
def test_tokens():
@ -17,64 +17,49 @@ def test_tokens():
# His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token.
eq_(
tokenize(hobart, 'zh'),
['加', '勒', '特', '霍', '巴特']
)
assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
eq_(
tokenize(fact_simplified, 'zh'),
[
# he / is / history / in / #6 / counter for people
'他', '是', '历史', '上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)
assert tokenize(fact_simplified, 'zh') == [
# he / is / history / in / #6 / counter for people
'他', '是', '历史', '上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
# Jieba's original tokenizer knows a lot of names, it seems.
eq_(
tokenize(hobart, 'zh', external_wordlist=True),
['加勒特', '霍巴特']
)
assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
# We get almost the same tokens from the sentence using Jieba's own
# wordlist, but it tokenizes "in history" as two words and
# "sixth person" as one.
eq_(
tokenize(fact_simplified, 'zh', external_wordlist=True),
[
# he / is / history / in / sixth person
'他', '是', '历史', '上', '第六位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)
assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
# he / is / history / in / sixth person
'他', '是', '历史', '上', '第六位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
# Check that Traditional Chinese works at all
assert_greater(word_frequency(fact_traditional, 'zh'), 0)
assert word_frequency(fact_traditional, 'zh') > 0
# You get the same token lengths if you look it up in Traditional Chinese,
# but the words are different
simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
eq_(''.join(simp_tokens), fact_simplified)
eq_(''.join(trad_tokens), fact_traditional)
assert ''.join(simp_tokens) == fact_simplified
assert ''.join(trad_tokens) == fact_traditional
simp_lengths = [len(token) for token in simp_tokens]
trad_lengths = [len(token) for token in trad_tokens]
eq_(simp_lengths, trad_lengths)
assert simp_lengths == trad_lengths
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert_almost_equal(
word_frequency('谢谢谢谢', 'zh'),
xiexie_freq / 20
)
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
def test_alternate_codes():
@ -83,12 +68,12 @@ def test_alternate_codes():
tokens = ['谢谢', '谢谢']
# Code with a region attached
eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
assert tokenize('谢谢谢谢', 'zh-CN') == tokens
# Over-long codes for Chinese
eq_(tokenize('谢谢谢谢', 'chi'), tokens)
eq_(tokenize('谢谢谢谢', 'zho'), tokens)
assert tokenize('谢谢谢谢', 'chi') == tokens
assert tokenize('谢谢谢谢', 'zho') == tokens
# Separate codes for Mandarin and Cantonese
eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
eq_(tokenize('谢谢谢谢', 'yue'), tokens)
assert tokenize('谢谢谢谢', 'cmn') == tokens
assert tokenize('谢谢谢谢', 'yue') == tokens

View File

@ -1,40 +1,32 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
def test_apostrophes():
# Test that we handle apostrophes in French reasonably.
eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
eq_(tokenize("qu'un", 'fr', include_punctuation=True),
["qu'", "un"])
eq_(tokenize("langues d'oïl", 'fr'),
['langues', "d", 'oïl'])
eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
['langues', "d'", 'oïl'])
eq_(tokenize("l'heure", 'fr'),
['l', 'heure'])
eq_(tokenize("l'heure", 'fr', include_punctuation=True),
["l'", 'heure'])
eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
["l'", 'hôpital'])
eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
eq_(tokenize("This isn't French", 'en'),
['this', "isn't", 'french'])
assert tokenize("qu'un", 'fr') == ['qu', 'un']
assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
assert tokenize("l'heure", 'fr') == ['l', 'heure']
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
def test_catastrophes():
# More apostrophes, but this time they're in Catalan, and there's other
# mid-word punctuation going on too.
eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
['m', 'acabo', 'd', 'instal·lar'])
eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
["m'", 'acabo', "d'", 'instal·lar', '.'])
assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
assert (
tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
["m'", 'acabo', "d'", 'instal·lar', '.']
)
def test_alternate_codes():
# Try over-long language codes for French and Catalan
eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
['m', 'acabo', 'd', 'instal·lar'])
assert tokenize("qu'un", 'fra') == ['qu', 'un']
assert tokenize("qu'un", 'fre') == ['qu', 'un']
assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']

tests/test_general.py Normal file
View File

@ -0,0 +1,223 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
import pytest
def test_freq_examples():
# Stopwords are most common in the correct language
assert word_frequency('the', 'en') > word_frequency('de', 'en')
assert word_frequency('de', 'es') > word_frequency('the', 'es')
# We get word frequencies from the 'large' list when available
assert word_frequency('infrequency', 'en') > 0.
def test_languages():
# Make sure we get all the languages when looking for the default
# 'best' wordlist
avail = available_languages()
assert len(avail) >= 34
# 'small' covers the same languages, but with some different lists
avail_small = available_languages('small')
assert len(avail_small) == len(avail)
assert avail_small != avail
# 'combined' is the same as 'small'
avail_old_name = available_languages('combined')
assert avail_old_name == avail_small
# 'large' covers fewer languages
avail_large = available_languages('large')
assert len(avail_large) >= 14
assert len(avail) > len(avail_large)
# Look up the digit '2' in the main word list for each language
for lang in avail:
assert word_frequency('2', lang) > 0
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert word_frequency('2', new_lang_code) > 0
def test_minimums():
assert word_frequency('esquivalience', 'en') == 0
assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
assert word_frequency('the', 'en', minimum=1) == 1
def test_most_common_words():
# If something causes the most common words in well-supported languages to
# change, we should know.
def get_most_common(lang):
"""
Return the single most common word in the language.
"""
return top_n_list(lang, 1)[0]
assert get_most_common('ar') == 'في'
assert get_most_common('cs') == 'a'
assert get_most_common('de') == 'die'
assert get_most_common('en') == 'the'
assert get_most_common('es') == 'de'
assert get_most_common('fr') == 'de'
assert get_most_common('it') == 'di'
assert get_most_common('ja') == 'の'
assert get_most_common('nl') == 'de'
assert get_most_common('pl') == 'w'
assert get_most_common('pt') == 'de'
assert get_most_common('ru') == 'в'
assert get_most_common('tr') == 'bir'
assert get_most_common('zh') == '的'
def test_language_matching():
freq = word_frequency('谢谢', 'zh')
assert word_frequency('谢谢', 'zh-TW') == freq
assert word_frequency('谢谢', 'zh-CN') == freq
assert word_frequency('谢谢', 'zh-Hant') == freq
assert word_frequency('谢谢', 'zh-Hans') == freq
assert word_frequency('谢谢', 'yue-HK') == freq
assert word_frequency('谢谢', 'cmn') == freq
def test_cB_conversion():
assert cB_to_freq(0) == 1.
assert cB_to_freq(-100) == pytest.approx(0.1)
assert cB_to_freq(-600) == pytest.approx(1e-6)
def test_failed_cB_conversion():
with pytest.raises(ValueError):
cB_to_freq(1)
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
assert (
tokenize("I don't split at apostrophes, you see.", 'en')
== ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
)
assert (
tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
== ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
)
# Certain punctuation does not inherently split a word.
assert (
tokenize("Anything is possible at zombo.com", 'en')
== ['anything', 'is', 'possible', 'at', 'zombo.com']
)
# Splits occur after symbols, and at splitting punctuation such as hyphens.
assert tokenize('😂test', 'en') == ['😂', 'test']
assert tokenize("flip-flop", 'en') == ['flip', 'flop']
assert (
tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
== ['this', 'text', 'has', '...', 'punctuation', ':)']
)
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date
assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
assert (
tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
== ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀']
)
# Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0)
assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
def test_casefolding():
assert tokenize('WEISS', 'de') == ['weiss']
assert tokenize('weiß', 'de') == ['weiss']
assert tokenize('İstanbul', 'tr') == ['istanbul']
assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
def test_number_smashing():
assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
assert (
lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
== ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
)
assert lossy_tokenize('1', 'en') == ['1']
assert lossy_tokenize('3.14', 'en') == ['0.00']
assert lossy_tokenize('24601', 'en') == ['00000']
assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
assert ff > 0
phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
assert 1.0 / ff == pytest.approx(phrase_freq)
def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't
# use it
assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
def test_not_enough_ascii():
with pytest.raises(ValueError):
random_ascii_words(lang='zh', bits_per_word=16)
def test_arabic():
# Remove tatweels
assert tokenize('متــــــــعب', 'ar') == ['متعب']
# Remove combining marks
assert tokenize('حَرَكَات', 'ar') == ['حركات']
# An Arabic ligature that is affected by NFKC normalization
assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
def test_ideographic_fallback():
# Try tokenizing Chinese text as English -- it should remain stuck together.
#
# More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py.
assert tokenize('中国文字', 'en') == ['中国文字']
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
# Test Khmer, a script similar to Thai
assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
# Test Hindi -- tokens split where there are spaces, and not where there aren't
assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
# Remove vowel points in Hebrew
assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
# Deal with commas, cedillas, and I's in Turkish
assert tokenize('kișinin', 'tr') == ['kişinin']
assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
# Deal with cedillas that should be commas-below in Romanian
assert tokenize('acelaşi', 'ro') == ['același']
assert tokenize('ACELAŞI', 'ro') == ['același']

View File

@ -1,10 +1,9 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, simple_tokenize, word_frequency
import pytest
def test_tokens():
eq_(tokenize('おはようございます', 'ja'),
['おはよう', 'ござい', 'ます'])
assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
def test_simple_tokenize():
@ -19,31 +18,29 @@ def test_simple_tokenize():
# We used to try to infer word boundaries between hiragana and katakana,
# but this leads to edge cases that are unsolvable without a dictionary.
ja_text = 'ひらがなカタカナromaji'
eq_(
simple_tokenize(ja_text),
['ひらがなカタカナ', 'romaji']
)
assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
# but sticks together in simple_tokenize
eq_(simple_tokenize('おはようございます'), ['おはようございます'])
assert simple_tokenize('おはようございます') == ['おはようございます']
# Names that use the weird possessive marker ヶ, which is technically a
# katakana even though it's being used like a kanji, stay together as one
# token
eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
# The word in ConceptNet that made me notice that simple_tokenize used
# to have a problem with the character 々
eq_(simple_tokenize("晴々しい"), ["晴々しい"])
assert simple_tokenize("晴々しい") == ["晴々しい"]
# Explicit word separators are still token boundaries, such as the dot
# between "toner" and "cartridge" in "toner cartridge"
eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
# This word has multiple weird characters that aren't quite kanji in it,
# and is in the dictionary
eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
@ -52,12 +49,11 @@ def test_combination():
gozai_freq = word_frequency('ござい', 'ja')
masu_freq = word_frequency('ます', 'ja')
assert_almost_equal(
word_frequency('おはようおはよう', 'ja'),
ohayou_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('おはようございます', 'ja'),
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
assert (
1.0 / word_frequency('おはようございます', 'ja') ==
pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
)

View File

@ -1,22 +1,18 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
import pytest
def test_tokens():
eq_(tokenize('감사합니다', 'ko'),
['감사', '합니다'])
assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
def test_combination():
gamsa_freq = word_frequency('감사', 'ko')
habnida_freq = word_frequency('합니다', 'ko')
assert_almost_equal(
word_frequency('감사감사', 'ko'),
gamsa_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('감사합니다', 'ko'),
1.0 / gamsa_freq + 1.0 / habnida_freq
assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
assert (
1.0 / word_frequency('감사합니다', 'ko') ==
pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
)

View File

@ -1,15 +1,18 @@
from nose.tools import eq_
from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text
def test_transliteration():
# "Well, there's a lot of things you do not understand."
# (from somewhere in OpenSubtitles)
eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
# (from somewhere in OpenSubtitles)
assert (
tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
assert (
tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
# I don't have examples of complete sentences in Azerbaijani that are
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@ -17,14 +20,14 @@ def test_transliteration():
# So here are some individual words.
# 'library' in Azerbaijani Cyrillic
eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
assert preprocess_text('китабхана', 'az') == 'kitabxana'
assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
# 'scream' in Azerbaijani Cyrillic
eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
assert preprocess_text('бағырты', 'az') == 'bağırtı'
assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
def test_actually_russian():
@ -35,15 +38,13 @@ def test_actually_russian():
# We make sure to handle this case so we don't end up with a mixed-script
# word like "pacanы".
eq_(tokenize("сто из ста, пацаны!", 'sr'),
['sto', 'iz', 'sta', 'pacany'])
eq_(tokenize("культуры", 'sr'), ["kul'tury"])
assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
assert tokenize("культуры", 'sr') == ["kul'tury"]
def test_alternate_codes():
# Try language codes for Serbo-Croatian that have been split, and now
# are canonically mapped to Serbian
eq_(tokenize("культуры", 'sh'), ["kul'tury"])
eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
assert tokenize("культуры", 'sh') == ["kul'tury"]
assert tokenize("культуры", 'hbs') == ["kul'tury"]

File diff suppressed because it is too large.

48 binary files changed; contents not shown.