diff --git a/CHANGELOG.md b/CHANGELOG.md index e26aab4..3bb5c32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +## Version 2.2 + ## Version 2.1 (2018-06-18) Data changes: diff --git a/README.md b/README.md index 41c40d5..0cdabd7 100644 --- a/README.md +++ b/README.md @@ -276,7 +276,8 @@ produces tokens that follow the recommendations in [Unicode Annex #29, Text Segmentation][uax29], including the optional rule that splits words between apostrophes and vowels. -There are language-specific exceptions: +There are exceptions where we change the tokenization to work better +with certain languages: - In Arabic and Hebrew, it additionally normalizes ligatures and removes combining marks. @@ -288,11 +289,21 @@ There are language-specific exceptions: - In Chinese, it uses the external Python library `jieba`, another optional dependency. +- While the @ sign is usually considered a symbol and not part of a word, + wordfreq will allow a word to end with "@" or "@s". This is one way of + writing gender-neutral words in Spanish and Portuguese. + [uax29]: http://unicode.org/reports/tr29/ When wordfreq's frequency lists are built in the first place, the words are tokenized according to this function. + >>> from wordfreq import tokenize + >>> tokenize('l@s niñ@s', 'es') + ['l@s', 'niñ@s'] + >>> zipf_frequency('l@s', 'es') + 2.8 + Because tokenization in the real world is far from consistent, wordfreq will also try to deal gracefully when you query it with texts that actually break into multiple tokens: diff --git a/setup.py b/setup.py index 4178cac..bb2550d 100755 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ if sys.version_info < (3, 4): setup( name="wordfreq", - version='2.1.0', + version='2.2.0', maintainer='Luminoso Technologies, Inc.', maintainer_email='info@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/',