From 71a0ad6abb0670da8927370b13fbb15ced516cf8 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 27 Apr 2017 15:09:59 -0400
Subject: [PATCH 1/2] Use langcodes when tokenizing again (it no longer connects to a DB)

---
 tests/test_chinese.py            | 19 +++++++++++++++++++
 tests/test_french_and_related.py |  9 +++++++++
 tests/test_serbian.py            |  8 ++++++++
 wordfreq/tokens.py               | 13 ++++++-------
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index db5cabc..d26c690 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -66,3 +66,22 @@ def test_combination():
         word_frequency('谢谢谢谢', 'zh'),
         xiexie_freq / 20
     )
+
+
+def test_alternate_codes():
+    # Tokenization of Chinese works when you use other language codes
+    # that are not equal to 'zh'.
+    tokens = ['谢谢', '谢谢']
+
+    # Code with a region attached
+    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+
+    # Over-long codes for Chinese
+    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
+    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+
+    # Separate codes for Mandarin and Cantonese
+    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
+    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+
+
diff --git a/tests/test_french_and_related.py b/tests/test_french_and_related.py
index 17f59c3..c347213 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_french_and_related.py
@@ -29,3 +29,12 @@ def test_catastrophes():
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
         ["m'", 'acabo', "d'", 'instal·lar', '.'])
+
+
+def test_alternate_codes():
+    # Try over-long language codes for French and Catalan
+    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
+    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
+        ['m', 'acabo', 'd', 'instal·lar'])
+
diff --git a/tests/test_serbian.py b/tests/test_serbian.py
index 7d33367..3f8c93b 100644
--- a/tests/test_serbian.py
+++ b/tests/test_serbian.py
@@ -23,3 +23,11 @@ def test_actually_russian():
         ['sto', 'iz', 'sta', 'pacany'])
     eq_(tokenize("культуры", 'sr'),
         ["kul'tury"])
+
+
+def test_alternate_codes():
+    # Try language codes for Serbo-Croatian that have been split, and now
+    # are canonically mapped to Serbian
+    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
+    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index de4b566..2f08de6 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+import langcodes
 from .transliterate import serbian_cyrillic_to_latin
 
 mecab_tokenize = None
@@ -361,20 +362,18 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
-    # A really simple way to handle language codes with more than just the
-    # language
-    lang = lang.split('-')[0]
+    # Reduce whatever language code was passed in to a normal form,
+    # containing just the language subtag.
+    lang = langcodes.get(lang).prefer_macrolanguage().language
     if lang == 'ja' or lang == 'ko':
         result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh':
+    elif lang == 'zh' or lang == 'yue':
         result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
         result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
         result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
-        # These are the three language codes that could include Serbian text,
-        # which could be in Cyrillic.
+    elif lang == 'sr':
         result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))

From aa3ed232826101236c120b1241e53fd5740d36d2 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Wed, 10 May 2017 13:26:23 -0400
Subject: [PATCH 2/2] v1.6.1: depend on langcodes 1.4

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 7f7b124..f3af099 100755
--- a/setup.py
+++ b/setup.py
@@ -27,14 +27,14 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
 
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
 setup(
     name="wordfreq",
-    version='1.6',
+    version='1.6.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
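
Editor's note (not part of either patch): the one-line change in
wordfreq/tokens.py delegates all language-code normalization to langcodes,
which is why the hard-coded checks for 'sh' and 'hbs' could be deleted. As
a rough sketch of the behavior this series relies on, assuming
langcodes >= 1.4 resolves codes the way the tests above expect (the
expected values below are inferred from those tests, not quoted from
langcodes documentation):

    import langcodes

    cases = [
        ('zh-CN', 'zh'),   # region subtag is dropped
        ('chi', 'zh'),     # over-long ISO 639 codes collapse to 'zh'
        ('zho', 'zh'),
        ('cmn', 'zh'),     # Mandarin folds into the 'zh' macrolanguage
        ('yue', 'yue'),    # Cantonese stays distinct, which is why
                           # tokenize() now checks for 'yue' explicitly
        ('fra', 'fr'),
        ('fre', 'fr'),
        ('cat', 'ca'),
        ('sh', 'sr'),      # split Serbo-Croatian codes map to Serbian
        ('hbs', 'sr'),
    ]
    for code, expected in cases:
        # The same expression the patch adds to tokenize()
        assert langcodes.get(code).prefer_macrolanguage().language == expected

Because this lookup is pure data-table work in langcodes 1.4, it no longer
connects to a database, which is what the subject of PATCH 1/2 refers to.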