From 71a0ad6abb0670da8927370b13fbb15ced516cf8 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Thu, 27 Apr 2017 15:09:59 -0400
Subject: [PATCH 1/2] Use langcodes when tokenizing again (it no longer connects to a DB)

---
 tests/test_chinese.py            | 19 +++++++++++++++++++
 tests/test_french_and_related.py |  9 +++++++++
 tests/test_serbian.py            |  8 ++++++++
 wordfreq/tokens.py               | 13 ++++++-------
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index db5cabc..d26c690 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -66,3 +66,22 @@ def test_combination():
         word_frequency('谢谢谢谢', 'zh'),
         xiexie_freq / 20
     )
+
+
+def test_alternate_codes():
+    # Tokenization of Chinese works when you use other language codes
+    # that are not equal to 'zh'.
+    tokens = ['谢谢', '谢谢']
+
+    # Code with a region attached
+    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+
+    # Over-long codes for Chinese
+    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
+    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+
+    # Separate codes for Mandarin and Cantonese
+    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
+    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+
+
diff --git a/tests/test_french_and_related.py b/tests/test_french_and_related.py
index 17f59c3..c347213 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_french_and_related.py
@@ -29,3 +29,12 @@ def test_catastrophes():
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
         ["m'", 'acabo', "d'", 'instal·lar', '.'])
+
+
+def test_alternate_codes():
+    # Try over-long language codes for French and Catalan
+    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
+    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
+        ['m', 'acabo', 'd', 'instal·lar'])
+
diff --git a/tests/test_serbian.py b/tests/test_serbian.py
index 7d33367..3f8c93b 100644
--- a/tests/test_serbian.py
+++ b/tests/test_serbian.py
@@ -23,3 +23,11 @@ def test_actually_russian():
         ['sto', 'iz', 'sta', 'pacany'])
     eq_(tokenize("культуры", 'sr'),
         ["kul'tury"])
+
+
+def test_alternate_codes():
+    # Try language codes for Serbo-Croatian that have been split, and now
+    # are canonically mapped to Serbian
+    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
+    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index de4b566..2f08de6 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+import langcodes
 from .transliterate import serbian_cyrillic_to_latin
 
 mecab_tokenize = None
@@ -361,20 +362,18 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
-    # A really simple way to handle language codes with more than just the
-    # language
-    lang = lang.split('-')[0]
+    # Reduce whatever language code was passed in to a normal form,
+    # containing just the language subtag.
+    lang = langcodes.get(lang).prefer_macrolanguage().language
     if lang == 'ja' or lang == 'ko':
         result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh':
+    elif lang == 'zh' or lang == 'yue':
         result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
         result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
         result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
-        # These are the three language codes that could include Serbian text,
-        # which could be in Cyrillic.
+    elif lang == 'sr':
         result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))

From aa3ed232826101236c120b1241e53fd5740d36d2 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Wed, 10 May 2017 13:26:23 -0400
Subject: [PATCH 2/2] v1.6.1: depend on langcodes 1.4

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 7f7b124..f3af099 100755
--- a/setup.py
+++ b/setup.py
@@ -27,14 +27,14 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
 
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
 setup(
     name="wordfreq",
-    version='1.6',
+    version='1.6.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
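
Editor's note (not part of either patch): the one-line change in
wordfreq/tokens.py delegates all language-code normalization to langcodes,
which is why the hard-coded checks for 'sh' and 'hbs' could be deleted. As
a rough sketch of the behavior this series relies on, assuming
langcodes >= 1.4 resolves codes the way the tests above expect (the
expected values below are inferred from those tests, not quoted from
langcodes documentation):

    import langcodes

    cases = [
        ('zh-CN', 'zh'),   # region subtag is dropped
        ('chi', 'zh'),     # over-long ISO 639 codes collapse to 'zh'
        ('zho', 'zh'),
        ('cmn', 'zh'),     # Mandarin folds into the 'zh' macrolanguage
        ('yue', 'yue'),    # Cantonese stays distinct, which is why
                           # tokenize() now checks for 'yue' explicitly
        ('fra', 'fr'),
        ('fre', 'fr'),
        ('cat', 'ca'),
        ('sh', 'sr'),      # split Serbo-Croatian codes map to Serbian
        ('hbs', 'sr'),
    ]
    for code, expected in cases:
        # The same expression the patch adds to tokenize()
        assert langcodes.get(code).prefer_macrolanguage().language == expected

Because this lookup is pure data-table work in langcodes 1.4, it no longer
connects to a database, which is what the subject of PATCH 1/2 refers to.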