From ad02d96f1b9d57cd13f169edc8246598169e6874 Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Tue, 8 Sep 2020 16:03:33 -0400
Subject: [PATCH] update dependencies and test for consistent results

---
 CHANGELOG.md                                        |  5 +++++
 setup.py                                            |  6 +++---
 ...st_french_and_related.py => test_apostrophes.py} | 13 +++++++++++++
 tests/test_chinese.py                               | 12 ++++++++++++
 wordfreq/language_info.py                           |  8 ++++----
 5 files changed, 37 insertions(+), 7 deletions(-)
 rename tests/{test_french_and_related.py => test_apostrophes.py} (70%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0099d45..5af25fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## Version 2.3.3 (2020-09-08)
+
+- Set minimum version requirements on `regex`, `jieba`, and `langcodes`
+  so that tokenization will give consistent results.
+
 ## Version 2.3.2 (2020-04-28)
 
 - Relaxing the dependency on regex had an unintended consequence in 2.3.1:
diff --git a/setup.py b/setup.py
index 4b5927b..61e0b25 100755
--- a/setup.py
+++ b/setup.py
@@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2', 'regex'
+    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
 ]
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
 setup(
     name="wordfreq",
-    version='2.3.2',
+    version='2.3.3',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -58,7 +58,7 @@ setup(
     # Similarly, jieba is required for Chinese word frequencies.
     extras_require={
         'mecab': 'mecab-python3',
-        'jieba': 'jieba'
+        'jieba': 'jieba >= 0.42'
     },
     tests_require=['pytest', 'mecab-python3', 'jieba'],
 )
diff --git a/tests/test_french_and_related.py b/tests/test_apostrophes.py
similarity index 70%
rename from tests/test_french_and_related.py
rename to tests/test_apostrophes.py
index c27ecae..0c6b9b7 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_apostrophes.py
@@ -9,11 +9,24 @@ def test_apostrophes():
     assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
     assert tokenize("l'heure", 'fr') == ['l', 'heure']
     assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
+    assert tokenize("l'anima", 'it') == ['l', 'anima']
     assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
     assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
     assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
     assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
 
+    # This next behavior is not ideal -- we would prefer "dell'" to be handled
+    # the same as "l'" -- but this is the most consistent result we can get without
+    # Italian-specific rules.
+    #
+    # Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
+    # is better but inconsistent.
+    assert tokenize("dell'anima", 'it') == ["dell'anima"]
+
+    # Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
+    # an example of why we don't want the apostrophe-vowel fix to apply everywhere.
+    assert tokenize("hawai'i", 'en') == ["hawai'i"]
+
 
 def test_catastrophes():
     # More apostrophes, but this time they're in Catalan, and there's other
diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index ce157db..c841335 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -77,3 +77,15 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     assert tokenize('谢谢谢谢', 'cmn') == tokens
     assert tokenize('谢谢谢谢', 'yue') == tokens
+
+
+def test_hyphens():
+    # An edge case of Chinese tokenization that changed sometime around
+    # jieba 0.42.
+
+    tok = tokenize('--------', 'zh', include_punctuation=True)
+    assert tok == ['-'] * 8
+
+    tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
+    assert tok == ['--------']
+
diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py
index 3b736be..73a7b69 100644
--- a/wordfreq/language_info.py
+++ b/wordfreq/language_info.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from langcodes import Language, best_match
+from langcodes import Language, closest_match
 
 
 # Text in scripts written without spaces has to be handled specially in our
@@ -45,7 +45,7 @@ EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
 # happens in ConceptNet.
 
 
-def _language_in_list(language, targets, min_score=80):
+def _language_in_list(language, targets, max_distance=10):
     """
     A helper function to determine whether this language matches one of the
     target languages, with a match score above a certain threshold.
@@ -53,8 +53,8 @@ def _language_in_list(language, targets, min_score=80):
     The languages can be given as strings (language tags) or as Language
     objects. `targets` can be any iterable of such languages.
     """
-    matched = best_match(language, targets, min_score=min_score)
-    return matched[1] > 0
+    matched = closest_match(language, targets, max_distance=max_distance)
+    return matched[0] != 'und'
 
 
 @lru_cache(maxsize=None)