Merge pull request #75 from LuminosoInsight/language-match-update

use langcodes 2.0 and deprecate 'match_cutoff'
2024-12-23 17:31:41 +00:00 · 2020-04-20 14:48:58 -04:00 · 2020-04-20 14:48:58 -04:00 · af22c03609
commit af22c03609
parent 33bfb1409d 258670b823
3 changed files with 36 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## Version 2.3 (2020-04-16)
+
+- Python 3.5 is the oldest maintained version of Python, and we have stopped
+  claiming support for earlier versions.
+
+- Updated to langcodes 2.0.
+
+- Deprecated the `match_cutoff` parameter, which was intended for situations
+  where we need to approximately match a language code, but was not usefully
+  configurable in those situations.
+
 ## Version 2.2.2 (2020-02-28)

 Library change:
--- a/setup.py
+++ b/setup.py
@@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21'
+    'msgpack', 'langcodes >= 2', 'regex'
 ]
 if sys.version_info < (3, 4):
    dependencies.append('pathlib')

 setup(
    name="wordfreq",
-    version='2.2.2',
+    version='2.3',
    maintainer='Robyn Speer',
    maintainer_email='rspeer@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
@@ -46,7 +46,7 @@ setup(
    long_description=README_contents,
    long_description_content_type='text/markdown',
    packages=['wordfreq'],
-    python_requires='>=3.3',
+    python_requires='>=3.5',
    include_package_data=True,
    install_requires=dependencies,

--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -8,6 +8,7 @@ import pathlib
 import random
 import logging
 import math
+import warnings

 from .tokens import tokenize, simple_tokenize, lossy_tokenize
 from .language_info import get_language_info
@@ -110,7 +111,7 @@ def available_languages(wordlist='best'):


@lru_cache(maxsize=None)
-def get_frequency_list(lang, wordlist='best', match_cutoff=30):
+def get_frequency_list(lang, wordlist='best', match_cutoff=None):
    """
    Read the raw data from a wordlist file, returning it as a list of
    lists. (See `read_cBpack` for what this represents.)
@@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
    Looking up the alternate code 'por' will also get the same list.
    """
+    if match_cutoff is not None:
+        warnings.warn(
+            "The `match_cutoff` parameter is deprecated",
+            DeprecationWarning
+        )
    available = available_languages(wordlist)
-    best, score = langcodes.best_match(lang, list(available),
-                                       min_score=match_cutoff)
-    if score == 0:
+
+    # TODO: decrease the maximum distance. This distance is so high just
+    # because it allows a test where 'yue' matches 'zh', and maybe the
+    # distance between those is high because they shouldn't match.
+    best, _distance = langcodes.closest_match(
+        lang, list(available), max_distance=70
+    )
+    if best == 'und':
        raise LookupError("No wordlist %r available for language %r"
                          % (wordlist, lang))

@@ -192,13 +203,18 @@ def freq_to_zipf(freq):


@lru_cache(maxsize=None)
-def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
+def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
    """
    Get a word frequency list as a dictionary, mapping tokens to
    frequencies as floating-point probabilities.
    """
+    if match_cutoff is not None:
+        warnings.warn(
+            "The `match_cutoff` parameter is deprecated",
+            DeprecationWarning
+        )
    freqs = {}
-    pack = get_frequency_list(lang, wordlist, match_cutoff)
+    pack = get_frequency_list(lang, wordlist)
    for index, bucket in enumerate(pack):
        freq = cB_to_freq(-index)
        for word in bucket: