diff --git a/setup.py b/setup.py index 014624f..dabe7ce 100755 --- a/setup.py +++ b/setup.py @@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'), encoding='utf-8').read() doclines = README_contents.split("\n") dependencies = [ - 'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21' + 'msgpack', 'langcodes >= 2', 'regex' ] if sys.version_info < (3, 4): dependencies.append('pathlib') setup( name="wordfreq", - version='2.2.2', + version='2.3', maintainer='Robyn Speer', maintainer_email='rspeer@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', @@ -46,7 +46,7 @@ setup( long_description=README_contents, long_description_content_type='text/markdown', packages=['wordfreq'], - python_requires='>=3.3', + python_requires='>=3.5', include_package_data=True, install_requires=dependencies, diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index a72770f..2f73cd1 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -8,6 +8,7 @@ import pathlib import random import logging import math +import warnings from .tokens import tokenize, simple_tokenize, lossy_tokenize from .language_info import get_language_info @@ -110,7 +111,7 @@ def available_languages(wordlist='best'): @lru_cache(maxsize=None) -def get_frequency_list(lang, wordlist='best', match_cutoff=30): +def get_frequency_list(lang, wordlist='best', match_cutoff=None): """ Read the raw data from a wordlist file, returning it as a list of lists. (See `read_cBpack` for what this represents.) @@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30): 'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list. Looking up the alternate code 'por' will also get the same list. 
""" + if match_cutoff is not None: + warnings.warn( + "The `match_cutoff` parameter is deprecated", + DeprecationWarning, stacklevel=2 + ) available = available_languages(wordlist) - best, score = langcodes.best_match(lang, list(available), - min_score=match_cutoff) - if score == 0: + + # TODO: decrease the maximum distance. This distance is so high just + # because it allows a test where 'yue' matches 'zh', and maybe the + # distance between those is high because they shouldn't match. + best, _distance = langcodes.closest_match( + lang, list(available), max_distance=70 + ) + if best == 'und': raise LookupError("No wordlist %r available for language %r" % (wordlist, lang)) @@ -192,13 +203,18 @@ def freq_to_zipf(freq): @lru_cache(maxsize=None) -def get_frequency_dict(lang, wordlist='best', match_cutoff=30): +def get_frequency_dict(lang, wordlist='best', match_cutoff=None): """ Get a word frequency list as a dictionary, mapping tokens to frequencies as floating-point probabilities. """ + if match_cutoff is not None: + warnings.warn( + "The `match_cutoff` parameter is deprecated", + DeprecationWarning, stacklevel=2 + ) freqs = {} - pack = get_frequency_list(lang, wordlist, match_cutoff) + pack = get_frequency_list(lang, wordlist) for index, bucket in enumerate(pack): freq = cB_to_freq(-index) for word in bucket: