use langcodes 2.0 and deprecate 'match_cutoff'

This commit is contained in:
Robyn Speer 2020-04-16 14:08:43 -04:00
parent 40443c9a3b
commit bf795e6d6c
2 changed files with 25 additions and 9 deletions

View File

@@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
encoding='utf-8').read()
doclines = README_contents.split("\n")
dependencies = [
'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21'
'msgpack', 'langcodes >= 2', 'regex'
]
if sys.version_info < (3, 4):
dependencies.append('pathlib')
setup(
name="wordfreq",
version='2.2.2',
version='2.3',
maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@@ -46,7 +46,7 @@ setup(
long_description=README_contents,
long_description_content_type='text/markdown',
packages=['wordfreq'],
python_requires='>=3.3',
python_requires='>=3.5',
include_package_data=True,
install_requires=dependencies,

View File

@@ -8,6 +8,7 @@ import pathlib
import random
import logging
import math
import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
@@ -110,7 +111,7 @@ def available_languages(wordlist='best'):
@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
def get_frequency_list(lang, wordlist='best', match_cutoff=None):
"""
Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_cBpack` for what this represents.)
@@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30):
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
Looking up the alternate code 'por' will also get the same list.
"""
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
available = available_languages(wordlist)
best, score = langcodes.best_match(lang, list(available),
min_score=match_cutoff)
if score == 0:
# TODO: decrease the maximum distance. This distance is so high just
# because it allows a test where 'yue' matches 'zh', and maybe the
# distance between those is high because they shouldn't match.
best, _distance = langcodes.closest_match(
lang, list(available), max_distance=70
)
if best == 'und':
raise LookupError("No wordlist %r available for language %r"
% (wordlist, lang))
@@ -192,13 +203,18 @@ def freq_to_zipf(freq):
@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
"""
Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities.
"""
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff)
pack = get_frequency_list(lang, wordlist)
for index, bucket in enumerate(pack):
freq = cB_to_freq(-index)
for word in bucket: