use langcodes 2.0 and deprecate 'match_cutoff'

This commit is contained in:
Robyn Speer 2020-04-16 14:08:43 -04:00
parent 40443c9a3b
commit bf795e6d6c
2 changed files with 25 additions and 9 deletions

View File

@@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
encoding='utf-8').read() encoding='utf-8').read()
doclines = README_contents.split("\n") doclines = README_contents.split("\n")
dependencies = [ dependencies = [
'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21' 'msgpack', 'langcodes >= 2', 'regex'
] ]
if sys.version_info < (3, 4): if sys.version_info < (3, 4):
dependencies.append('pathlib') dependencies.append('pathlib')
setup( setup(
name="wordfreq", name="wordfreq",
version='2.2.2', version='2.3',
maintainer='Robyn Speer', maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com', maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',
@@ -46,7 +46,7 @@ setup(
long_description=README_contents, long_description=README_contents,
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
packages=['wordfreq'], packages=['wordfreq'],
python_requires='>=3.3', python_requires='>=3.5',
include_package_data=True, include_package_data=True,
install_requires=dependencies, install_requires=dependencies,

View File

@@ -8,6 +8,7 @@ import pathlib
import random import random
import logging import logging
import math import math
import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info from .language_info import get_language_info
@@ -110,7 +111,7 @@ def available_languages(wordlist='best'):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='best', match_cutoff=30): def get_frequency_list(lang, wordlist='best', match_cutoff=None):
""" """
Read the raw data from a wordlist file, returning it as a list of Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_cBpack` for what this represents.) lists. (See `read_cBpack` for what this represents.)
@@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30):
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list. 'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
Looking up the alternate code 'por' will also get the same list. Looking up the alternate code 'por' will also get the same list.
""" """
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
available = available_languages(wordlist) available = available_languages(wordlist)
best, score = langcodes.best_match(lang, list(available),
min_score=match_cutoff) # TODO: decrease the maximum distance. This distance is so high just
if score == 0: # because it allows a test where 'yue' matches 'zh', and maybe the
# distance between those is high because they shouldn't match.
best, _distance = langcodes.closest_match(
lang, list(available), max_distance=70
)
if best == 'und':
raise LookupError("No wordlist %r available for language %r" raise LookupError("No wordlist %r available for language %r"
% (wordlist, lang)) % (wordlist, lang))
@@ -192,13 +203,18 @@ def freq_to_zipf(freq):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='best', match_cutoff=30): def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
""" """
Get a word frequency list as a dictionary, mapping tokens to Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities. frequencies as floating-point probabilities.
""" """
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
freqs = {} freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff) pack = get_frequency_list(lang, wordlist)
for index, bucket in enumerate(pack): for index, bucket in enumerate(pack):
freq = cB_to_freq(-index) freq = cB_to_freq(-index)
for word in bucket: for word in bucket: