Merge pull request #75 from LuminosoInsight/language-match-update

use langcodes 2.0 and deprecate 'match_cutoff'
This commit is contained in:
Lance Nathan 2020-04-20 14:48:58 -04:00 committed by GitHub
commit af22c03609
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 9 deletions

View File

@ -1,3 +1,14 @@
## Version 2.3 (2020-04-16)
- Python 3.5 is the oldest maintained version of Python, and we have stopped
claiming support for earlier versions.
- Updated to langcodes 2.0.
- Deprecated the `match_cutoff` parameter, which was intended for situations
where we need to approximately match a language code, but was not usefully
configurable in those situations.
## Version 2.2.2 (2020-02-28)
Library change:

View File

@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
encoding='utf-8').read() encoding='utf-8').read()
doclines = README_contents.split("\n") doclines = README_contents.split("\n")
dependencies = [ dependencies = [
'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21' 'msgpack', 'langcodes >= 2', 'regex'
] ]
if sys.version_info < (3, 4): if sys.version_info < (3, 4):
dependencies.append('pathlib') dependencies.append('pathlib')
setup( setup(
name="wordfreq", name="wordfreq",
version='2.2.2', version='2.3',
maintainer='Robyn Speer', maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com', maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',
@ -46,7 +46,7 @@ setup(
long_description=README_contents, long_description=README_contents,
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
packages=['wordfreq'], packages=['wordfreq'],
python_requires='>=3.3', python_requires='>=3.5',
include_package_data=True, include_package_data=True,
install_requires=dependencies, install_requires=dependencies,

View File

@ -8,6 +8,7 @@ import pathlib
import random import random
import logging import logging
import math import math
import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info from .language_info import get_language_info
@ -110,7 +111,7 @@ def available_languages(wordlist='best'):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='best', match_cutoff=30): def get_frequency_list(lang, wordlist='best', match_cutoff=None):
""" """
Read the raw data from a wordlist file, returning it as a list of Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_cBpack` for what this represents.) lists. (See `read_cBpack` for what this represents.)
@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30):
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list. 'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
Looking up the alternate code 'por' will also get the same list. Looking up the alternate code 'por' will also get the same list.
""" """
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
available = available_languages(wordlist) available = available_languages(wordlist)
best, score = langcodes.best_match(lang, list(available),
min_score=match_cutoff) # TODO: decrease the maximum distance. This distance is so high just
if score == 0: # because it allows a test where 'yue' matches 'zh', and maybe the
# distance between those is high because they shouldn't match.
best, _distance = langcodes.closest_match(
lang, list(available), max_distance=70
)
if best == 'und':
raise LookupError("No wordlist %r available for language %r" raise LookupError("No wordlist %r available for language %r"
% (wordlist, lang)) % (wordlist, lang))
@ -192,13 +203,18 @@ def freq_to_zipf(freq):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='best', match_cutoff=30): def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
""" """
Get a word frequency list as a dictionary, mapping tokens to Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities. frequencies as floating-point probabilities.
""" """
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
freqs = {} freqs = {}
pack = get_frequency_list(lang, wordlist, match_cutoff) pack = get_frequency_list(lang, wordlist)
for index, bucket in enumerate(pack): for index, bucket in enumerate(pack):
freq = cB_to_freq(-index) freq = cB_to_freq(-index)
for word in bucket: for word in bucket: