mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
Merge pull request #75 from LuminosoInsight/language-match-update
use langcodes 2.0 and deprecate 'match_cutoff'
This commit is contained in:
commit
af22c03609
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
|||||||
|
## Version 2.3 (2020-04-16)
|
||||||
|
|
||||||
|
- Python 3.5 is the oldest maintained version of Python, and we have stopped
|
||||||
|
claiming support for earlier versions.
|
||||||
|
|
||||||
|
- Updated to langcodes 2.0.
|
||||||
|
|
||||||
|
- Deprecated the `match_cutoff` parameter, which was intended for situations
|
||||||
|
where we need to approximately match a language code, but was not usefully
|
||||||
|
configurable in those situations.
|
||||||
|
|
||||||
## Version 2.2.2 (2020-02-28)
|
## Version 2.2.2 (2020-02-28)
|
||||||
|
|
||||||
Library change:
|
Library change:
|
||||||
|
6
setup.py
6
setup.py
@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
|
|||||||
encoding='utf-8').read()
|
encoding='utf-8').read()
|
||||||
doclines = README_contents.split("\n")
|
doclines = README_contents.split("\n")
|
||||||
dependencies = [
|
dependencies = [
|
||||||
'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21'
|
'msgpack', 'langcodes >= 2', 'regex'
|
||||||
]
|
]
|
||||||
if sys.version_info < (3, 4):
|
if sys.version_info < (3, 4):
|
||||||
dependencies.append('pathlib')
|
dependencies.append('pathlib')
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='2.2.2',
|
version='2.3',
|
||||||
maintainer='Robyn Speer',
|
maintainer='Robyn Speer',
|
||||||
maintainer_email='rspeer@luminoso.com',
|
maintainer_email='rspeer@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
@ -46,7 +46,7 @@ setup(
|
|||||||
long_description=README_contents,
|
long_description=README_contents,
|
||||||
long_description_content_type='text/markdown',
|
long_description_content_type='text/markdown',
|
||||||
packages=['wordfreq'],
|
packages=['wordfreq'],
|
||||||
python_requires='>=3.3',
|
python_requires='>=3.5',
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
install_requires=dependencies,
|
install_requires=dependencies,
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ import pathlib
|
|||||||
import random
|
import random
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
|
|
||||||
from .tokens import tokenize, simple_tokenize, lossy_tokenize
|
from .tokens import tokenize, simple_tokenize, lossy_tokenize
|
||||||
from .language_info import get_language_info
|
from .language_info import get_language_info
|
||||||
@ -110,7 +111,7 @@ def available_languages(wordlist='best'):
|
|||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
|
def get_frequency_list(lang, wordlist='best', match_cutoff=None):
|
||||||
"""
|
"""
|
||||||
Read the raw data from a wordlist file, returning it as a list of
|
Read the raw data from a wordlist file, returning it as a list of
|
||||||
lists. (See `read_cBpack` for what this represents.)
|
lists. (See `read_cBpack` for what this represents.)
|
||||||
@ -120,10 +121,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=30):
|
|||||||
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
|
'pt_br', or even 'PT_BR' will get you the 'pt' (Portuguese) list.
|
||||||
Looking up the alternate code 'por' will also get the same list.
|
Looking up the alternate code 'por' will also get the same list.
|
||||||
"""
|
"""
|
||||||
|
if match_cutoff is not None:
|
||||||
|
warnings.warn(
|
||||||
|
"The `match_cutoff` parameter is deprecated",
|
||||||
|
DeprecationWarning
|
||||||
|
)
|
||||||
available = available_languages(wordlist)
|
available = available_languages(wordlist)
|
||||||
best, score = langcodes.best_match(lang, list(available),
|
|
||||||
min_score=match_cutoff)
|
# TODO: decrease the maximum distance. This distance is so high just
|
||||||
if score == 0:
|
# because it allows a test where 'yue' matches 'zh', and maybe the
|
||||||
|
# distance between those is high because they shouldn't match.
|
||||||
|
best, _distance = langcodes.closest_match(
|
||||||
|
lang, list(available), max_distance=70
|
||||||
|
)
|
||||||
|
if best == 'und':
|
||||||
raise LookupError("No wordlist %r available for language %r"
|
raise LookupError("No wordlist %r available for language %r"
|
||||||
% (wordlist, lang))
|
% (wordlist, lang))
|
||||||
|
|
||||||
@ -192,13 +203,18 @@ def freq_to_zipf(freq):
|
|||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def get_frequency_dict(lang, wordlist='best', match_cutoff=30):
|
def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
|
||||||
"""
|
"""
|
||||||
Get a word frequency list as a dictionary, mapping tokens to
|
Get a word frequency list as a dictionary, mapping tokens to
|
||||||
frequencies as floating-point probabilities.
|
frequencies as floating-point probabilities.
|
||||||
"""
|
"""
|
||||||
|
if match_cutoff is not None:
|
||||||
|
warnings.warn(
|
||||||
|
"The `match_cutoff` parameter is deprecated",
|
||||||
|
DeprecationWarning
|
||||||
|
)
|
||||||
freqs = {}
|
freqs = {}
|
||||||
pack = get_frequency_list(lang, wordlist, match_cutoff)
|
pack = get_frequency_list(lang, wordlist)
|
||||||
for index, bucket in enumerate(pack):
|
for index, bucket in enumerate(pack):
|
||||||
freq = cB_to_freq(-index)
|
freq = cB_to_freq(-index)
|
||||||
for word in bucket:
|
for word in bucket:
|
||||||
|
Loading…
Reference in New Issue
Block a user