mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
update dependencies and test for consistent results
This commit is contained in:
parent
ca4681b361
commit
ad02d96f1b
@ -1,3 +1,8 @@
|
|||||||
|
## Version 2.3.3 (2020-09-08)
|
||||||
|
|
||||||
|
- Set minimum version requirements on `regex`, `jieba`, and `langcodes`
|
||||||
|
so that tokenization will give consistent results.
|
||||||
|
|
||||||
## Version 2.3.2 (2020-04-28)
|
## Version 2.3.2 (2020-04-28)
|
||||||
|
|
||||||
- Relaxing the dependency on regex had an unintended consequence in 2.3.1:
|
- Relaxing the dependency on regex had an unintended consequence in 2.3.1:
|
||||||
|
6
setup.py
6
setup.py
@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
|
|||||||
encoding='utf-8').read()
|
encoding='utf-8').read()
|
||||||
doclines = README_contents.split("\n")
|
doclines = README_contents.split("\n")
|
||||||
dependencies = [
|
dependencies = [
|
||||||
'msgpack >= 1.0', 'langcodes >= 2', 'regex'
|
'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
|
||||||
]
|
]
|
||||||
if sys.version_info < (3, 4):
|
if sys.version_info < (3, 4):
|
||||||
dependencies.append('pathlib')
|
dependencies.append('pathlib')
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='2.3.2',
|
version='2.3.3',
|
||||||
maintainer='Robyn Speer',
|
maintainer='Robyn Speer',
|
||||||
maintainer_email='rspeer@luminoso.com',
|
maintainer_email='rspeer@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
@ -58,7 +58,7 @@ setup(
|
|||||||
# Similarly, jieba is required for Chinese word frequencies.
|
# Similarly, jieba is required for Chinese word frequencies.
|
||||||
extras_require={
|
extras_require={
|
||||||
'mecab': 'mecab-python3',
|
'mecab': 'mecab-python3',
|
||||||
'jieba': 'jieba'
|
'jieba': 'jieba >= 0.42'
|
||||||
},
|
},
|
||||||
tests_require=['pytest', 'mecab-python3', 'jieba'],
|
tests_require=['pytest', 'mecab-python3', 'jieba'],
|
||||||
)
|
)
|
||||||
|
@ -9,11 +9,24 @@ def test_apostrophes():
|
|||||||
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
|
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
|
||||||
assert tokenize("l'heure", 'fr') == ['l', 'heure']
|
assert tokenize("l'heure", 'fr') == ['l', 'heure']
|
||||||
assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
|
assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
|
||||||
|
assert tokenize("l'anima", 'it') == ['l', 'anima']
|
||||||
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
|
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
|
||||||
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
|
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
|
||||||
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
|
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
|
||||||
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
|
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
|
||||||
|
|
||||||
|
# This next behavior is not ideal -- we would prefer "dell'" to be handled
|
||||||
|
# the same as "l'" -- but this is the most consistent result we can get without
|
||||||
|
# Italian-specific rules.
|
||||||
|
#
|
||||||
|
# Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
|
||||||
|
# is better but inconsistent.
|
||||||
|
assert tokenize("dell'anima", 'it') == ["dell'anima"]
|
||||||
|
|
||||||
|
# Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
|
||||||
|
# an example of why we don't want the apostrophe-vowel fix to apply everywhere.
|
||||||
|
assert tokenize("hawai'i", 'en') == ["hawai'i"]
|
||||||
|
|
||||||
|
|
||||||
def test_catastrophes():
|
def test_catastrophes():
|
||||||
# More apostrophes, but this time they're in Catalan, and there's other
|
# More apostrophes, but this time they're in Catalan, and there's other
|
@ -77,3 +77,15 @@ def test_alternate_codes():
|
|||||||
# Separate codes for Mandarin and Cantonese
|
# Separate codes for Mandarin and Cantonese
|
||||||
assert tokenize('谢谢谢谢', 'cmn') == tokens
|
assert tokenize('谢谢谢谢', 'cmn') == tokens
|
||||||
assert tokenize('谢谢谢谢', 'yue') == tokens
|
assert tokenize('谢谢谢谢', 'yue') == tokens
|
||||||
|
|
||||||
|
|
||||||
|
def test_hyphens():
|
||||||
|
# An edge case of Chinese tokenization that changed sometime around
|
||||||
|
# jieba 0.42.
|
||||||
|
|
||||||
|
tok = tokenize('--------', 'zh', include_punctuation=True)
|
||||||
|
assert tok == ['-'] * 8
|
||||||
|
|
||||||
|
tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
|
||||||
|
assert tok == ['--------']
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from langcodes import Language, best_match
|
from langcodes import Language, closest_match
|
||||||
|
|
||||||
|
|
||||||
# Text in scripts written without spaces has to be handled specially in our
|
# Text in scripts written without spaces has to be handled specially in our
|
||||||
@ -45,7 +45,7 @@ EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
|
|||||||
# happens in ConceptNet.
|
# happens in ConceptNet.
|
||||||
|
|
||||||
|
|
||||||
def _language_in_list(language, targets, min_score=80):
|
def _language_in_list(language, targets, max_distance=10):
|
||||||
"""
|
"""
|
||||||
A helper function to determine whether this language matches one of the
|
A helper function to determine whether this language matches one of the
|
||||||
target languages, with a match score above a certain threshold.
|
target languages, with a language distance within a certain threshold.
|
||||||
@ -53,8 +53,8 @@ def _language_in_list(language, targets, min_score=80):
|
|||||||
The languages can be given as strings (language tags) or as Language
|
The languages can be given as strings (language tags) or as Language
|
||||||
objects. `targets` can be any iterable of such languages.
|
objects. `targets` can be any iterable of such languages.
|
||||||
"""
|
"""
|
||||||
matched = best_match(language, targets, min_score=min_score)
|
matched = closest_match(language, targets, max_distance=max_distance)
|
||||||
return matched[1] > 0
|
return matched[0] != 'und'
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
|
Loading…
Reference in New Issue
Block a user