diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5460ffc..2fb7188 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,18 @@
+## Version 2.5.1 (2021-09-02)
+
+- Import ftfy and use its `uncurl_quotes` method to turn curly quotes into
+  straight ones, providing consistency with multiple forms of apostrophes.
+
+- Set minimum version requirements on `regex`, `jieba`, and `langcodes`
+  so that tokenization will give consistent results.
+
+- Work around an inconsistency in the `msgpack` API around
+  `strict_map_key=False`.
+
+## Version 2.5 (2021-04-15)
+
+- Incorporate data from the OSCAR corpus.
+
 ## Version 2.4.2 (2021-02-19)
 
 - When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
diff --git a/setup.py b/setup.py
index 8aa3f64..b938402 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
 ]
 setup(
     name="wordfreq",
-    version='2.3.3',
+    version='2.5.1',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@arborelia.net',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test_french_and_related.py b/tests/test_apostrophes.py
similarity index 70%
rename from tests/test_french_and_related.py
rename to tests/test_apostrophes.py
index c27ecae..0c6b9b7 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_apostrophes.py
@@ -9,11 +9,24 @@ def test_apostrophes():
     assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
     assert tokenize("l'heure", 'fr') == ['l', 'heure']
     assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
+    assert tokenize("l'anima", 'it') == ['l', 'anima']
     assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
     assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
     assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
     assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
 
+    # This next behavior is not ideal -- we would prefer "dell'" to be handled
+    # the same as "l'" -- but this is the most consistent result we can get without
+    # Italian-specific rules.
+    #
+    # Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
+    # is better but inconsistent.
+    assert tokenize("dell'anima", 'it') == ["dell'anima"]
+
+    # Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
+    # an example of why we don't want the apostrophe-vowel fix to apply everywhere.
+    assert tokenize("hawai'i", 'en') == ["hawai'i"]
+
 
 def test_catastrophes():
     # More apostrophes, but this time they're in Catalan, and there's other
diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 4bde6c2..a6ff202 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -87,3 +87,14 @@ def test_unreasonably_long():
 
     assert word_frequency(lots_of_ls, 'zh') == 0.
     assert zipf_frequency(lots_of_ls, 'zh') == 0.
+
+def test_hyphens():
+    # An edge case of Chinese tokenization that changed sometime around
+    # jieba 0.42.
+
+    tok = tokenize('--------', 'zh', include_punctuation=True)
+    assert tok == ['-'] * 8
+
+    tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
+    assert tok == ['--------']
+
diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py
index 61e931a..95b6b5d 100644
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -6,7 +6,11 @@ import gzip
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
 ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
-SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+try:
+    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
+except TypeError:
+    # work around incompatibility between pure-Python msgpack and C msgpack
+    SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
 
 jieba_tokenizer = None
 jieba_orig_tokenizer = None
diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py
index 3b736be..73a7b69 100644
--- a/wordfreq/language_info.py
+++ b/wordfreq/language_info.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from langcodes import Language, best_match
+from langcodes import Language, closest_match
 
 
 # Text in scripts written without spaces has to be handled specially in our
@@ -45,7 +45,7 @@ EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
 # happens in ConceptNet.
 
 
-def _language_in_list(language, targets, min_score=80):
+def _language_in_list(language, targets, max_distance=10):
     """
     A helper function to determine whether this language matches one of the
     target languages, with a match score above a certain threshold.
@@ -53,8 +53,8 @@
     The languages can be given as strings (language tags) or as Language
     objects. `targets` can be any iterable of such languages.
     """
-    matched = best_match(language, targets, min_score=min_score)
-    return matched[1] > 0
+    matched = closest_match(language, targets, max_distance=max_distance)
+    return matched[0] != 'und'
 
 
 @lru_cache(maxsize=None)
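As a rough illustration of the `uncurl_quotes` behavior described in the 2.5.1 changelog entry, here is a minimal sketch (not part of the patch; it assumes ftfy's `uncurl_quotes` fixer and wordfreq's top-level `tokenize`, and the sample string is made up):

    from ftfy.fixes import uncurl_quotes
    from wordfreq import tokenize

    # The same sentence with a curly apostrophe (U+2019) and a straight one.
    curly = "This isn\u2019t French"
    straight = uncurl_quotes(curly)   # -> "This isn't French"

    # With apostrophes normalized, both spellings should tokenize identically.
    assert tokenize(straight, 'en') == ['this', "isn't", 'french']
    assert tokenize(curly, 'en') == tokenize(straight, 'en')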
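The `language_info.py` change swaps the score-based `best_match` call for langcodes' `closest_match`, which returns a `(tag, distance)` pair and reports `'und'` when nothing is within `max_distance`. A small sketch of that return convention, assuming langcodes >= 3.0 (the target lists here are made up for illustration):

    from langcodes import closest_match

    # An exact match comes back with distance 0.
    match, distance = closest_match('fr', ['fr', 'en'], max_distance=10)
    assert match == 'fr'

    # When no supported language is within max_distance, the tag is 'und',
    # which is the condition _language_in_list now checks.
    match, distance = closest_match('ja', ['fr', 'en'], max_distance=10)
    assert match == 'und'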