mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge remote-tracking branch 'origin/apostrophe-consistency'
This commit is contained in:
commit
cc4f39d8c2
15
CHANGELOG.md
15
CHANGELOG.md
@ -1,3 +1,18 @@
|
||||
## Version 2.5.1 (2021-09-02)
|
||||
|
||||
- Import ftfy and use its `uncurl_quotes` method to turn curly quotes into
|
||||
straight ones, providing consistency with multiple forms of apostrophes.
|
||||
|
||||
- Set minimum version requirements on `regex`, `jieba`, and `langcodes`
|
||||
so that tokenization will give consistent results.
|
||||
|
||||
- Work around an inconsistency in the `msgpack` API around
|
||||
`strict_map_key=False`.
|
||||
|
||||
## Version 2.5 (2021-04-15)
|
||||
|
||||
- Incorporate data from the OSCAR corpus.
|
||||
|
||||
## Version 2.4.2 (2021-02-19)
|
||||
|
||||
- When tokenizing Japanese or Korean, MeCab's dictionaries no longer have to
|
||||
|
8
setup.py
8
setup.py
@ -28,12 +28,20 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
|
||||
encoding='utf-8').read()
|
||||
doclines = README_contents.split("\n")
|
||||
dependencies = [
|
||||
<<<<<<< HEAD
|
||||
'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
|
||||
=======
|
||||
'msgpack >= 1.0', 'langcodes >= 2.1', 'regex >= 2020.04.04'
|
||||
>>>>>>> origin/apostrophe-consistency
|
||||
]
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
<<<<<<< HEAD
|
||||
version='2.5.1',
|
||||
=======
|
||||
version='2.3.3',
|
||||
>>>>>>> origin/apostrophe-consistency
|
||||
maintainer='Robyn Speer',
|
||||
maintainer_email='rspeer@arborelia.net',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@ -9,11 +9,24 @@ def test_apostrophes():
|
||||
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
|
||||
assert tokenize("l'heure", 'fr') == ['l', 'heure']
|
||||
assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
|
||||
assert tokenize("l'anima", 'it') == ['l', 'anima']
|
||||
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
|
||||
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
|
||||
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
|
||||
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
|
||||
|
||||
# This next behavior is not ideal -- we would prefer "dell'" to be handled
|
||||
# the same as "l'" -- but this is the most consistent result we can get without
|
||||
# Italian-specific rules.
|
||||
#
|
||||
# Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
|
||||
# is better but inconsistent.
|
||||
assert tokenize("dell'anima", 'it') == ["dell'anima"]
|
||||
|
||||
# Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
|
||||
# an example of why we don't want the apostrophe-vowel fix to apply everywhere.
|
||||
assert tokenize("hawai'i", 'en') == ["hawai'i"]
|
||||
|
||||
|
||||
def test_catastrophes():
|
||||
# More apostrophes, but this time they're in Catalan, and there's other
|
@ -87,3 +87,14 @@ def test_unreasonably_long():
|
||||
assert word_frequency(lots_of_ls, 'zh') == 0.
|
||||
assert zipf_frequency(lots_of_ls, 'zh') == 0.
|
||||
|
||||
|
||||
def test_hyphens():
|
||||
# An edge case of Chinese tokenization that changed sometime around
|
||||
# jieba 0.42.
|
||||
|
||||
tok = tokenize('--------', 'zh', include_punctuation=True)
|
||||
assert tok == ['-'] * 8
|
||||
|
||||
tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
|
||||
assert tok == ['--------']
|
||||
|
||||
|
@ -6,7 +6,11 @@ import gzip
|
||||
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
|
||||
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
|
||||
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
|
||||
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
|
||||
try:
|
||||
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
|
||||
except TypeError:
|
||||
# work around incompatibility between pure-Python msgpack and C msgpack
|
||||
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
|
||||
jieba_tokenizer = None
|
||||
jieba_orig_tokenizer = None
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
from functools import lru_cache
|
||||
from langcodes import Language, best_match
|
||||
from langcodes import Language, closest_match
|
||||
|
||||
|
||||
# Text in scripts written without spaces has to be handled specially in our
|
||||
@ -45,7 +45,7 @@ EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
|
||||
# happens in ConceptNet.
|
||||
|
||||
|
||||
def _language_in_list(language, targets, min_score=80):
|
||||
def _language_in_list(language, targets, max_distance=10):
|
||||
"""
|
||||
A helper function to determine whether this language matches one of the
|
||||
target languages, with a match distance within a certain threshold.
|
||||
@ -53,8 +53,8 @@ def _language_in_list(language, targets, min_score=80):
|
||||
The languages can be given as strings (language tags) or as Language
|
||||
objects. `targets` can be any iterable of such languages.
|
||||
"""
|
||||
matched = best_match(language, targets, min_score=min_score)
|
||||
return matched[1] > 0
|
||||
matched = closest_match(language, targets, max_distance=max_distance)
|
||||
return matched[0] != 'und'
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
|
Loading…
Reference in New Issue
Block a user