From d30183a7d736df8ddec38a8d7364d797c74e4d37 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 25 Oct 2018 11:07:55 -0400 Subject: [PATCH 1/3] Allow a wider range of 'regex' versions The behavior of segmentation shouldn't change within this range, and it includes the version currently used by SpaCy. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d5352df..7717c64 100755 --- a/setup.py +++ b/setup.py @@ -28,14 +28,14 @@ README_contents = open(os.path.join(current_dir, 'README.md'), encoding='utf-8').read() doclines = README_contents.split("\n") dependencies = [ - 'msgpack', 'langcodes >= 1.4.1', 'regex == 2018.02.21' + 'msgpack', 'langcodes >= 1.4.1', 'regex >= 2017.07.11, <= 2018.02.21' ] if sys.version_info < (3, 4): dependencies.append('pathlib') setup( name="wordfreq", - version='2.2.0', + version='2.2.1', maintainer='Robyn Speer', maintainer_email='rspeer@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', From 61a1604b38dce0c2927a05086b8e0549f4016ccb Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Mon, 4 Feb 2019 14:57:38 -0500 Subject: [PATCH 2/3] update encoding='utf-8' to raw=False --- wordfreq/__init__.py | 2 +- wordfreq/chinese.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 6a993f1..a72770f 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -74,7 +74,7 @@ def read_cBpack(filename): ] """ with gzip.open(filename, 'rb') as infile: - data = msgpack.load(infile, encoding='utf-8') + data = msgpack.load(infile, raw=False) header = data[0] if ( not isinstance(header, dict) or header.get('format') != 'cB' diff --git a/wordfreq/chinese.py b/wordfreq/chinese.py index 9f7b95a..c8215fc 100644 --- a/wordfreq/chinese.py +++ b/wordfreq/chinese.py @@ -6,7 +6,7 @@ import gzip DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt') ORIG_DICT_FILENAME = resource_filename('wordfreq', 
'data/jieba_zh_orig.txt') SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz') -SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8') +SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False) jieba_tokenizer = None jieba_orig_tokenizer = None From dd72051929123571eb5b8b5c7dd65f231d4ec5b2 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Tue, 5 Feb 2019 11:16:22 -0500 Subject: [PATCH 3/3] update msgpack call in scripts/make_chinese_mapping --- scripts/make_chinese_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/make_chinese_mapping.py b/scripts/make_chinese_mapping.py index 4a17d4f..3d78708 100644 --- a/scripts/make_chinese_mapping.py +++ b/scripts/make_chinese_mapping.py @@ -36,7 +36,7 @@ def make_hanzi_converter(table_in, msgpack_out): if chr(codept) != char: table[codept] = char with gzip.open(msgpack_out, 'wb') as outfile: - msgpack.dump(table, outfile, encoding='utf-8') + msgpack.dump(table, outfile) def build():