From c2a9fe03f163e2fd66f4180bfde4d51dcdcba25c Mon Sep 17 00:00:00 2001
From: Elia Robyn Speer
Date: Thu, 2 Sep 2021 17:47:47 +0000
Subject: [PATCH] use ftfy's uncurl_quotes in lossy_tokenize

---
 setup.py              | 4 ++--
 tests/test_general.py | 5 +++++
 wordfreq/tokens.py    | 6 +++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index d4c5fac..8aa3f64 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
 ]
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.5.1',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@arborelia.net',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test_general.py b/tests/test_general.py
index badbb73..8783c74 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -185,6 +185,11 @@ def test_number_smashing():
     assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
 
 
+def test_uncurl_quotes():
+    assert lossy_tokenize("let’s", 'en') == ["let's"]
+    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')
+
+
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 30f9a09..25d945b 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -2,6 +2,7 @@ import regex
 import unicodedata
 import logging
 import langcodes
+from ftfy.fixes import uncurl_quotes
 
 from .language_info import (
     get_language_info,
@@ -306,6 +307,9 @@ def lossy_tokenize(
 
     - In Chinese, unless Traditional Chinese is specifically requested using
       'zh-Hant', all characters will be converted to Simplified Chinese.
+
+    - Curly quotes will be converted to straight quotes, and in particular ’
+      will be converted to ', in order to match the input data.
     """
     global _simplify_chinese
 
@@ -317,4 +321,4 @@
 
         tokens = [_simplify_chinese(token) for token in tokens]
 
-    return [smash_numbers(token) for token in tokens]
+    return [uncurl_quotes(smash_numbers(token)) for token in tokens]
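
A minimal usage sketch of the behavior this patch introduces, assuming the patched wordfreq (2.5.1) and ftfy are installed; the "let’s" example mirrors the new test case and is illustrative only.

    # Sketch only: assumes wordfreq 2.5.1 with this patch applied, plus ftfy.
    from ftfy.fixes import uncurl_quotes
    from wordfreq import word_frequency
    from wordfreq.tokens import lossy_tokenize

    # uncurl_quotes maps curly quotation marks to their ASCII equivalents,
    # so the curly apostrophe (U+2019) in "let’s" becomes a straight one.
    assert uncurl_quotes("let’s") == "let's"

    # lossy_tokenize now applies uncurl_quotes to each token, so both
    # spellings resolve to the same wordlist entry and the same frequency.
    assert lossy_tokenize("let’s", 'en') == ["let's"]
    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')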