From c2a9fe03f163e2fd66f4180bfde4d51dcdcba25c Mon Sep 17 00:00:00 2001
From: Elia Robyn Speer
Date: Thu, 2 Sep 2021 17:47:47 +0000
Subject: [PATCH] use ftfy's uncurl_quotes in lossy_tokenize

---
 setup.py              | 4 ++--
 tests/test_general.py | 5 +++++
 wordfreq/tokens.py    | 6 +++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index d4c5fac..8aa3f64 100755
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
 ]
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.5.1',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@arborelia.net',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test_general.py b/tests/test_general.py
index badbb73..8783c74 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -185,6 +185,11 @@ def test_number_smashing():
     assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
 
 
+def test_uncurl_quotes():
+    assert lossy_tokenize("let’s", 'en') == ["let's"]
+    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')
+
+
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 30f9a09..25d945b 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -2,6 +2,7 @@ import regex
 import unicodedata
 import logging
 import langcodes
+from ftfy.fixes import uncurl_quotes
 
 from .language_info import (
     get_language_info,
@@ -306,6 +307,9 @@ def lossy_tokenize(
 
     - In Chinese, unless Traditional Chinese is specifically requested using
       'zh-Hant', all characters will be converted to Simplified Chinese.
+
+    - Curly quotes will be converted to straight quotes, and in particular ’
+      will be converted to ', in order to match the input data.
     """
     global _simplify_chinese
 
@@ -317,4 +321,4 @@
 
         tokens = [_simplify_chinese(token) for token in tokens]
 
-    return [smash_numbers(token) for token in tokens]
+    return [uncurl_quotes(smash_numbers(token)) for token in tokens]
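
A minimal usage sketch of the behavior this patch introduces, assuming the patched wordfreq (2.5.1) and ftfy are installed; the "let’s" example mirrors the new test case and is illustrative only.

    # Sketch only: assumes wordfreq 2.5.1 with this patch applied, plus ftfy.
    from ftfy.fixes import uncurl_quotes
    from wordfreq import word_frequency
    from wordfreq.tokens import lossy_tokenize

    # uncurl_quotes maps curly quotation marks to their ASCII equivalents,
    # so the curly apostrophe (U+2019) in "let’s" becomes a straight one.
    assert uncurl_quotes("let’s") == "let's"

    # lossy_tokenize now applies uncurl_quotes to each token, so both
    # spellings resolve to the same wordlist entry and the same frequency.
    assert lossy_tokenize("let’s", 'en') == ["let's"]
    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')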