use ftfy's uncurl_quotes in lossy_tokenize

Elia Robyn Speer 2021-09-02 17:47:47 +00:00
parent 6f1f626f1b
commit c2a9fe03f1
3 changed files with 12 additions and 3 deletions


@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                         encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
 ]
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.5.1',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@arborelia.net',
     url='http://github.com/LuminosoInsight/wordfreq/',


@@ -185,6 +185,11 @@ def test_number_smashing():
     assert word_frequency('24601', 'en') == word_frequency('90210', 'en')


+def test_uncurl_quotes():
+    assert lossy_tokenize("let’s", 'en') == ["let's"]
+    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')
+
+
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0


@@ -2,6 +2,7 @@ import regex
 import unicodedata
 import logging
 import langcodes
+from ftfy.fixes import uncurl_quotes

 from .language_info import (
     get_language_info,
@@ -306,6 +307,9 @@ def lossy_tokenize(
     - In Chinese, unless Traditional Chinese is specifically requested using
       'zh-Hant', all characters will be converted to Simplified Chinese.
+
+    - Curly quotes will be converted to straight quotes, and in particular
+      ’ will be converted to ', in order to match the input data.
     """
     global _simplify_chinese
@@ -317,4 +321,4 @@ def lossy_tokenize(
     tokens = [_simplify_chinese(token) for token in tokens]

-    return [smash_numbers(token) for token in tokens]
+    return [uncurl_quotes(smash_numbers(token)) for token in tokens]