mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
use ftfy's uncurl_quotes in lossy_tokenize
commit c2a9fe03f1
parent 6f1f626f1b
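For context, a minimal sketch of what ftfy's uncurl_quotes does on its own; this is the helper the diff below wires into lossy_tokenize, and the sample strings here are illustrative, not taken from the commit:

    # uncurl_quotes maps curly quote characters to their ASCII equivalents
    # and leaves all other characters untouched.
    from ftfy.fixes import uncurl_quotes

    print(uncurl_quotes("let’s go"))    # -> let's go
    print(uncurl_quotes("“quoted”"))    # -> "quoted"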
setup.py (4 changed lines)
@@ -28,12 +28,12 @@ README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = [
-    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04'
+    'msgpack >= 1.0', 'langcodes >= 3.0', 'regex >= 2020.04.04', 'ftfy >= 3.0'
 ]
 
 setup(
     name="wordfreq",
-    version='2.5.0',
+    version='2.5.1',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@arborelia.net',
     url='http://github.com/LuminosoInsight/wordfreq/',
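The dependency change means installing wordfreq 2.5.1 pulls in ftfy as well. A quick, hedged way to confirm the new floor is satisfied in an environment, using only the standard library (the minimum '3.0' comes from the requirement line above):

    # Confirm the installed ftfy meets the floor declared in setup.py.
    from importlib.metadata import version
    print(version("ftfy"))  # expect 3.0 or newer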
@@ -185,6 +185,11 @@ def test_number_smashing():
     assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
 
 
+def test_uncurl_quotes():
+    assert lossy_tokenize("let’s", 'en') == ["let's"]
+    assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')
+
+
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
     assert ff > 0
@@ -2,6 +2,7 @@ import regex
 import unicodedata
 import logging
 import langcodes
+from ftfy.fixes import uncurl_quotes
 
 from .language_info import (
     get_language_info,
@@ -306,6 +307,9 @@ def lossy_tokenize(
     - In Chinese, unless Traditional Chinese is specifically requested using
       'zh-Hant', all characters will be converted to Simplified Chinese.
 
+    - Curly quotes will be converted to straight quotes, and in particular ’
+      will be converted to ', in order to match the input data.
+
     """
     global _simplify_chinese
 
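The docstring addition above describes behavior that applies only to lossy_tokenize, in wordfreq's tokens module. A short sketch of the distinction, assuming this build of wordfreq; the lossy_tokenize output mirrors the new test, while the tokenize line is an inference from the fact that this commit touches only lossy_tokenize's return value:

    from wordfreq import tokenize, lossy_tokenize

    # lossy_tokenize applies the destructive steps listed in the docstring,
    # including the new quote-uncurling.
    print(lossy_tokenize("let’s", 'en'))   # -> ["let's"]

    # tokenize is assumed to preserve the curly quote, since nothing in
    # this diff changes it.
    print(tokenize("let’s", 'en'))         # -> ["let’s"] (assumption)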
@@ -317,4 +321,4 @@ def lossy_tokenize(
 
         tokens = [_simplify_chinese(token) for token in tokens]
 
-    return [smash_numbers(token) for token in tokens]
+    return [uncurl_quotes(smash_numbers(token)) for token in tokens]
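The new return line composes the two lossy steps per token: numbers are smashed first, then quotes are uncurled. A self-contained sketch of that per-token pipeline, using the real ftfy helper but a hypothetical stand-in for wordfreq's internal smash_numbers:

    import re
    from ftfy.fixes import uncurl_quotes

    def smash_numbers(token):
        # Hypothetical stand-in: collapse any multi-digit run to zeros, so
        # '24601' and '90210' share one frequency bucket, matching the
        # test_number_smashing assertion above.
        return re.sub(r'\d\d+', lambda m: '0' * len(m.group(0)), token)

    for token in ["let’s", "24601", "90210"]:
        print(uncurl_quotes(smash_numbers(token)))
    # -> let's
    # -> 00000
    # -> 00000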