update packaging, try to handle digits better

This commit is contained in:
Elia Robyn Lake 2022-02-08 18:24:36 -05:00
parent 11a3138cea
commit 91195c793d
4 changed files with 41 additions and 3 deletions

26
pyproject.toml Normal file
View File

@ -0,0 +1,26 @@
# Package metadata consumed by Poetry.
[tool.poetry]
name = "wordfreq"
# NOTE(review): setup.py in this same commit also declares 2.6.0; keep the two in sync.
version = "2.6.0"
description = "Look up the frequencies of words in many languages, based on many sources of data."
authors = ["Robyn Speer <rspeer@arborelia.net>"]
license = "MIT"
# Requirements installed together with the package itself.
[tool.poetry.dependencies]
python = "^3.7"
msgpack = ">= 1.0"
langcodes = ">= 3.0"
regex = ">= 2020.04.04"
ftfy = ">= 3.0"
# Requirements installed only in a development checkout, not for users of the package.
[tool.poetry.dev-dependencies]
pytest = "^6.2.5"
mecab-python3 = "^1.0.4"
jieba = ">= 0.42"
ipadic = "^1.0.0"
mecab-ko-dic = "^1.0.0"
ipython = ">=7"
black = "^22.1.0"
# Build backend declaration: the project is built with poetry-core.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -33,10 +33,10 @@ dependencies = [
setup(
name="wordfreq",
version='2.5.1',
version='2.6.0',
maintainer='Robyn Speer',
maintainer_email='rspeer@arborelia.net',
url='http://github.com/LuminosoInsight/wordfreq/',
url='http://github.com/rspeer/wordfreq/',
license="MIT",
platforms=["any"],
description=doclines[0],

View File

@ -12,6 +12,7 @@ import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
from .preprocess import num_generic_digits
logger = logging.getLogger(__name__)
@ -242,6 +243,7 @@ _wf_cache = {}
def _word_frequency(word, lang, wordlist, minimum):
tokens = lossy_tokenize(word, lang)
digits = num_generic_digits(word)
if not tokens:
return minimum
@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum):
if token not in freqs:
# If any word is missing, just return the default value
return minimum
one_over_result += 1.0 / freqs[token]
# spread the frequency of digits over all digit combinations
freq = freqs[token] / (10. ** digits)
one_over_result += 1.0 / freq
freq = 1.0 / one_over_result

View File

@ -260,6 +260,14 @@ def _sub_zeroes(match):
return DIGIT_RE.sub('0', match.group(0))
def num_generic_digits(text):
    """
    Determine how many "generic digits" are in the text (digits that we
    replace with 0 to combine numbers of the same length).

    Only characters matching DIGIT_RE inside a MULTI_DIGIT_RE match are
    counted, because those are exactly the characters that `_sub_zeroes`
    rewrites to '0'. Any other character that a multi-digit match might
    contain does not add digit combinations, so it must not be counted.

    Returns an int, which is 0 when the text has no multi-digit sequence.
    """
    # finditer + group(0) always yields the whole matched span, whereas
    # findall would yield captured groups if MULTI_DIGIT_RE defined any.
    return sum(
        len(DIGIT_RE.findall(match.group(0)))
        for match in MULTI_DIGIT_RE.finditer(text)
    )
def smash_numbers(text):
"""
Replace sequences of multiple digits with zeroes, so we don't need to