mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
update packaging, try to handle digits better
This commit is contained in:
parent
11a3138cea
commit
91195c793d
26
pyproject.toml
Normal file
26
pyproject.toml
Normal file
@ -0,0 +1,26 @@
|
||||
[tool.poetry]
|
||||
name = "wordfreq"
|
||||
version = "2.6.0"
|
||||
description = "Look up the frequencies of words in many languages, based on many sources of data."
|
||||
authors = ["Robyn Speer <rspeer@arborelia.net>"]
|
||||
license = "MIT"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.7"
|
||||
msgpack = ">= 1.0"
|
||||
langcodes = ">= 3.0"
|
||||
regex = ">= 2020.04.04"
|
||||
ftfy = ">= 3.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^6.2.5"
|
||||
mecab-python3 = "^1.0.4"
|
||||
jieba = ">= 0.42"
|
||||
ipadic = "^1.0.0"
|
||||
mecab-ko-dic = "^1.0.0"
|
||||
ipython = ">=7"
|
||||
black = "^22.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
4
setup.py
4
setup.py
@ -33,10 +33,10 @@ dependencies = [
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='2.5.1',
|
||||
version='2.6.0',
|
||||
maintainer='Robyn Speer',
|
||||
maintainer_email='rspeer@arborelia.net',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
url='http://github.com/rspeer/wordfreq/',
|
||||
license="MIT",
|
||||
platforms=["any"],
|
||||
description=doclines[0],
|
||||
|
@ -12,6 +12,7 @@ import warnings
|
||||
|
||||
from .tokens import tokenize, simple_tokenize, lossy_tokenize
|
||||
from .language_info import get_language_info
|
||||
from .preprocess import num_generic_digits
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -242,6 +243,7 @@ _wf_cache = {}
|
||||
|
||||
def _word_frequency(word, lang, wordlist, minimum):
|
||||
tokens = lossy_tokenize(word, lang)
|
||||
digits = num_generic_digits(word)
|
||||
if not tokens:
|
||||
return minimum
|
||||
|
||||
@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum):
|
||||
if token not in freqs:
|
||||
# If any word is missing, just return the default value
|
||||
return minimum
|
||||
one_over_result += 1.0 / freqs[token]
|
||||
# spread the frequency of digits over all digit combinations
|
||||
freq = freqs[token] / (10. ** digits)
|
||||
one_over_result += 1.0 / freq
|
||||
|
||||
freq = 1.0 / one_over_result
|
||||
|
||||
|
@ -260,6 +260,14 @@ def _sub_zeroes(match):
|
||||
return DIGIT_RE.sub('0', match.group(0))
|
||||
|
||||
|
||||
def num_generic_digits(text):
    """
    Count the "generic digits" in `text`: the characters belonging to
    multi-digit sequences, which get replaced with 0 so that numbers of the
    same length are combined.

    The result is the total length of every match of MULTI_DIGIT_RE found
    in `text`, or 0 when nothing matches.
    """
    # NOTE(review): MULTI_DIGIT_RE is defined elsewhere in this module; every
    # character of a match is counted, so confirm whether its matches can
    # include non-digit characters.
    digit_runs = MULTI_DIGIT_RE.findall(text)
    return sum(map(len, digit_runs))
|
||||
|
||||
|
||||
def smash_numbers(text):
|
||||
"""
|
||||
Replace sequences of multiple digits with zeroes, so we don't need to
|
||||
|
Loading…
Reference in New Issue
Block a user