mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
update packaging, try to handle digits better
This commit is contained in:
parent
2361606b3a
commit
3c4819e7e5
26
pyproject.toml
Normal file
26
pyproject.toml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
[tool.poetry]
|
||||||
|
name = "wordfreq"
|
||||||
|
version = "2.6.0"
|
||||||
|
description = "Look up the frequencies of words in many languages, based on many sources of data."
|
||||||
|
authors = ["Robyn Speer <rspeer@arborelia.net>"]
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.7"
|
||||||
|
msgpack = ">= 1.0"
|
||||||
|
langcodes = ">= 3.0"
|
||||||
|
regex = ">= 2020.04.04"
|
||||||
|
ftfy = ">= 3.0"
|
||||||
|
|
||||||
|
[tool.poetry.dev-dependencies]
|
||||||
|
pytest = "^6.2.5"
|
||||||
|
mecab-python3 = "^1.0.4"
|
||||||
|
jieba = ">= 0.42"
|
||||||
|
ipadic = "^1.0.0"
|
||||||
|
mecab-ko-dic = "^1.0.0"
|
||||||
|
ipython = ">=7"
|
||||||
|
black = "^22.1.0"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=1.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
4
setup.py
4
setup.py
@ -33,10 +33,10 @@ dependencies = [
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='2.5.1',
|
version='2.6.0',
|
||||||
maintainer='Robyn Speer',
|
maintainer='Robyn Speer',
|
||||||
maintainer_email='rspeer@arborelia.net',
|
maintainer_email='rspeer@arborelia.net',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/rspeer/wordfreq/',
|
||||||
license="MIT",
|
license="MIT",
|
||||||
platforms=["any"],
|
platforms=["any"],
|
||||||
description=doclines[0],
|
description=doclines[0],
|
||||||
|
@ -12,6 +12,7 @@ import warnings
|
|||||||
|
|
||||||
from .tokens import tokenize, simple_tokenize, lossy_tokenize
|
from .tokens import tokenize, simple_tokenize, lossy_tokenize
|
||||||
from .language_info import get_language_info
|
from .language_info import get_language_info
|
||||||
|
from .preprocess import num_generic_digits
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -242,6 +243,7 @@ _wf_cache = {}
|
|||||||
|
|
||||||
def _word_frequency(word, lang, wordlist, minimum):
|
def _word_frequency(word, lang, wordlist, minimum):
|
||||||
tokens = lossy_tokenize(word, lang)
|
tokens = lossy_tokenize(word, lang)
|
||||||
|
digits = num_generic_digits(word)
|
||||||
if not tokens:
|
if not tokens:
|
||||||
return minimum
|
return minimum
|
||||||
|
|
||||||
@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum):
|
|||||||
if token not in freqs:
|
if token not in freqs:
|
||||||
# If any word is missing, just return the default value
|
# If any word is missing, just return the default value
|
||||||
return minimum
|
return minimum
|
||||||
one_over_result += 1.0 / freqs[token]
|
# spread the frequency of digits over all digit combinations
|
||||||
|
freq = freqs[token] / (10. ** digits)
|
||||||
|
one_over_result += 1.0 / freq
|
||||||
|
|
||||||
freq = 1.0 / one_over_result
|
freq = 1.0 / one_over_result
|
||||||
|
|
||||||
|
@ -260,6 +260,14 @@ def _sub_zeroes(match):
|
|||||||
return DIGIT_RE.sub('0', match.group(0))
|
return DIGIT_RE.sub('0', match.group(0))
|
||||||
|
|
||||||
|
|
||||||
|
def num_generic_digits(text):
    """
    Determine how many "generic digits" are in the text (digits that we
    replace with 0 to combine numbers of the same length).

    Only the characters that DIGIT_RE matches inside each MULTI_DIGIT_RE
    match are counted, because those are the characters `_sub_zeroes`
    rewrites to '0'. Any other characters that are part of a multi-digit
    match are not digits and must not inflate the count.

    NOTE(review): assumes DIGIT_RE matches one digit per match, as its use
    in `_sub_zeroes` suggests -- confirm against its definition.
    """
    # finditer + group(0) always yields the whole matched span; findall
    # would return capture groups instead if the pattern ever gained any.
    return sum(
        len(DIGIT_RE.findall(match.group(0)))
        for match in MULTI_DIGIT_RE.finditer(text)
    )
|
||||||
|
|
||||||
|
|
||||||
def smash_numbers(text):
|
def smash_numbers(text):
|
||||||
"""
|
"""
|
||||||
Replace sequences of multiple digits with zeroes, so we don't need to
|
Replace sequences of multiple digits with zeroes, so we don't need to
|
||||||
|
Loading…
Reference in New Issue
Block a user