update packaging, try to handle digits better

2024-12-23 09:21:37 +00:00 · 2022-02-08 18:24:36 -05:00 · 2022-02-08 18:24:36 -05:00 · 3c4819e7e5
commit 3c4819e7e5
parent 2361606b3a
4 changed files with 41 additions and 3 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,26 @@
 [tool.poetry]
 name = "wordfreq"
 version = "2.6.0"
 description = "Look up the frequencies of words in many languages, based on many sources of data."
 authors = ["Robyn Speer <rspeer@arborelia.net>"]
 license = "MIT"
 [tool.poetry.dependencies]
 python = "^3.7"
 msgpack = ">= 1.0"
 langcodes = ">= 3.0"
 regex = ">= 2020.04.04"
 ftfy = ">= 3.0"
 [tool.poetry.dev-dependencies]
 pytest = "^6.2.5"
 mecab-python3 = "^1.0.4"
 jieba = ">= 0.42"
 ipadic = "^1.0.0"
 mecab-ko-dic = "^1.0.0"
 ipython = ">=7"
 black = "^22.1.0"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
--- a/setup.py
+++ b/setup.py
@ -33,10 +33,10 @@ dependencies = [
 setup(
    name="wordfreq",
-    version='2.5.1',
+    version='2.6.0',
    maintainer='Robyn Speer',
    maintainer_email='rspeer@arborelia.net',
-    url='http://github.com/LuminosoInsight/wordfreq/',
+    url='http://github.com/rspeer/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@ -12,6 +12,7 @@ import warnings
 from .tokens import tokenize, simple_tokenize, lossy_tokenize
 from .language_info import get_language_info
 from .preprocess import num_generic_digits
 logger = logging.getLogger(__name__)
@ -242,6 +243,7 @@ _wf_cache = {}
 def _word_frequency(word, lang, wordlist, minimum):
    tokens = lossy_tokenize(word, lang)
    digits = num_generic_digits(word)
    if not tokens:
        return minimum
@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum):
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
-        one_over_result += 1.0 / freqs[token]
+        # spread the frequency of digits over all digit combinations
        freq = freqs[token] / (10. ** digits)
        one_over_result += 1.0 / freq
    freq = 1.0 / one_over_result
--- a/wordfreq/preprocess.py
+++ b/wordfreq/preprocess.py
@ -260,6 +260,14 @@ def _sub_zeroes(match):
    return DIGIT_RE.sub('0', match.group(0))
 def num_generic_digits(text):
    """
    Count the "generic digits" in the text: the characters belonging to
    multi-digit sequences, which we replace with 0 so that numbers of the
    same length are combined.

    The result is the total length of every MULTI_DIGIT_RE match in `text`.
    """
    digit_runs = MULTI_DIGIT_RE.findall(text)
    return sum(map(len, digit_runs))
 def smash_numbers(text):
    """
    Replace sequences of multiple digits with zeroes, so we don't need to