update packaging, try to handle digits better

2024-12-23 09:21:37 +00:00 · 2022-02-08 18:24:36 -05:00 · 2022-02-08 18:24:36 -05:00 · 3c4819e7e5
commit 3c4819e7e5
parent 2361606b3a
4 changed files with 41 additions and 3 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,26 @@
+[tool.poetry]
+name = "wordfreq"
+version = "2.6.0"
+description = "Look up the frequencies of words in many languages, based on many sources of data."
+authors = ["Robyn Speer <rspeer@arborelia.net>"]
+license = "MIT"
+
+[tool.poetry.dependencies]
+python = "^3.7"
+msgpack = ">= 1.0"
+langcodes = ">= 3.0"
+regex = ">= 2020.04.04"
+ftfy = ">= 3.0"
+
+[tool.poetry.dev-dependencies]
+pytest = "^6.2.5"
+mecab-python3 = "^1.0.4"
+jieba = ">= 0.42"
+ipadic = "^1.0.0"
+mecab-ko-dic = "^1.0.0"
+ipython = ">=7"
+black = "^22.1.0"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/setup.py
+++ b/setup.py
@@ -33,10 +33,10 @@ dependencies = [

 setup(
    name="wordfreq",
-    version='2.5.1',
+    version='2.6.0',
    maintainer='Robyn Speer',
    maintainer_email='rspeer@arborelia.net',
-    url='http://github.com/LuminosoInsight/wordfreq/',
+    url='http://github.com/rspeer/wordfreq/',
    license="MIT",
    platforms=["any"],
    description=doclines[0],
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -12,6 +12,7 @@ import warnings

 from .tokens import tokenize, simple_tokenize, lossy_tokenize
 from .language_info import get_language_info
+from .preprocess import num_generic_digits

 logger = logging.getLogger(__name__)

@@ -242,6 +243,7 @@ _wf_cache = {}

 def _word_frequency(word, lang, wordlist, minimum):
    tokens = lossy_tokenize(word, lang)
+    digits = num_generic_digits(word)
    if not tokens:
        return minimum

@@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum):
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
-        one_over_result += 1.0 / freqs[token]
+        # spread the frequency of digits over all digit combinations
+        freq = freqs[token] / (10. ** digits)
+        one_over_result += 1.0 / freq

    freq = 1.0 / one_over_result

--- a/wordfreq/preprocess.py
+++ b/wordfreq/preprocess.py
@@ -260,6 +260,14 @@ def _sub_zeroes(match):
    return DIGIT_RE.sub('0', match.group(0))


+def num_generic_digits(text):
+    """
+    Count the "generic digits" in the text (digits that we replace with 0).
+    Only DIGIT_RE matches count, not other characters within a number.
+    """
+    return sum(len(DIGIT_RE.findall(m)) for m in MULTI_DIGIT_RE.findall(text))
+
+
 def smash_numbers(text):
    """
    Replace sequences of multiple digits with zeroes, so we don't need to