diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e96c038 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[tool.poetry] +name = "wordfreq" +version = "2.6.0" +description = "Look up the frequencies of words in many languages, based on many sources of data." +authors = ["Robyn Speer <rspeer@arborelia.net>"] +license = "MIT" + +[tool.poetry.dependencies] +python = "^3.7" +msgpack = ">= 1.0" +langcodes = ">= 3.0" +regex = ">= 2020.04.04" +ftfy = ">= 3.0" + +[tool.poetry.dev-dependencies] +pytest = "^6.2.5" +mecab-python3 = "^1.0.4" +jieba = ">= 0.42" +ipadic = "^1.0.0" +mecab-ko-dic = "^1.0.0" +ipython = ">=7" +black = "^22.1.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/setup.py b/setup.py index 8aa3f64..539b4ae 100755 --- a/setup.py +++ b/setup.py @@ -33,10 +33,10 @@ dependencies = [ setup( name="wordfreq", - version='2.5.1', + version='2.6.0', maintainer='Robyn Speer', maintainer_email='rspeer@arborelia.net', - url='http://github.com/LuminosoInsight/wordfreq/', + url='http://github.com/rspeer/wordfreq/', license="MIT", platforms=["any"], description=doclines[0], diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index 17c910a..c58de33 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -12,6 +12,7 @@ import warnings from .tokens import tokenize, simple_tokenize, lossy_tokenize from .language_info import get_language_info +from .preprocess import num_generic_digits logger = logging.getLogger(__name__) @@ -242,6 +243,7 @@ _wf_cache = {} def _word_frequency(word, lang, wordlist, minimum): tokens = lossy_tokenize(word, lang) + digits = num_generic_digits(word) if not tokens: return minimum @@ -255,7 +257,9 @@ def _word_frequency(word, lang, wordlist, minimum): if token not in freqs: # If any word is missing, just return the default value return minimum - one_over_result += 1.0 / freqs[token] + # spread the frequency of digits over all digit combinations + freq = 
freqs[token] / (10. ** digits) + one_over_result += 1.0 / freq freq = 1.0 / one_over_result diff --git a/wordfreq/preprocess.py b/wordfreq/preprocess.py index 1563212..0d3145b 100644 --- a/wordfreq/preprocess.py +++ b/wordfreq/preprocess.py @@ -260,6 +260,14 @@ def _sub_zeroes(match): return DIGIT_RE.sub('0', match.group(0)) +def num_generic_digits(text): + """ + Determine how many "generic digits" are in the text (digits that we + replace with 0 to combine numbers of the same length). + """ + return sum([len(match) for match in MULTI_DIGIT_RE.findall(text)]) + + def smash_numbers(text): """ Replace sequences of multiple digits with zeroes, so we don't need to