estimate the freq distribution of numbers

Elia Robyn Lake 2022-03-10 18:33:42 -05:00
parent 4e373750e8
commit bf05b1b1dc
14 changed files with 552 additions and 405 deletions
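
The implementation behind the commit title lives in files not shown in this excerpt, so as background only: "estimating the frequency distribution of numbers" here means taking the frequency recorded for a smashed digit pattern such as "0000" and spreading it across concrete numbers. Below is a minimal, purely illustrative sketch of that idea using a Benford-style first-digit distribution; the function names and the exact model are assumptions, not code from wordfreq.

    # Illustrative only: distribute a digit-pattern's frequency over concrete numbers.
    # Benford-style weight for the first digit, uniform over the remaining digits.
    import math

    def first_digit_weight(d: int) -> float:
        return math.log10(1 + 1 / d)  # weights for d = 1..9 sum to 1

    def number_frequency(pattern_freq: float, number: str) -> float:
        first = int(number[0])
        if first == 0:
            return 0.0  # leading zeros are out of scope for this sketch
        rest = 10 ** (len(number) - 1)
        return pattern_freq * first_digit_weight(first) / rest

    # e.g. the share of the 4-digit bin's frequency assigned to "2022"
    print(number_frequency(1e-4, "2022"))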

poetry.lock (generated)

@@ -61,7 +61,7 @@ uvloop = ["uvloop (>=0.15.2)"]
 [[package]]
 name = "click"
-version = "8.0.3"
+version = "8.0.4"
 description = "Composable command line interface toolkit"
 category = "dev"
 optional = false
@@ -103,17 +103,14 @@ pyflakes = ">=2.4.0,<2.5.0"
 [[package]]
 name = "ftfy"
-version = "6.0.3"
-description = "Fixes some problems with Unicode text after the fact"
+version = "6.1.1"
+description = "Fixes mojibake and other problems with Unicode, after the fact"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7,<4"
 
 [package.dependencies]
-wcwidth = "*"
+wcwidth = ">=0.2.5"
 
-[package.extras]
-docs = ["furo", "sphinx"]
 
 [[package]]
 name = "importlib-metadata"
@@ -149,7 +146,7 @@ python-versions = "*"
 [[package]]
 name = "ipython"
-version = "7.31.1"
+version = "7.32.0"
 description = "IPython: Productive Interactive Computing"
 category = "dev"
 optional = false
@@ -242,7 +239,7 @@ python-versions = "*"
 [[package]]
 name = "mecab-python3"
-version = "1.0.4"
+version = "1.0.5"
 description = "Python wrapper for the MeCab morphological analyzer for Japanese"
 category = "dev"
 optional = false
@@ -338,7 +335,7 @@ python-versions = "*"
 [[package]]
 name = "platformdirs"
-version = "2.5.0"
+version = "2.5.1"
 description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
 category = "dev"
 optional = false
@@ -365,7 +362,7 @@ testing = ["pytest", "pytest-benchmark"]
 [[package]]
 name = "prompt-toolkit"
-version = "3.0.27"
+version = "3.0.28"
 description = "Library for building powerful interactive command lines in Python"
 category = "dev"
 optional = false
@@ -449,11 +446,11 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm
 [[package]]
 name = "regex"
-version = "2022.1.18"
+version = "2022.3.2"
 description = "Alternative regular expression module, to replace re."
 category = "main"
 optional = false
-python-versions = "*"
+python-versions = ">=3.6"
 
 [[package]]
 name = "toml"
@@ -492,7 +489,7 @@ python-versions = ">=3.6"
 [[package]]
 name = "types-setuptools"
-version = "57.4.9"
+version = "57.4.10"
 description = "Typing stubs for setuptools"
 category = "dev"
 optional = false
@@ -500,7 +497,7 @@ python-versions = "*"
 [[package]]
 name = "typing-extensions"
-version = "4.0.1"
+version = "4.1.1"
 description = "Backported and Experimental Type Hints for Python 3.6+"
 category = "main"
 optional = false
@@ -529,7 +526,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7"
-content-hash = "a3b1a9c3b80e338764f1907a77e31f59d6e1e231092b7813182e09e55d7c2f45"
+content-hash = "8507a13e0c8c79c30e911cc5f32bdc35284304246ae50531917df6197d7dcab8"
 
 [metadata.files]
 appnope = [
@ -574,8 +571,8 @@ black = [
{file = "black-22.1.0.tar.gz", hash = "sha256:a7c0192d35635f6fc1174be575cb7915e92e5dd629ee79fdaf0dcfa41a80afb5"}, {file = "black-22.1.0.tar.gz", hash = "sha256:a7c0192d35635f6fc1174be575cb7915e92e5dd629ee79fdaf0dcfa41a80afb5"},
] ]
click = [ click = [
{file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"}, {file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"},
{file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"}, {file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"},
] ]
colorama = [ colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
@ -590,7 +587,8 @@ flake8 = [
{file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"}, {file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"},
] ]
ftfy = [ ftfy = [
{file = "ftfy-6.0.3.tar.gz", hash = "sha256:ba71121a9c8d7790d3e833c6c1021143f3e5c4118293ec3afb5d43ed9ca8e72b"}, {file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
{file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
] ]
importlib-metadata = [ importlib-metadata = [
{file = "importlib_metadata-4.2.0-py3-none-any.whl", hash = "sha256:057e92c15bc8d9e8109738a48db0ccb31b4d9d5cfbee5a8670879a30be66304b"}, {file = "importlib_metadata-4.2.0-py3-none-any.whl", hash = "sha256:057e92c15bc8d9e8109738a48db0ccb31b4d9d5cfbee5a8670879a30be66304b"},
@ -604,8 +602,8 @@ ipadic = [
{file = "ipadic-1.0.0.tar.gz", hash = "sha256:f5923d31eca6131acaaf18ed28d8998665b1347b640d3a6476f64650e9a71c07"}, {file = "ipadic-1.0.0.tar.gz", hash = "sha256:f5923d31eca6131acaaf18ed28d8998665b1347b640d3a6476f64650e9a71c07"},
] ]
ipython = [ ipython = [
{file = "ipython-7.31.1-py3-none-any.whl", hash = "sha256:55df3e0bd0f94e715abd968bedd89d4e8a7bce4bf498fb123fed4f5398fea874"}, {file = "ipython-7.32.0-py3-none-any.whl", hash = "sha256:86df2cf291c6c70b5be6a7b608650420e89180c8ec74f376a34e2dc15c3400e7"},
{file = "ipython-7.31.1.tar.gz", hash = "sha256:b5548ec5329a4bcf054a5deed5099b0f9622eb9ea51aaa7104d215fece201d8c"}, {file = "ipython-7.32.0.tar.gz", hash = "sha256:468abefc45c15419e3c8e8c0a6a5c115b2127bafa34d7c641b1d443658793909"},
] ]
jedi = [ jedi = [
{file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"}, {file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"},
@ -630,23 +628,27 @@ mecab-ko-dic = [
{file = "mecab-ko-dic-1.0.0.tar.gz", hash = "sha256:3ba22858736e02e8a0e92f2a7f099528c733ae47701b29d12c75e982a85d1f11"}, {file = "mecab-ko-dic-1.0.0.tar.gz", hash = "sha256:3ba22858736e02e8a0e92f2a7f099528c733ae47701b29d12c75e982a85d1f11"},
] ]
mecab-python3 = [ mecab-python3 = [
{file = "mecab-python3-1.0.4.tar.gz", hash = "sha256:b150ad5fe4260539b4ef184657e552ef81307fbbe60ae1f258bc814549ea90f8"}, {file = "mecab-python3-1.0.5.tar.gz", hash = "sha256:e703d78c88a671abb8170351644850015d9bbfab31530a3b40d12481a6779a11"},
{file = "mecab_python3-1.0.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:3c7e87c65160e5e4edb08cb80dbce50f4e711c53f45063321aab72ab2566ffe4"}, {file = "mecab_python3-1.0.5-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:8a64bd228704ed9b24da5cbd6c4e325ef22310227153ef481f9037183351aa10"},
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2fbed960ef82f4192b31efd88af1f3c24cd1692b62720ed70d7e314a50f581e"}, {file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf083884116fa05ca0394c4c8d62013a4954fbac414c33a1931906ddf0f3585a"},
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cb6eb6cc47e3937a2edfaa9595dc2d165ed9f025e3a53bd0a5033a12fa6bcdcf"}, {file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fe020df27b249f43df3d38b84473d226e36d6d4a31f951cedbddabfcc450e36"},
{file = "mecab_python3-1.0.4-cp36-cp36m-win_amd64.whl", hash = "sha256:b149b51f0f62c9512d219c9e79c6db2eb66e70863a97eb412d8fc3ba7a25f351"}, {file = "mecab_python3-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:644f781de083311fcf81f7d55f21a756ceef7ebae7c111bd50a2c9d0855c1927"},
{file = "mecab_python3-1.0.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:c1606b35df0136b3e9dc7add2e69d2c1151e69fd5675c0cde62d0b017b2319e7"}, {file = "mecab_python3-1.0.5-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:4309a91f0d5b66d3f0e8c9ba5a4d3cf7dbac1334269338704599820e051d1d7f"},
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53b0b899ef03f364bfd7fa28f260ee1e893e4f47ff90a141a522709b892f0a4e"}, {file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be2d1cd2ecd1f04b91eb0e26c906f21b50b8526e977f7f01f3901f9a6306944"},
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:52a789c708f8b89044236201eb03c7fe5517fad5210a9de2230c7d99a2a8c760"}, {file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:644bbde31ab1244ff18fb1dcac1e5fee8121f8b27a5c3e041c01ebc301df9266"},
{file = "mecab_python3-1.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:d6ca73c0dec72038290faa6de17d57d771535eb47c22346e170dffcb82d696bb"}, {file = "mecab_python3-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:401a2d1608b6503cb755d7d864ad74b64a7a4346309235f84577de807bb29050"},
{file = "mecab_python3-1.0.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:18e14dfe3d8c66cfa1c9f49e3bc8ac480b79a433ec9e5b5d2c1fb73f36ec7c3e"}, {file = "mecab_python3-1.0.5-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:5f91d5d8a9ac0ea7351e5e2423df98dd463b02013e006b18096cd365de37b2a9"},
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:221256b84be0ee29dc8fa450210236b40707b9d63cfc70de5102d2531622d062"}, {file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc8ce0151b973f4ca15e651619264442011568ebe48c6fce51d55e64f7e5c2e1"},
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:de39b82f44d97fc0fd636644ad14c9662f51afcd73775379d5a8b1eb20ee85a6"}, {file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e381df4c55f3ec5bccbb5625c65c54ecf982c215574d1102aff2803ac1a24cd"},
{file = "mecab_python3-1.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:96d9e8c098401fb4b5bd32258f4952f3b22cdb30ab291f5ff82eae1d0941cbed"}, {file = "mecab_python3-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8eaaa78227f470c4cf1d6c2a87b92889041f317517fbe65e635b86ea0c84a194"},
{file = "mecab_python3-1.0.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:dcd62ebf2eecde1263119b92ff5379a046bb8231cb999fafda00f0925dfcb67e"}, {file = "mecab_python3-1.0.5-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dd8601565dd1331ee5cd67bcc45f713cebc14b730ee2e956ed120a0ec6e4fd8a"},
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178b632b717e3249054a7ad4c0fbc60ce8493d357afa7673d535ffa11e45eaba"}, {file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76a40f717f9592bd12edc7bcf1fa869f4c8058e5d0b80d4cc6c301435afb1f96"},
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fbfad60261ad3b9390b8615528fc013302a3e8febba220f799216c1a1154ee7e"}, {file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f299d6ef96495371f5a622a7004a205e303dabba1fc3a7f9a07e741e315ed2b"},
{file = "mecab_python3-1.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:445b4f5ee5674d85f6de2726ec28991801844ff71eb096129da5f5ba077d5a87"}, {file = "mecab_python3-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4cdb07edbbd508d9b98ac9529e0ff0b89d93e50a6beeb7b8b946439594bf5e01"},
{file = "mecab_python3-1.0.5-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:eb412a25e485e33d7ab69262b58f7365b727f8c447e4c9c1c56b5fd91414ecd2"},
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91e8ac11ef4440418312dd4f1f200f7957fdc0148bb49dc049264c5d07bed527"},
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae1c126cf4982035794042280998066c8b6d26eb89136731078d9105a7070c13"},
{file = "mecab_python3-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:34a196c6a410e57f975ee077d075ac994b94bb6930b04e207e59e7c7521ecb58"},
] ]
msgpack = [ msgpack = [
{file = "msgpack-1.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:96acc674bb9c9be63fa8b6dabc3248fdc575c4adc005c440ad02f87ca7edd079"}, {file = "msgpack-1.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:96acc674bb9c9be63fa8b6dabc3248fdc575c4adc005c440ad02f87ca7edd079"},
@ -731,16 +733,16 @@ pickleshare = [
{file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"},
] ]
platformdirs = [ platformdirs = [
{file = "platformdirs-2.5.0-py3-none-any.whl", hash = "sha256:30671902352e97b1eafd74ade8e4a694782bd3471685e78c32d0fdfd3aa7e7bb"}, {file = "platformdirs-2.5.1-py3-none-any.whl", hash = "sha256:bcae7cab893c2d310a711b70b24efb93334febe65f8de776ee320b517471e227"},
{file = "platformdirs-2.5.0.tar.gz", hash = "sha256:8ec11dfba28ecc0715eb5fb0147a87b1bf325f349f3da9aab2cd6b50b96b692b"}, {file = "platformdirs-2.5.1.tar.gz", hash = "sha256:7535e70dfa32e84d4b34996ea99c5e432fa29a708d0f4e394bbcb2a8faa4f16d"},
] ]
pluggy = [ pluggy = [
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
] ]
prompt-toolkit = [ prompt-toolkit = [
{file = "prompt_toolkit-3.0.27-py3-none-any.whl", hash = "sha256:cb7dae7d2c59188c85a1d6c944fad19aded6a26bd9c8ae115a4e1c20eb90b713"}, {file = "prompt_toolkit-3.0.28-py3-none-any.whl", hash = "sha256:30129d870dcb0b3b6a53efdc9d0a83ea96162ffd28ffe077e94215b233dc670c"},
{file = "prompt_toolkit-3.0.27.tar.gz", hash = "sha256:f2b6a8067a4fb959d3677d1ed764cc4e63e0f6f565b9a4fc7edc2b18bf80217b"}, {file = "prompt_toolkit-3.0.28.tar.gz", hash = "sha256:9f1cd16b1e86c2968f2519d7fb31dd9d669916f515612c269d14e9ed52b51650"},
] ]
ptyprocess = [ ptyprocess = [
{file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
@ -771,80 +773,80 @@ pytest = [
{file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"},
] ]
regex = [ regex = [
{file = "regex-2022.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5"}, {file = "regex-2022.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7"},
{file = "regex-2022.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7"}, {file = "regex-2022.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af"}, {file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6"}, {file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832"}, {file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0"}, {file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9"}, {file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7"}, {file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd"},
{file = "regex-2022.1.18-cp310-cp310-win32.whl", hash = "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d"}, {file = "regex-2022.3.2-cp310-cp310-win32.whl", hash = "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834"},
{file = "regex-2022.1.18-cp310-cp310-win_amd64.whl", hash = "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184"}, {file = "regex-2022.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7"},
{file = "regex-2022.1.18-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed"}, {file = "regex-2022.3.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8"}, {file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74"}, {file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb"}, {file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f"}, {file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43"}, {file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a"}, {file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29"},
{file = "regex-2022.1.18-cp36-cp36m-win32.whl", hash = "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3"}, {file = "regex-2022.3.2-cp36-cp36m-win32.whl", hash = "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863"},
{file = "regex-2022.1.18-cp36-cp36m-win_amd64.whl", hash = "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633"}, {file = "regex-2022.3.2-cp36-cp36m-win_amd64.whl", hash = "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b"},
{file = "regex-2022.1.18-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3"}, {file = "regex-2022.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4"}, {file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52"}, {file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2"}, {file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202"}, {file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c"}, {file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a"}, {file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec"},
{file = "regex-2022.1.18-cp37-cp37m-win32.whl", hash = "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6"}, {file = "regex-2022.3.2-cp37-cp37m-win32.whl", hash = "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f"},
{file = "regex-2022.1.18-cp37-cp37m-win_amd64.whl", hash = "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118"}, {file = "regex-2022.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0"},
{file = "regex-2022.1.18-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899"}, {file = "regex-2022.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640"},
{file = "regex-2022.1.18-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52"}, {file = "regex-2022.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6"}, {file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4"}, {file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607"}, {file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729"}, {file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c"}, {file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d"}, {file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056"},
{file = "regex-2022.1.18-cp38-cp38-win32.whl", hash = "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02"}, {file = "regex-2022.3.2-cp38-cp38-win32.whl", hash = "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0"},
{file = "regex-2022.1.18-cp38-cp38-win_amd64.whl", hash = "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e"}, {file = "regex-2022.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47"},
{file = "regex-2022.1.18-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d"}, {file = "regex-2022.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3"},
{file = "regex-2022.1.18-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a"}, {file = "regex-2022.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75"}, {file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338"}, {file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f"}, {file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605"}, {file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a"}, {file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093"}, {file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015"},
{file = "regex-2022.1.18-cp39-cp39-win32.whl", hash = "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1"}, {file = "regex-2022.3.2-cp39-cp39-win32.whl", hash = "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1"},
{file = "regex-2022.1.18-cp39-cp39-win_amd64.whl", hash = "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442"}, {file = "regex-2022.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7"},
{file = "regex-2022.1.18.tar.gz", hash = "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916"}, {file = "regex-2022.3.2.tar.gz", hash = "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b"},
] ]
toml = [ toml = [
{file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
@ -885,12 +887,12 @@ typed-ast = [
{file = "typed_ast-1.5.2.tar.gz", hash = "sha256:525a2d4088e70a9f75b08b3f87a51acc9cde640e19cc523c7e41aa355564ae27"}, {file = "typed_ast-1.5.2.tar.gz", hash = "sha256:525a2d4088e70a9f75b08b3f87a51acc9cde640e19cc523c7e41aa355564ae27"},
] ]
types-setuptools = [ types-setuptools = [
{file = "types-setuptools-57.4.9.tar.gz", hash = "sha256:536ef74744f8e1e4be4fc719887f886e74e4cf3c792b4a06984320be4df450b5"}, {file = "types-setuptools-57.4.10.tar.gz", hash = "sha256:9a13513679c640f6616e2d9ab50d431c99ca8ae9848a97243f887c80fd5cf294"},
{file = "types_setuptools-57.4.9-py3-none-any.whl", hash = "sha256:948dc6863373750e2cd0b223a84f1fb608414cde5e55cf38ea657b93aeb411d2"}, {file = "types_setuptools-57.4.10-py3-none-any.whl", hash = "sha256:ddc98da82c12e1208012d65276641a132d3aadc78ecfff68fd3e17d85933a3c1"},
] ]
typing-extensions = [ typing-extensions = [
{file = "typing_extensions-4.0.1-py3-none-any.whl", hash = "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"}, {file = "typing_extensions-4.1.1-py3-none-any.whl", hash = "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2"},
{file = "typing_extensions-4.0.1.tar.gz", hash = "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e"}, {file = "typing_extensions-4.1.1.tar.gz", hash = "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42"},
] ]
wcwidth = [ wcwidth = [
{file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"},


@@ -4,6 +4,7 @@ version = "2.6.0"
 description = "Look up the frequencies of words in many languages, based on many sources of data."
 authors = ["Robyn Speer <rspeer@arborelia.net>"]
 license = "MIT"
+readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.7"


@ -3,17 +3,21 @@ from wordfreq import tokenize, word_frequency
def test_apostrophes(): def test_apostrophes():
# Test that we handle apostrophes in French reasonably. # Test that we handle apostrophes in French reasonably.
assert tokenize("qu'un", 'fr') == ['qu', 'un'] assert tokenize("qu'un", "fr") == ["qu", "un"]
assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"] assert tokenize("qu'un", "fr", include_punctuation=True) == ["qu'", "un"]
assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl'] assert tokenize("langues d'oïl", "fr") == ["langues", "d", "oïl"]
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl'] assert tokenize("langues d'oïl", "fr", include_punctuation=True) == [
assert tokenize("l'heure", 'fr') == ['l', 'heure'] "langues",
assert tokenize("l'ànima", 'ca') == ['l', 'ànima'] "d'",
assert tokenize("l'anima", 'it') == ['l', 'anima'] "oïl",
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure'] ]
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital'] assert tokenize("l'heure", "fr") == ["l", "heure"]
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"] assert tokenize("l'ànima", "ca") == ["l", "ànima"]
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french'] assert tokenize("l'anima", "it") == ["l", "anima"]
assert tokenize("l'heure", "fr", include_punctuation=True) == ["l'", "heure"]
assert tokenize("L'Hôpital", "fr", include_punctuation=True) == ["l'", "hôpital"]
assert tokenize("aujourd'hui", "fr") == ["aujourd'hui"]
assert tokenize("This isn't French", "en") == ["this", "isn't", "french"]
# This next behavior is not ideal -- we would prefer "dell'" to be handled # This next behavior is not ideal -- we would prefer "dell'" to be handled
# the same as "l'" -- but this is the most consistent result we can get without # the same as "l'" -- but this is the most consistent result we can get without
@ -21,26 +25,28 @@ def test_apostrophes():
# #
# Versions of regex from 2019 and earlier would give ['dell', 'anima'], which # Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
# is better but inconsistent. # is better but inconsistent.
assert tokenize("dell'anima", 'it') == ["dell'anima"] assert tokenize("dell'anima", "it") == ["dell'anima"]
# Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's # Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
# an example of why we don't want the apostrophe-vowel fix to apply everywhere. # an example of why we don't want the apostrophe-vowel fix to apply everywhere.
assert tokenize("hawai'i", 'en') == ["hawai'i"] assert tokenize("hawai'i", "en") == ["hawai'i"]
def test_catastrophes(): def test_catastrophes():
# More apostrophes, but this time they're in Catalan, and there's other # More apostrophes, but this time they're in Catalan, and there's other
# mid-word punctuation going on too. # mid-word punctuation going on too.
assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar'] assert tokenize("M'acabo d'instal·lar.", "ca") == ["m", "acabo", "d", "instal·lar"]
assert ( assert tokenize("M'acabo d'instal·lar.", "ca", include_punctuation=True) == [
tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) == "m'",
["m'", 'acabo', "d'", 'instal·lar', '.'] "acabo",
) "d'",
"instal·lar",
".",
]
def test_alternate_codes(): def test_alternate_codes():
# Try over-long language codes for French and Catalan # Try over-long language codes for French and Catalan
assert tokenize("qu'un", 'fra') == ['qu', 'un'] assert tokenize("qu'un", "fra") == ["qu", "un"]
assert tokenize("qu'un", 'fre') == ['qu', 'un'] assert tokenize("qu'un", "fre") == ["qu", "un"]
assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar'] assert tokenize("M'acabo d'instal·lar.", "cat") == ["m", "acabo", "d", "instal·lar"]


@@ -14,12 +14,12 @@ def test_gender_neutral_at():
         "tod@s",
         "l@s",
         "trabajador@s",
-        "migrantes"
+        "migrantes",
     ]
 
     text = "el distrito 22@ de Barcelona"
-    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
-    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
+    assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
+    assert lossy_tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
 
     # It also appears in Portuguese
     text = "direitos e deveres para @s membr@s da comunidade virtual"
@ -32,7 +32,7 @@ def test_gender_neutral_at():
"membr@s", "membr@s",
"da", "da",
"comunidade", "comunidade",
"virtual" "virtual",
] ]
# Because this is part of our tokenization, the language code doesn't # Because this is part of our tokenization, the language code doesn't
@ -43,10 +43,10 @@ def test_gender_neutral_at():
def test_at_in_corpus(): def test_at_in_corpus():
# We have a word frequency for "l@s" # We have a word frequency for "l@s"
assert word_frequency('l@s', 'es') > 0 assert word_frequency("l@s", "es") > 0
# It's not just treated as a word break # It's not just treated as a word break
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es') assert word_frequency("l@s", "es") < word_frequency("l s", "es")
def test_punctuation_at(): def test_punctuation_at():
@ -65,7 +65,7 @@ def test_punctuation_at():
"ao", "ao",
"lado", "lado",
"do", "do",
"nick" "nick",
] ]
assert tokenize(text, "pt", include_punctuation=True) == [ assert tokenize(text, "pt", include_punctuation=True) == [
@ -83,7 +83,7 @@ def test_punctuation_at():
"ao", "ao",
"lado", "lado",
"do", "do",
"nick" "nick",
] ]
# If the @ is not at the end of the word or part of the word ending '@s', # If the @ is not at the end of the word or part of the word ending '@s',
@ -98,12 +98,9 @@ def test_punctuation_at():
"la", "la",
"línea", "línea",
"all:all", "all:all",
"all" "all",
] ]
# Make sure not to catch e-mail addresses # Make sure not to catch e-mail addresses
text = "info@something.example" text = "info@something.example"
assert tokenize(text, "en") == [ assert tokenize(text, "en") == ["info", "something.example"]
"info",
"something.example"
]
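
The one behavioral change in this test file is in its first hunk: lossy_tokenize is now expected to keep "22@" instead of smashing it to "00@", which fits the commit title's move toward handling numbers at frequency-estimation time (my inference, not stated in this diff). A usage sketch of the two tokenizers, with expected outputs taken from the assertions above (wordfreq must be installed):

    # The expected outputs below are the ones asserted in the updated test.
    from wordfreq import tokenize, lossy_tokenize

    text = "el distrito 22@ de Barcelona"
    print(tokenize(text, "es"))        # ['el', 'distrito', '22@', 'de', 'barcelona']
    print(lossy_tokenize(text, "es"))  # now expected to give the same tokens, keeping "22@"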


@ -9,92 +9,112 @@ def test_tokens():
# (He was the Chinese Wikipedia's featured article of the day when I # (He was the Chinese Wikipedia's featured article of the day when I
# wrote this test.) # wrote this test.)
hobart = '加勒特·霍巴特' # Garret Hobart, or "jiā lè tè huò bā tè". hobart = "加勒特·霍巴特" # Garret Hobart, or "jiā lè tè huò bā tè".
# He was the sixth American vice president to die in office. # He was the sixth American vice president to die in office.
fact_simplified = '他是历史上第六位在任期内去世的美国副总统。' fact_simplified = "他是历史上第六位在任期内去世的美国副总统。"
fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。' fact_traditional = "他是歷史上第六位在任期內去世的美國副總統。"
# His name breaks into five pieces, with the only piece staying together # His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token. # being the one that means 'Bart'. The dot is not included as a token.
assert tokenize(hobart, 'zh') == ['', '', '', '', '巴特'] assert tokenize(hobart, "zh") == ["", "", "", "", "巴特"]
assert tokenize(fact_simplified, 'zh') == [ assert tokenize(fact_simplified, "zh") == [
# he / is / history / in / #6 / counter for people # he / is / history / in / #6 / counter for people
'', '', '历史', '', '第六', '', "",
"",
"历史",
"",
"第六",
"",
# during / term of office / in / die # during / term of office / in / die
'', '任期', '', '去世', "",
"任期",
"",
"去世",
# of / U.S. / deputy / president # of / U.S. / deputy / president
'', '美国', '', '总统' "",
"美国",
"",
"总统",
] ]
# Jieba's original tokenizer knows a lot of names, it seems. # Jieba's original tokenizer knows a lot of names, it seems.
assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特'] assert tokenize(hobart, "zh", external_wordlist=True) == ["加勒特", "霍巴特"]
# We get almost the same tokens from the sentence using Jieba's own # We get almost the same tokens from the sentence using Jieba's own
# wordlist, but it tokenizes "in history" as two words and # wordlist, but it tokenizes "in history" as two words and
# "sixth person" as one. # "sixth person" as one.
assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [ assert tokenize(fact_simplified, "zh", external_wordlist=True) == [
# he / is / history / in / sixth person # he / is / history / in / sixth person
'', '', '历史', '', '第六位', "",
"",
"历史",
"",
"第六位",
# during / term of office / in / die # during / term of office / in / die
'', '任期', '', '去世', "",
"任期",
"",
"去世",
# of / U.S. / deputy / president # of / U.S. / deputy / president
'', '美国', '', '总统' "",
"美国",
"",
"总统",
] ]
# Check that Traditional Chinese works at all # Check that Traditional Chinese works at all
assert word_frequency(fact_traditional, 'zh') > 0 assert word_frequency(fact_traditional, "zh") > 0
# You get the same token lengths if you look it up in Traditional Chinese, # You get the same token lengths if you look it up in Traditional Chinese,
# but the words are different # but the words are different
simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True) simp_tokens = tokenize(fact_simplified, "zh", include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True) trad_tokens = tokenize(fact_traditional, "zh", include_punctuation=True)
assert ''.join(simp_tokens) == fact_simplified assert "".join(simp_tokens) == fact_simplified
assert ''.join(trad_tokens) == fact_traditional assert "".join(trad_tokens) == fact_traditional
simp_lengths = [len(token) for token in simp_tokens] simp_lengths = [len(token) for token in simp_tokens]
trad_lengths = [len(token) for token in trad_tokens] trad_lengths = [len(token) for token in trad_tokens]
assert simp_lengths == trad_lengths assert simp_lengths == trad_lengths
def test_combination(): def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks" xiexie_freq = word_frequency("谢谢", "zh") # "Thanks"
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01) assert word_frequency("谢谢谢谢", "zh") == pytest.approx(xiexie_freq / 20, rel=0.01)
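
For context on the factor of 20 asserted in test_combination above: as I understand wordfreq's phrase-frequency heuristic (an assumption on my part; the formula is not shown in this diff), token frequencies combine as the reciprocal of summed reciprocals, and each inferred token boundary costs roughly another factor of 10, so two equal-frequency tokens give (f / 2) / 10 = f / 20. A small sketch with a hypothetical constant name:

    # Hedged sketch of the combination rule the test above appears to rely on.
    # INFERRED_BOUNDARY_PENALTY is a hypothetical name, not wordfreq's.
    INFERRED_BOUNDARY_PENALTY = 10.0

    def combine_token_freqs(freqs):
        # reciprocal of the summed reciprocals, then one penalty per extra token
        combined = 1.0 / sum(1.0 / f for f in freqs)
        return combined / (INFERRED_BOUNDARY_PENALTY ** (len(freqs) - 1))

    f = 1e-4  # stand-in for word_frequency("谢谢", "zh")
    assert abs(combine_token_freqs([f, f]) - f / 20) < 1e-18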
def test_alternate_codes(): def test_alternate_codes():
# Tokenization of Chinese works when you use other language codes # Tokenization of Chinese works when you use other language codes
# that are not equal to 'zh'. # that are not equal to 'zh'.
tokens = ['谢谢', '谢谢'] tokens = ["谢谢", "谢谢"]
# Code with a region attached # Code with a region attached
assert tokenize('谢谢谢谢', 'zh-CN') == tokens assert tokenize("谢谢谢谢", "zh-CN") == tokens
# Over-long codes for Chinese # Over-long codes for Chinese
assert tokenize('谢谢谢谢', 'chi') == tokens assert tokenize("谢谢谢谢", "chi") == tokens
assert tokenize('谢谢谢谢', 'zho') == tokens assert tokenize("谢谢谢谢", "zho") == tokens
# Separate codes for Mandarin and Cantonese # Separate codes for Mandarin and Cantonese
assert tokenize('谢谢谢谢', 'cmn') == tokens assert tokenize("谢谢谢谢", "cmn") == tokens
assert tokenize('谢谢谢谢', 'yue') == tokens assert tokenize("谢谢谢谢", "yue") == tokens
def test_unreasonably_long(): def test_unreasonably_long():
# This crashed earlier versions of wordfreq due to an overflow in # This crashed earlier versions of wordfreq due to an overflow in
# exponentiation. We've now changed the sequence of operations so it # exponentiation. We've now changed the sequence of operations so it
# will underflow instead. # will underflow instead.
lots_of_ls = 'l' * 800 lots_of_ls = "l" * 800
assert word_frequency(lots_of_ls, 'zh') == 0. assert word_frequency(lots_of_ls, "zh") == 0.0
assert zipf_frequency(lots_of_ls, 'zh') == 0. assert zipf_frequency(lots_of_ls, "zh") == 0.0
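
The overflow/underflow comment above is about CPython float behavior: exponentiation that overflows raises OverflowError, while underflow silently returns 0.0, which is why the frequency of an 800-character "word" can come out as exactly 0 instead of crashing. A quick illustration, independent of wordfreq:

    # Overflowing float exponentiation raises; underflow quietly returns 0.0.
    try:
        10.0 ** 800
    except OverflowError:
        pass  # this is the failure mode the old code could hit

    assert 10.0 ** -800 == 0.0  # smaller than the smallest subnormal float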
def test_hyphens(): def test_hyphens():
# An edge case of Chinese tokenization that changed sometime around # An edge case of Chinese tokenization that changed sometime around
# jieba 0.42. # jieba 0.42.
tok = tokenize('--------', 'zh', include_punctuation=True) tok = tokenize("--------", "zh", include_punctuation=True)
assert tok == ['-'] * 8 assert tok == ["-"] * 8
tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
assert tok == ['--------']
tok = tokenize("--------", "zh", include_punctuation=True, external_wordlist=True)
assert tok == ["--------"]


@ -1,16 +1,22 @@
from wordfreq import ( from wordfreq import (
word_frequency, available_languages, cB_to_freq, word_frequency,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize available_languages,
cB_to_freq,
top_n_list,
random_words,
random_ascii_words,
tokenize,
lossy_tokenize,
) )
import pytest import pytest
def test_freq_examples(): def test_freq_examples():
# Stopwords are most common in the correct language # Stopwords are most common in the correct language
assert word_frequency('the', 'en') > word_frequency('de', 'en') assert word_frequency("the", "en") > word_frequency("de", "en")
assert word_frequency('de', 'es') > word_frequency('the', 'es') assert word_frequency("de", "es") > word_frequency("the", "es")
# We get word frequencies from the 'large' list when available # We get word frequencies from the 'large' list when available
assert word_frequency('infrequency', 'en') > 0. assert word_frequency("infrequency", "en") > 0.0
def test_languages(): def test_languages():
@@ -20,33 +26,33 @@ def test_languages():
    assert len(avail) >= 34

    # 'small' covers the same languages, but with some different lists
    avail_small = available_languages("small")
    assert len(avail_small) == len(avail)
    assert avail_small != avail

    # 'combined' is the same as 'small'
    avail_old_name = available_languages("combined")
    assert avail_old_name == avail_small

    # 'large' covers fewer languages
    avail_large = available_languages("large")
    assert len(avail_large) >= 14
    assert len(avail) > len(avail_large)

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
        assert word_frequency("2", lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = "%s-001-x-fake-ext" % lang.upper()
        assert word_frequency("2", new_lang_code) > 0


def test_minimums():
    assert word_frequency("esquivalience", "en") == 0
    assert word_frequency("esquivalience", "en", minimum=1e-6) == 1e-6
    assert word_frequency("the", "en", minimum=1) == 1


def test_most_common_words():
@@ -59,61 +65,61 @@ def test_most_common_words():
        """
        return top_n_list(lang, 1)[0]

    assert get_most_common("ar") == "في"
    assert get_most_common("bg") == "на"
    assert get_most_common("bn") == "না"
    assert get_most_common("ca") == "de"
    assert get_most_common("cs") == "a"
    assert get_most_common("da") == "i"
    assert get_most_common("el") == "και"
    assert get_most_common("de") == "die"
    assert get_most_common("en") == "the"
    assert get_most_common("es") == "de"
    assert get_most_common("fi") == "ja"
    assert get_most_common("fil") == "sa"
    assert get_most_common("fr") == "de"
    assert get_most_common("he") == "את"
    assert get_most_common("hi") == "के"
    assert get_most_common("hu") == "a"
    assert get_most_common("id") == "yang"
    assert get_most_common("is") == "og"
    assert get_most_common("it") == "di"
    assert get_most_common("ja") == ""
    assert get_most_common("ko") == ""
    assert get_most_common("lt") == "ir"
    assert get_most_common("lv") == "un"
    assert get_most_common("mk") == "на"
    assert get_most_common("ms") == "yang"
    assert get_most_common("nb") == "i"
    assert get_most_common("nl") == "de"
    assert get_most_common("pl") == "w"
    assert get_most_common("pt") == "de"
    assert get_most_common("ro") == "de"
    assert get_most_common("ru") == "в"
    assert get_most_common("sh") == "je"
    assert get_most_common("sk") == "a"
    assert get_most_common("sl") == "je"
    assert get_most_common("sv") == "är"
    assert get_most_common("ta") == "ஒரு"
    assert get_most_common("tr") == "ve"
    assert get_most_common("uk") == "в"
    assert get_most_common("ur") == "کے"
    assert get_most_common("vi") == ""
    assert get_most_common("zh") == ""


def test_language_matching():
    freq = word_frequency("", "zh")
    assert word_frequency("", "zh-TW") == freq
    assert word_frequency("", "zh-CN") == freq
    assert word_frequency("", "zh-Hant") == freq
    assert word_frequency("", "zh-Hans") == freq
    assert word_frequency("", "yue-CN") == freq
    assert word_frequency("", "cmn") == freq


def test_cB_conversion():
    assert cB_to_freq(0) == 1.0
    assert cB_to_freq(-100) == pytest.approx(0.1)
    assert cB_to_freq(-600) == pytest.approx(1e-6)
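# cB_to_freq's behavior, as pinned down by the assertions above, is a plain
# log-scale conversion: frequencies are stored as centibels, hundredths of a
# decibel-style log10 unit. A minimal sketch of that arithmetic, not
# wordfreq's own implementation:
def cb_to_freq_sketch(cB: float) -> float:
    # 0 cB -> 1.0, -100 cB -> 0.1, -600 cB -> 1e-6
    return 10.0 ** (cB / 100)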
@@ -126,101 +132,125 @@ def test_failed_cB_conversion():
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    assert tokenize("I don't split at apostrophes, you see.", "en") == [
        "i",
        "don't",
        "split",
        "at",
        "apostrophes",
        "you",
        "see",
    ]

    assert tokenize(
        "I don't split at apostrophes, you see.", "en", include_punctuation=True
    ) == ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."]

    # Certain punctuation does not inherently split a word.
    assert tokenize("Anything is possible at zombo.com", "en") == [
        "anything",
        "is",
        "possible",
        "at",
        "zombo.com",
    ]

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize("😂test", "en") == ["😂", "test"]
    assert tokenize("flip-flop", "en") == ["flip", "flop"]
    assert tokenize(
        "this text has... punctuation :)", "en", include_punctuation=True
    ) == ["this", "text", "has", "...", "punctuation", ":)"]

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"]
    assert tokenize(
        "👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
    ) == [
        "👨‍🎤",
        "planet",
        "earth",
        "is",
        "blue",
        "and",
        "there's",
        "nothing",
        "i",
        "can",
        "do",
        "🌎",
        "🚀",
    ]

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en") == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]


def test_casefolding():
    assert tokenize("WEISS", "de") == ["weiss"]
    assert tokenize("weiß", "de") == ["weiss"]
    assert tokenize("İstanbul", "tr") == ["istanbul"]
    assert tokenize("SIKISINCA", "tr") == ["sıkısınca"]


def test_normalization():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
        "715",
        "crσσks",
        "by",
        "bon",
        "iver",
    ]
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
        "715",
        "crσσks",
        "by",
        "bon",
        "iver",
    ]


def test_uncurl_quotes():
    assert lossy_tokenize("let’s", "en") == ["let's"]
    assert word_frequency("let’s", "en") == word_frequency("let's", "en")


def test_phrase_freq():
    ff = word_frequency("flip-flop", "en")
    assert ff > 0
    phrase_freq = 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency(
        "flop", "en"
    )
    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
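# The combination rule exercised by test_phrase_freq (an illustrative sketch,
# not wordfreq's internal code): a phrase's frequency is the reciprocal of the
# sum of its tokens' reciprocal frequencies -- half the harmonic mean for two
# tokens -- so a phrase always comes out rarer than any single token in it.
def combined_freq_sketch(token_freqs):
    return 1.0 / sum(1.0 / f for f in token_freqs)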
def test_not_really_random():
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    assert random_words(nwords=4, lang="en", bits_per_word=0) == "the the the the"

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    assert random_ascii_words(nwords=4, lang="ja", bits_per_word=0) == "1 1 1 1"


def test_not_enough_ascii():
    with pytest.raises(ValueError):
        random_ascii_words(lang="zh", bits_per_word=16)


def test_arabic():
    # Remove tatweels
    assert tokenize("متــــــــعب", "ar") == ["متعب"]

    # Remove combining marks
    assert tokenize("حَرَكَات", "ar") == ["حركات"]

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize("\ufefb", "ar") == ["\u0644\u0627"]


def test_ideographic_fallback():
@@ -228,28 +258,33 @@ def test_ideographic_fallback():
    #
    # More complex examples like this, involving the multiple scripts of Japanese,
    # are in test_japanese.py.
    assert tokenize("中国文字", "en") == ["中国文字"]


def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize("การเล่นดนตรี", "th") == ["การเล่นดนตรี"]
    assert tokenize('"การเล่นดนตรี" means "playing music"', "en") == [
        "การเล่นดนตรี",
        "means",
        "playing",
        "music",
    ]

    # Test Khmer, a script similar to Thai
    assert tokenize("សូមស្វាគមន៍", "km") == ["សូមស្វាគមន៍"]

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize("हिन्दी विक्षनरी", "hi") == ["हिन्दी", "विक्षनरी"]

    # Remove vowel points in Hebrew
    assert tokenize("דֻּגְמָה", "he") == ["דגמה"]

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize("kișinin", "tr") == ["kişinin"]
    assert tokenize("KİȘİNİN", "tr") == ["kişinin"]

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize("acelaşi", "ro") == ["același"]
    assert tokenize("ACELAŞI", "ro") == ["același"]

View File

@@ -3,7 +3,7 @@ import pytest
def test_tokens():
    assert tokenize("おはようございます", "ja") == ["おはよう", "ござい", "ます"]


def test_simple_tokenize():
@@ -17,13 +17,12 @@ def test_simple_tokenize():
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = "ひらがなカタカナromaji"
    assert simple_tokenize(ja_text) == ["ひらがなカタカナ", "romaji"]

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    assert simple_tokenize("おはようございます") == ["おはようございます"]

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
@@ -43,17 +42,13 @@ def test_simple_tokenize():
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]


def test_combination():
    ohayou_freq = word_frequency("おはよう", "ja")
    gozai_freq = word_frequency("ござい", "ja")
    masu_freq = word_frequency("ます", "ja")

    assert word_frequency("おはようおはよう", "ja") == pytest.approx(ohayou_freq / 2, rel=0.01)
    assert 1.0 / word_frequency("おはようございます", "ja") == pytest.approx(
        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01
    )

View File

@@ -3,16 +3,14 @@ import pytest
def test_tokens():
    assert tokenize("감사합니다", "ko") == ["감사", "합니다"]


def test_combination():
    gamsa_freq = word_frequency("감사", "ko")
    habnida_freq = word_frequency("합니다", "ko")

    assert word_frequency("감사감사", "ko") == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert 1.0 / word_frequency("감사합니다", "ko") == pytest.approx(
        1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01
    )

58
tests/test_numbers.py Normal file
View File

@@ -0,0 +1,58 @@
from wordfreq import word_frequency
from wordfreq.numbers import digit_freq, smash_numbers
from pytest import approx


def test_number_smashing():
    assert smash_numbers("1") == "1"
    assert smash_numbers("3.14") == "0.00"
    assert smash_numbers("24601") == "00000"


def test_decimals():
    assert word_frequency("3.14", "el") > word_frequency("4.14", "el")
    assert word_frequency("3.14", "el") == word_frequency("3.15", "el")
    assert word_frequency("3,14", "de") > word_frequency("4,14", "de")
    assert word_frequency("3,14", "de") == word_frequency("3,15", "de")


def test_year_distribution():
    assert word_frequency("2010", "en") > word_frequency("1010", "en")
    assert word_frequency("2010", "en") > word_frequency("3010", "en")


def test_boundaries():
    assert word_frequency("9", "en") > word_frequency("10", "en")
    assert word_frequency("99", "en") > word_frequency("100", "en")
    assert word_frequency("999", "en") > word_frequency("1000", "en")
    assert word_frequency("9999", "en") > word_frequency("10000", "en")


def test_multiple_words():
    once = word_frequency("2015b", "en")
    twice = word_frequency("2015b 2015b", "en")
    assert once == approx(2 * twice)


def test_distribution():
    assert word_frequency("24601", "en") > word_frequency("90210", "en")
    assert word_frequency("7", "en") > word_frequency("007", "en")
    assert word_frequency("404", "en") == word_frequency("418", "en")


def test_3digit_sum():
    """
    Test that the probability distribution given you have a 3-digit sequence
    adds up to approximately 1.
    """
    three_digit_sum = sum(digit_freq(f"{num:03d}") for num in range(0, 1000))
    assert three_digit_sum == approx(1.0)


def test_4digit_sum():
    """
    Test that the probability distribution given you have a 4-digit sequence
    adds up to approximately 1.
    """
    four_digit_sum = sum(digit_freq(f"{num:04d}") for num in range(0, 10000))
    assert 0.999 < four_digit_sum < 1.0
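# The two "sum" tests above can be reproduced directly: digit_freq is meant to
# behave as a probability distribution over digit strings of a given length.
# Illustrative usage, mirroring test_3digit_sum:
from wordfreq.numbers import digit_freq

three_digit_total = sum(digit_freq(f"{n:03d}") for n in range(1000))
print(round(three_digit_total, 3))  # expected to be approximately 1.0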

View File

@@ -5,14 +5,26 @@ from wordfreq.preprocess import preprocess_text
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert tokenize("Па, има ту много ствари које не схваташ.", "sr") == [
        "pa",
        "ima",
        "tu",
        "mnogo",
        "stvari",
        "koje",
        "ne",
        "shvataš",
    ]
    assert tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", "sr") == [
        "pa",
        "ima",
        "tu",
        "mnogo",
        "stvari",
        "koje",
        "ne",
        "shvataš",
    ]

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@@ -20,14 +32,14 @@ def test_transliteration():
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text("китабхана", "az") == "kitabxana"
    assert preprocess_text("КИТАБХАНА", "az") == "kitabxana"
    assert preprocess_text("KİTABXANA", "az") == "kitabxana"

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text("бағырты", "az") == "bağırtı"
    assert preprocess_text("БАҒЫРТЫ", "az") == "bağırtı"
    assert preprocess_text("BAĞIRTI", "az") == "bağırtı"


def test_actually_russian():
@@ -38,13 +50,12 @@ def test_actually_russian():
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    assert tokenize("сто из ста, пацаны!", "sr") == ["sto", "iz", "sta", "pacany"]
    assert tokenize("культуры", "sr") == ["kul'tury"]


def test_alternate_codes():
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian
    assert tokenize("культуры", "sh") == ["kul'tury"]
    assert tokenize("культуры", "hbs") == ["kul'tury"]

View File

@@ -13,7 +13,7 @@ import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
from .numbers import digit_freq, has_digit_sequence, smash_numbers

logger = logging.getLogger(__name__)

@@ -234,7 +234,7 @@ _wf_cache: Dict[Tuple[str, str, str, float], float] = {}
def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> float:
    tokens = lossy_tokenize(word, lang)

    if not tokens:
        return minimum

@@ -245,13 +245,20 @@ def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> float:
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0

    for token in tokens:
        smashed = smash_numbers(token)
        if smashed not in freqs:
            # If any word is missing, just return the default value
            return minimum
        freq = freqs[smashed]
        if smashed != token:
            # If there is a digit sequence in the token, the digits are
            # internally replaced by 0s to aggregate their probabilities
            # together. We then assign a specific frequency to the digit
            # sequence using the `digit_freq` distribution.
            freq *= digit_freq(token)
        one_over_result += 1.0 / freq

    # Combine the frequencies of tokens we looked up.
    freq = 1.0 / one_over_result

    if get_language_info(lang)["tokenizer"] == "jieba":
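# To restate the lookup above as a standalone sketch (illustrative only, not
# the wordfreq implementation; `freqs` here is a plain dict standing in for a
# language's frequency mapping): a token containing digits is looked up under
# its smashed form, then re-weighted by the digit distribution.
def digit_aware_lookup_sketch(token: str, freqs: dict) -> float:
    from wordfreq.numbers import digit_freq, smash_numbers

    smashed = smash_numbers(token)  # e.g. "2015" -> "0000"
    freq = freqs.get(smashed, 0.0)
    if smashed != token:
        freq *= digit_freq(token)  # apportion the smashed entry's mass to "2015"
    return freq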
@@ -334,10 +341,15 @@ def top_n_list(
    Return a frequency list of length `n` in descending order of frequency.
    This list contains words from `wordlist`, of the given language.
    If `ascii_only`, then only ascii words are considered.

    The frequency list will not contain multi-digit sequences, because we
    estimate the frequencies of those using the functions in `numbers.py`,
    not using a wordlist that contains all of them.
    """
    results = []
    for word in iter_wordlist(lang, wordlist):
        if (not ascii_only) or max(word) <= "~":
            if not has_digit_sequence(word):
                results.append(word)
                if len(results) >= n:
                    break
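# A quick way to see the behavior documented in that docstring (illustrative
# usage; the exact output depends on the installed wordlists): the top of the
# English list no longer contains bare multi-digit tokens such as "2019",
# because their frequency now comes from wordfreq.numbers instead.
from wordfreq import top_n_list

print(top_n_list("en", 20))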

View File

@@ -1,4 +1,4 @@
import regex

# Frequencies of leading digits, according to Benford's law, sort of.
# Benford's law doesn't describe numbers with leading zeroes, because "007"
@@ -11,23 +11,37 @@ DIGIT_FREQS = [0.009, 0.300, 0.175, 0.124, 0.096, 0.078, 0.066, 0.057, 0.050, 0.
#
# We do this with a piecewise exponential function whose peak is a plateau covering
# the years 2019 to 2039.

# Determined by experimentation: makes the probabilities of all years add up to 90%.
# The other 10% goes to NOT_YEAR_PROB. tests/test_numbers.py confirms that this
# probability distribution adds up to 1.
YEAR_LOG_PEAK = -1.9185
NOT_YEAR_PROB = 0.1
REFERENCE_YEAR = 2019
PLATEAU_WIDTH = 20

DIGIT_RE = regex.compile(r"\d")
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
PURE_DIGIT_RE = regex.compile(r"\d+")


def benford_freq(text: str) -> float:
    """
    Estimate the frequency of a digit sequence according to Benford's law.
    """
    first_digit = int(text[0])
    return DIGIT_FREQS[first_digit] / 10 ** (len(text) - 1)


def year_freq(text: str) -> float:
    """
    Estimate the relative frequency of a particular 4-digit sequence representing
    a year.

    For example, suppose text == "1985". We're estimating the probability that a
    randomly-selected token from a large corpus will be "1985" and refer to the
    year, _given_ that it is 4 digits. Tokens that are not 4 digits are not involved
    in the probability distribution.
    """
    year = int(text)

    # Fitting a line to the curve seen at
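# The rest of year_freq is cut off by this hunk. From the constants and
# comments above, its shape is a plateau at YEAR_LOG_PEAK over the years
# REFERENCE_YEAR through REFERENCE_YEAR + PLATEAU_WIDTH, with exponential
# decay on either side, plus a NOT_YEAR_PROB share for 4-digit strings that
# aren't years. The sketch below only illustrates that shape: the decay
# slopes, and how the non-year share is folded in, are assumptions rather
# than wordfreq's fitted values.
def year_freq_sketch(text: str) -> float:
    year = int(text)
    if year <= REFERENCE_YEAR:
        log_freq = YEAR_LOG_PEAK - 0.008 * (REFERENCE_YEAR - year)  # assumed slope
    elif year <= REFERENCE_YEAR + PLATEAU_WIDTH:
        log_freq = YEAR_LOG_PEAK  # the plateau over the "present" years
    else:
        log_freq = YEAR_LOG_PEAK - 0.2 * (year - REFERENCE_YEAR - PLATEAU_WIDTH)  # assumed slope
    return 10.0 ** log_freq + NOT_YEAR_PROB * benford_freq(text)  # assumed combination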
@@ -60,13 +74,38 @@ def year_freq(text: str) -> float:
def digit_freq(text: str) -> float:
    """
    Get the relative frequency of a string of digits, using our estimates.
    """
    freq = 1.0
    for match in MULTI_DIGIT_RE.findall(text):
        for submatch in PURE_DIGIT_RE.findall(match):
            if len(submatch) == 4:
                freq *= year_freq(submatch)
            else:
                freq *= benford_freq(submatch)
    return freq


def has_digit_sequence(text: str) -> bool:
    """
    Returns True iff the text has a digit sequence that will be normalized out
    and handled with `digit_freq`.
    """
    return bool(MULTI_DIGIT_RE.match(text))


def _sub_zeroes(match: regex.Match) -> str:
    """
    Given a regex match, return what it matched with digits replaced by
    zeroes.
    """
    return DIGIT_RE.sub("0", match.group(0))


def smash_numbers(text: str) -> str:
    """
    Replace sequences of multiple digits with zeroes, so we don't need to
    distinguish the frequencies of thousands of numbers.
    """
    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
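# Quick usage sketch for the helpers above; the comparisons parallel the
# word_frequency assertions in tests/test_numbers.py, and the comments show
# the expected results.
from wordfreq.numbers import digit_freq, smash_numbers

print(smash_numbers("pi is 3.14, roughly"))       # "pi is 0.00, roughly"
print(digit_freq("24601") > digit_freq("90210"))  # True: a leading 2 is likelier than a leading 9
print(digit_freq("2010") > digit_freq("1010"))    # True: 4-digit strings use the year distribution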

View File

@@ -7,10 +7,6 @@ from langcodes import Language
MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)


def preprocess_text(text: str, language: Language) -> str:
    """
    This function applies pre-processing steps that convert forms of words
@@ -251,19 +247,3 @@ def cedillas_to_commas(text: str) -> str:
        "\N{LATIN SMALL LETTER T WITH CEDILLA}",
        "\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
    )

View File

@@ -10,7 +10,7 @@ from .language_info import (
    SPACELESS_SCRIPTS,
    EXTRA_JAPANESE_CHARACTERS,
)
from .preprocess import preprocess_text

# Placeholders for CJK functions that we'll import on demand
_mecab_tokenize = None

@@ -309,13 +309,6 @@ def lossy_tokenize(
    In particular:

    - In Chinese, unless Traditional Chinese is specifically requested using
      'zh-Hant', all characters will be converted to Simplified Chinese.

@@ -334,4 +327,4 @@ def lossy_tokenize(
        tokens = [_simplify_chinese(token) for token in tokens]

    return [uncurl_quotes(token) for token in tokens]
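# Net effect of the change to lossy_tokenize (illustrative usage; the second
# comparison parallels tests/test_numbers.py): digit sequences now survive
# tokenization unchanged, and their frequencies are estimated by the digit
# model inside word_frequency rather than by smashed wordlist entries.
from wordfreq import lossy_tokenize, word_frequency

print(lossy_tokenize("the year 2022", "en"))  # ['the', 'year', '2022'] -- no longer smashed to '0000'
print(word_frequency("2010", "en") > word_frequency("1010", "en"))  # True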