Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
estimate the freq distribution of numbers
This commit is contained in:
parent 4e373750e8
commit bf05b1b1dc
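For context, the test changes in this commit exercise wordfreq's public API. A minimal usage sketch follows; the function names and expected values are taken from the updated assertions in the diff below (test_gender_neutral_at and test_languages), not verified independently.

# Sketch only: expected outputs mirror the updated tests in this commit.
from wordfreq import word_frequency, lossy_tokenize

# The digit "2" has a nonzero frequency in every available wordlist.
assert word_frequency("2", "en") > 0

# Per the updated test_gender_neutral_at assertion, lossy_tokenize() keeps the
# digits in "22@" rather than collapsing them to "00@".
assert lossy_tokenize("el distrito 22@ de Barcelona", "es") == [
    "el",
    "distrito",
    "22@",
    "de",
    "barcelona",
]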
244  poetry.lock (generated)
@@ -61,7 +61,7 @@ uvloop = ["uvloop (>=0.15.2)"]

[[package]]
name = "click"
version = "8.0.3"
version = "8.0.4"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
@@ -103,17 +103,14 @@ pyflakes = ">=2.4.0,<2.5.0"

[[package]]
name = "ftfy"
version = "6.0.3"
description = "Fixes some problems with Unicode text after the fact"
version = "6.1.1"
description = "Fixes mojibake and other problems with Unicode, after the fact"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7,<4"

[package.dependencies]
wcwidth = "*"

[package.extras]
docs = ["furo", "sphinx"]
wcwidth = ">=0.2.5"

[[package]]
name = "importlib-metadata"
@@ -149,7 +146,7 @@ python-versions = "*"

[[package]]
name = "ipython"
version = "7.31.1"
version = "7.32.0"
description = "IPython: Productive Interactive Computing"
category = "dev"
optional = false
@@ -242,7 +239,7 @@ python-versions = "*"

[[package]]
name = "mecab-python3"
version = "1.0.4"
version = "1.0.5"
description = "Python wrapper for the MeCab morphological analyzer for Japanese"
category = "dev"
optional = false
@@ -338,7 +335,7 @@ python-versions = "*"

[[package]]
name = "platformdirs"
version = "2.5.0"
version = "2.5.1"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
@@ -365,7 +362,7 @@ testing = ["pytest", "pytest-benchmark"]

[[package]]
name = "prompt-toolkit"
version = "3.0.27"
version = "3.0.28"
description = "Library for building powerful interactive command lines in Python"
category = "dev"
optional = false
@@ -449,11 +446,11 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm

[[package]]
name = "regex"
version = "2022.1.18"
version = "2022.3.2"
description = "Alternative regular expression module, to replace re."
category = "main"
optional = false
python-versions = "*"
python-versions = ">=3.6"

[[package]]
name = "toml"
@@ -492,7 +489,7 @@ python-versions = ">=3.6"

[[package]]
name = "types-setuptools"
version = "57.4.9"
version = "57.4.10"
description = "Typing stubs for setuptools"
category = "dev"
optional = false
@@ -500,7 +497,7 @@ python-versions = "*"

[[package]]
name = "typing-extensions"
version = "4.0.1"
version = "4.1.1"
description = "Backported and Experimental Type Hints for Python 3.6+"
category = "main"
optional = false
@@ -529,7 +526,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "a3b1a9c3b80e338764f1907a77e31f59d6e1e231092b7813182e09e55d7c2f45"
content-hash = "8507a13e0c8c79c30e911cc5f32bdc35284304246ae50531917df6197d7dcab8"

[metadata.files]
appnope = [
@ -574,8 +571,8 @@ black = [
|
||||
{file = "black-22.1.0.tar.gz", hash = "sha256:a7c0192d35635f6fc1174be575cb7915e92e5dd629ee79fdaf0dcfa41a80afb5"},
|
||||
]
|
||||
click = [
|
||||
{file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"},
|
||||
{file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"},
|
||||
{file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"},
|
||||
{file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"},
|
||||
]
|
||||
colorama = [
|
||||
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
|
||||
@ -590,7 +587,8 @@ flake8 = [
|
||||
{file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"},
|
||||
]
|
||||
ftfy = [
|
||||
{file = "ftfy-6.0.3.tar.gz", hash = "sha256:ba71121a9c8d7790d3e833c6c1021143f3e5c4118293ec3afb5d43ed9ca8e72b"},
|
||||
{file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
|
||||
{file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
|
||||
]
|
||||
importlib-metadata = [
|
||||
{file = "importlib_metadata-4.2.0-py3-none-any.whl", hash = "sha256:057e92c15bc8d9e8109738a48db0ccb31b4d9d5cfbee5a8670879a30be66304b"},
|
||||
@ -604,8 +602,8 @@ ipadic = [
|
||||
{file = "ipadic-1.0.0.tar.gz", hash = "sha256:f5923d31eca6131acaaf18ed28d8998665b1347b640d3a6476f64650e9a71c07"},
|
||||
]
|
||||
ipython = [
|
||||
{file = "ipython-7.31.1-py3-none-any.whl", hash = "sha256:55df3e0bd0f94e715abd968bedd89d4e8a7bce4bf498fb123fed4f5398fea874"},
|
||||
{file = "ipython-7.31.1.tar.gz", hash = "sha256:b5548ec5329a4bcf054a5deed5099b0f9622eb9ea51aaa7104d215fece201d8c"},
|
||||
{file = "ipython-7.32.0-py3-none-any.whl", hash = "sha256:86df2cf291c6c70b5be6a7b608650420e89180c8ec74f376a34e2dc15c3400e7"},
|
||||
{file = "ipython-7.32.0.tar.gz", hash = "sha256:468abefc45c15419e3c8e8c0a6a5c115b2127bafa34d7c641b1d443658793909"},
|
||||
]
|
||||
jedi = [
|
||||
{file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"},
|
||||
@ -630,23 +628,27 @@ mecab-ko-dic = [
|
||||
{file = "mecab-ko-dic-1.0.0.tar.gz", hash = "sha256:3ba22858736e02e8a0e92f2a7f099528c733ae47701b29d12c75e982a85d1f11"},
|
||||
]
|
||||
mecab-python3 = [
|
||||
{file = "mecab-python3-1.0.4.tar.gz", hash = "sha256:b150ad5fe4260539b4ef184657e552ef81307fbbe60ae1f258bc814549ea90f8"},
|
||||
{file = "mecab_python3-1.0.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:3c7e87c65160e5e4edb08cb80dbce50f4e711c53f45063321aab72ab2566ffe4"},
|
||||
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2fbed960ef82f4192b31efd88af1f3c24cd1692b62720ed70d7e314a50f581e"},
|
||||
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cb6eb6cc47e3937a2edfaa9595dc2d165ed9f025e3a53bd0a5033a12fa6bcdcf"},
|
||||
{file = "mecab_python3-1.0.4-cp36-cp36m-win_amd64.whl", hash = "sha256:b149b51f0f62c9512d219c9e79c6db2eb66e70863a97eb412d8fc3ba7a25f351"},
|
||||
{file = "mecab_python3-1.0.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:c1606b35df0136b3e9dc7add2e69d2c1151e69fd5675c0cde62d0b017b2319e7"},
|
||||
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53b0b899ef03f364bfd7fa28f260ee1e893e4f47ff90a141a522709b892f0a4e"},
|
||||
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:52a789c708f8b89044236201eb03c7fe5517fad5210a9de2230c7d99a2a8c760"},
|
||||
{file = "mecab_python3-1.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:d6ca73c0dec72038290faa6de17d57d771535eb47c22346e170dffcb82d696bb"},
|
||||
{file = "mecab_python3-1.0.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:18e14dfe3d8c66cfa1c9f49e3bc8ac480b79a433ec9e5b5d2c1fb73f36ec7c3e"},
|
||||
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:221256b84be0ee29dc8fa450210236b40707b9d63cfc70de5102d2531622d062"},
|
||||
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:de39b82f44d97fc0fd636644ad14c9662f51afcd73775379d5a8b1eb20ee85a6"},
|
||||
{file = "mecab_python3-1.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:96d9e8c098401fb4b5bd32258f4952f3b22cdb30ab291f5ff82eae1d0941cbed"},
|
||||
{file = "mecab_python3-1.0.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:dcd62ebf2eecde1263119b92ff5379a046bb8231cb999fafda00f0925dfcb67e"},
|
||||
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178b632b717e3249054a7ad4c0fbc60ce8493d357afa7673d535ffa11e45eaba"},
|
||||
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fbfad60261ad3b9390b8615528fc013302a3e8febba220f799216c1a1154ee7e"},
|
||||
{file = "mecab_python3-1.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:445b4f5ee5674d85f6de2726ec28991801844ff71eb096129da5f5ba077d5a87"},
|
||||
{file = "mecab-python3-1.0.5.tar.gz", hash = "sha256:e703d78c88a671abb8170351644850015d9bbfab31530a3b40d12481a6779a11"},
|
||||
{file = "mecab_python3-1.0.5-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:8a64bd228704ed9b24da5cbd6c4e325ef22310227153ef481f9037183351aa10"},
|
||||
{file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf083884116fa05ca0394c4c8d62013a4954fbac414c33a1931906ddf0f3585a"},
|
||||
{file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fe020df27b249f43df3d38b84473d226e36d6d4a31f951cedbddabfcc450e36"},
|
||||
{file = "mecab_python3-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:644f781de083311fcf81f7d55f21a756ceef7ebae7c111bd50a2c9d0855c1927"},
|
||||
{file = "mecab_python3-1.0.5-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:4309a91f0d5b66d3f0e8c9ba5a4d3cf7dbac1334269338704599820e051d1d7f"},
|
||||
{file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be2d1cd2ecd1f04b91eb0e26c906f21b50b8526e977f7f01f3901f9a6306944"},
|
||||
{file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:644bbde31ab1244ff18fb1dcac1e5fee8121f8b27a5c3e041c01ebc301df9266"},
|
||||
{file = "mecab_python3-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:401a2d1608b6503cb755d7d864ad74b64a7a4346309235f84577de807bb29050"},
|
||||
{file = "mecab_python3-1.0.5-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:5f91d5d8a9ac0ea7351e5e2423df98dd463b02013e006b18096cd365de37b2a9"},
|
||||
{file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc8ce0151b973f4ca15e651619264442011568ebe48c6fce51d55e64f7e5c2e1"},
|
||||
{file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e381df4c55f3ec5bccbb5625c65c54ecf982c215574d1102aff2803ac1a24cd"},
|
||||
{file = "mecab_python3-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8eaaa78227f470c4cf1d6c2a87b92889041f317517fbe65e635b86ea0c84a194"},
|
||||
{file = "mecab_python3-1.0.5-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dd8601565dd1331ee5cd67bcc45f713cebc14b730ee2e956ed120a0ec6e4fd8a"},
|
||||
{file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76a40f717f9592bd12edc7bcf1fa869f4c8058e5d0b80d4cc6c301435afb1f96"},
|
||||
{file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f299d6ef96495371f5a622a7004a205e303dabba1fc3a7f9a07e741e315ed2b"},
|
||||
{file = "mecab_python3-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4cdb07edbbd508d9b98ac9529e0ff0b89d93e50a6beeb7b8b946439594bf5e01"},
|
||||
{file = "mecab_python3-1.0.5-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:eb412a25e485e33d7ab69262b58f7365b727f8c447e4c9c1c56b5fd91414ecd2"},
|
||||
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91e8ac11ef4440418312dd4f1f200f7957fdc0148bb49dc049264c5d07bed527"},
|
||||
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae1c126cf4982035794042280998066c8b6d26eb89136731078d9105a7070c13"},
|
||||
{file = "mecab_python3-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:34a196c6a410e57f975ee077d075ac994b94bb6930b04e207e59e7c7521ecb58"},
|
||||
]
|
||||
msgpack = [
|
||||
{file = "msgpack-1.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:96acc674bb9c9be63fa8b6dabc3248fdc575c4adc005c440ad02f87ca7edd079"},
|
||||
@ -731,16 +733,16 @@ pickleshare = [
|
||||
{file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"},
|
||||
]
|
||||
platformdirs = [
|
||||
{file = "platformdirs-2.5.0-py3-none-any.whl", hash = "sha256:30671902352e97b1eafd74ade8e4a694782bd3471685e78c32d0fdfd3aa7e7bb"},
|
||||
{file = "platformdirs-2.5.0.tar.gz", hash = "sha256:8ec11dfba28ecc0715eb5fb0147a87b1bf325f349f3da9aab2cd6b50b96b692b"},
|
||||
{file = "platformdirs-2.5.1-py3-none-any.whl", hash = "sha256:bcae7cab893c2d310a711b70b24efb93334febe65f8de776ee320b517471e227"},
|
||||
{file = "platformdirs-2.5.1.tar.gz", hash = "sha256:7535e70dfa32e84d4b34996ea99c5e432fa29a708d0f4e394bbcb2a8faa4f16d"},
|
||||
]
|
||||
pluggy = [
|
||||
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
|
||||
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
|
||||
]
|
||||
prompt-toolkit = [
|
||||
{file = "prompt_toolkit-3.0.27-py3-none-any.whl", hash = "sha256:cb7dae7d2c59188c85a1d6c944fad19aded6a26bd9c8ae115a4e1c20eb90b713"},
|
||||
{file = "prompt_toolkit-3.0.27.tar.gz", hash = "sha256:f2b6a8067a4fb959d3677d1ed764cc4e63e0f6f565b9a4fc7edc2b18bf80217b"},
|
||||
{file = "prompt_toolkit-3.0.28-py3-none-any.whl", hash = "sha256:30129d870dcb0b3b6a53efdc9d0a83ea96162ffd28ffe077e94215b233dc670c"},
|
||||
{file = "prompt_toolkit-3.0.28.tar.gz", hash = "sha256:9f1cd16b1e86c2968f2519d7fb31dd9d669916f515612c269d14e9ed52b51650"},
|
||||
]
|
||||
ptyprocess = [
|
||||
{file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
|
||||
@ -771,80 +773,80 @@ pytest = [
|
||||
{file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"},
|
||||
]
|
||||
regex = [
|
||||
{file = "regex-2022.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-win32.whl", hash = "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d"},
|
||||
{file = "regex-2022.1.18-cp310-cp310-win_amd64.whl", hash = "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-win32.whl", hash = "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3"},
|
||||
{file = "regex-2022.1.18-cp36-cp36m-win_amd64.whl", hash = "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-win32.whl", hash = "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6"},
|
||||
{file = "regex-2022.1.18-cp37-cp37m-win_amd64.whl", hash = "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-win32.whl", hash = "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02"},
|
||||
{file = "regex-2022.1.18-cp38-cp38-win_amd64.whl", hash = "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-win32.whl", hash = "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1"},
|
||||
{file = "regex-2022.1.18-cp39-cp39-win_amd64.whl", hash = "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442"},
|
||||
{file = "regex-2022.1.18.tar.gz", hash = "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-win32.whl", hash = "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834"},
|
||||
{file = "regex-2022.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-win32.whl", hash = "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863"},
|
||||
{file = "regex-2022.3.2-cp36-cp36m-win_amd64.whl", hash = "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-win32.whl", hash = "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f"},
|
||||
{file = "regex-2022.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-win32.whl", hash = "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0"},
|
||||
{file = "regex-2022.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-win32.whl", hash = "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1"},
|
||||
{file = "regex-2022.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7"},
|
||||
{file = "regex-2022.3.2.tar.gz", hash = "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b"},
|
||||
]
|
||||
toml = [
|
||||
{file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
|
||||
@ -885,12 +887,12 @@ typed-ast = [
|
||||
{file = "typed_ast-1.5.2.tar.gz", hash = "sha256:525a2d4088e70a9f75b08b3f87a51acc9cde640e19cc523c7e41aa355564ae27"},
|
||||
]
|
||||
types-setuptools = [
|
||||
{file = "types-setuptools-57.4.9.tar.gz", hash = "sha256:536ef74744f8e1e4be4fc719887f886e74e4cf3c792b4a06984320be4df450b5"},
|
||||
{file = "types_setuptools-57.4.9-py3-none-any.whl", hash = "sha256:948dc6863373750e2cd0b223a84f1fb608414cde5e55cf38ea657b93aeb411d2"},
|
||||
{file = "types-setuptools-57.4.10.tar.gz", hash = "sha256:9a13513679c640f6616e2d9ab50d431c99ca8ae9848a97243f887c80fd5cf294"},
|
||||
{file = "types_setuptools-57.4.10-py3-none-any.whl", hash = "sha256:ddc98da82c12e1208012d65276641a132d3aadc78ecfff68fd3e17d85933a3c1"},
|
||||
]
|
||||
typing-extensions = [
|
||||
{file = "typing_extensions-4.0.1-py3-none-any.whl", hash = "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"},
|
||||
{file = "typing_extensions-4.0.1.tar.gz", hash = "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e"},
|
||||
{file = "typing_extensions-4.1.1-py3-none-any.whl", hash = "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2"},
|
||||
{file = "typing_extensions-4.1.1.tar.gz", hash = "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42"},
|
||||
]
|
||||
wcwidth = [
|
||||
{file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"},
|
||||
|
@@ -4,6 +4,7 @@ version = "2.6.0"
description = "Look up the frequencies of words in many languages, based on many sources of data."
authors = ["Robyn Speer <rspeer@arborelia.net>"]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.7"

@@ -3,17 +3,21 @@ from wordfreq import tokenize, word_frequency

def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'anima", 'it') == ['l', 'anima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
    assert tokenize("qu'un", "fr") == ["qu", "un"]
    assert tokenize("qu'un", "fr", include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", "fr") == ["langues", "d", "oïl"]
    assert tokenize("langues d'oïl", "fr", include_punctuation=True) == [
        "langues",
        "d'",
        "oïl",
    ]
    assert tokenize("l'heure", "fr") == ["l", "heure"]
    assert tokenize("l'ànima", "ca") == ["l", "ànima"]
    assert tokenize("l'anima", "it") == ["l", "anima"]
    assert tokenize("l'heure", "fr", include_punctuation=True) == ["l'", "heure"]
    assert tokenize("L'Hôpital", "fr", include_punctuation=True) == ["l'", "hôpital"]
    assert tokenize("aujourd'hui", "fr") == ["aujourd'hui"]
    assert tokenize("This isn't French", "en") == ["this", "isn't", "french"]

    # This next behavior is not ideal -- we would prefer "dell'" to be handled
    # the same as "l'" -- but this is the most consistent result we can get without
@@ -21,26 +25,28 @@ def test_apostrophes():
    #
    # Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
    # is better but inconsistent.
    assert tokenize("dell'anima", 'it') == ["dell'anima"]
    assert tokenize("dell'anima", "it") == ["dell'anima"]

    # Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
    # an example of why we don't want the apostrophe-vowel fix to apply everywhere.
    assert tokenize("hawai'i", 'en') == ["hawai'i"]
    assert tokenize("hawai'i", "en") == ["hawai'i"]


def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
    assert (
        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
        ["m'", 'acabo', "d'", 'instal·lar', '.']
    )
    assert tokenize("M'acabo d'instal·lar.", "ca") == ["m", "acabo", "d", "instal·lar"]
    assert tokenize("M'acabo d'instal·lar.", "ca", include_punctuation=True) == [
        "m'",
        "acabo",
        "d'",
        "instal·lar",
        ".",
    ]


def test_alternate_codes():
    # Try over-long language codes for French and Catalan
    assert tokenize("qu'un", 'fra') == ['qu', 'un']
    assert tokenize("qu'un", 'fre') == ['qu', 'un']
    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']

    assert tokenize("qu'un", "fra") == ["qu", "un"]
    assert tokenize("qu'un", "fre") == ["qu", "un"]
    assert tokenize("M'acabo d'instal·lar.", "cat") == ["m", "acabo", "d", "instal·lar"]
@@ -14,12 +14,12 @@ def test_gender_neutral_at():
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
        "migrantes",
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
    assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
@@ -32,7 +32,7 @@ def test_gender_neutral_at():
        "membr@s",
        "da",
        "comunidade",
        "virtual"
        "virtual",
    ]

    # Because this is part of our tokenization, the language code doesn't
@@ -43,10 +43,10 @@ def test_gender_neutral_at():

def test_at_in_corpus():
    # We have a word frequency for "l@s"
    assert word_frequency('l@s', 'es') > 0
    assert word_frequency("l@s", "es") > 0

    # It's not just treated as a word break
    assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
    assert word_frequency("l@s", "es") < word_frequency("l s", "es")


def test_punctuation_at():
@@ -65,7 +65,7 @@ def test_punctuation_at():
        "ao",
        "lado",
        "do",
        "nick"
        "nick",
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
@@ -83,7 +83,7 @@ def test_punctuation_at():
        "ao",
        "lado",
        "do",
        "nick"
        "nick",
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
@@ -98,12 +98,9 @@ def test_punctuation_at():
        "la",
        "línea",
        "all:all",
        "all"
        "all",
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]
    assert tokenize(text, "en") == ["info", "something.example"]
@@ -9,92 +9,112 @@ def test_tokens():
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特' # Garret Hobart, or "jiā lè tè huò bā tè".
    hobart = "加勒特·霍巴特" # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'
    fact_simplified = "他是历史上第六位在任期内去世的美国副总统。"
    fact_traditional = "他是歷史上第六位在任期內去世的美國副總統。"

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
    assert tokenize(hobart, "zh") == ["加", "勒", "特", "霍", "巴特"]

    assert tokenize(fact_simplified, 'zh') == [
    assert tokenize(fact_simplified, "zh") == [
        # he / is / history / in / #6 / counter for people
        '他', '是', '历史', '上', '第六', '位',
        "他",
        "是",
        "历史",
        "上",
        "第六",
        "位",
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        "在",
        "任期",
        "内",
        "去世",
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
        "的",
        "美国",
        "副",
        "总统",
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
    assert tokenize(hobart, "zh", external_wordlist=True) == ["加勒特", "霍巴特"]

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
    assert tokenize(fact_simplified, "zh", external_wordlist=True) == [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        "他",
        "是",
        "历史",
        "上",
        "第六位",
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        "在",
        "任期",
        "内",
        "去世",
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
        "的",
        "美国",
        "副",
        "总统",
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, 'zh') > 0
    assert word_frequency(fact_traditional, "zh") > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == fact_simplified
    assert ''.join(trad_tokens) == fact_traditional
    simp_tokens = tokenize(fact_simplified, "zh", include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, "zh", include_punctuation=True)
    assert "".join(simp_tokens) == fact_simplified
    assert "".join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths


def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
    xiexie_freq = word_frequency("谢谢", "zh") # "Thanks"
    assert word_frequency("谢谢谢谢", "zh") == pytest.approx(xiexie_freq / 20, rel=0.01)


def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']
    tokens = ["谢谢", "谢谢"]

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens
    assert tokenize("谢谢谢谢", "zh-CN") == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens
    assert tokenize("谢谢谢谢", "chi") == tokens
    assert tokenize("谢谢谢谢", "zho") == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
    assert tokenize("谢谢谢谢", "cmn") == tokens
    assert tokenize("谢谢谢谢", "yue") == tokens


def test_unreasonably_long():
    # This crashed earlier versions of wordfreq due to an overflow in
    # exponentiation. We've now changed the sequence of operations so it
    # will underflow instead.
    lots_of_ls = 'l' * 800
    assert word_frequency(lots_of_ls, 'zh') == 0.
    assert zipf_frequency(lots_of_ls, 'zh') == 0.
    lots_of_ls = "l" * 800
    assert word_frequency(lots_of_ls, "zh") == 0.0
    assert zipf_frequency(lots_of_ls, "zh") == 0.0


def test_hyphens():
    # An edge case of Chinese tokenization that changed sometime around
    # jieba 0.42.

    tok = tokenize('--------', 'zh', include_punctuation=True)
    assert tok == ['-'] * 8

    tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
    assert tok == ['--------']
    tok = tokenize("--------", "zh", include_punctuation=True)
    assert tok == ["-"] * 8

    tok = tokenize("--------", "zh", include_punctuation=True, external_wordlist=True)
    assert tok == ["--------"]
@ -1,16 +1,22 @@
|
||||
from wordfreq import (
|
||||
word_frequency, available_languages, cB_to_freq,
|
||||
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
|
||||
word_frequency,
|
||||
available_languages,
|
||||
cB_to_freq,
|
||||
top_n_list,
|
||||
random_words,
|
||||
random_ascii_words,
|
||||
tokenize,
|
||||
lossy_tokenize,
|
||||
)
|
||||
import pytest
|
||||
|
||||
|
||||
def test_freq_examples():
|
||||
# Stopwords are most common in the correct language
|
||||
assert word_frequency('the', 'en') > word_frequency('de', 'en')
|
||||
assert word_frequency('de', 'es') > word_frequency('the', 'es')
|
||||
assert word_frequency("the", "en") > word_frequency("de", "en")
|
||||
assert word_frequency("de", "es") > word_frequency("the", "es")
|
||||
# We get word frequencies from the 'large' list when available
|
||||
assert word_frequency('infrequency', 'en') > 0.
assert word_frequency("infrequency", "en") > 0.0


def test_languages():
@ -20,33 +26,33 @@ def test_languages():
assert len(avail) >= 34

# 'small' covers the same languages, but with some different lists
avail_small = available_languages('small')
avail_small = available_languages("small")
assert len(avail_small) == len(avail)
assert avail_small != avail

# 'combined' is the same as 'small'
avail_old_name = available_languages('combined')
avail_old_name = available_languages("combined")
assert avail_old_name == avail_small

# 'large' covers fewer languages
avail_large = available_languages('large')
avail_large = available_languages("large")
assert len(avail_large) >= 14
assert len(avail) > len(avail_large)

# Look up the digit '2' in the main word list for each language
for lang in avail:
assert word_frequency('2', lang) > 0
assert word_frequency("2", lang) > 0

# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert word_frequency('2', new_lang_code) > 0
new_lang_code = "%s-001-x-fake-ext" % lang.upper()
assert word_frequency("2", new_lang_code) > 0


def test_minimums():
assert word_frequency('esquivalience', 'en') == 0
assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
assert word_frequency('the', 'en', minimum=1) == 1
assert word_frequency("esquivalience", "en") == 0
assert word_frequency("esquivalience", "en", minimum=1e-6) == 1e-6
assert word_frequency("the", "en", minimum=1) == 1


def test_most_common_words():
@ -59,61 +65,61 @@ def test_most_common_words():
"""
return top_n_list(lang, 1)[0]

assert get_most_common('ar') == 'في'
assert get_most_common('bg') == 'на'
assert get_most_common('bn') == 'না'
assert get_most_common('ca') == 'de'
assert get_most_common('cs') == 'a'
assert get_most_common('da') == 'i'
assert get_most_common('el') == 'και'
assert get_most_common('de') == 'die'
assert get_most_common('en') == 'the'
assert get_most_common('es') == 'de'
assert get_most_common('fi') == 'ja'
assert get_most_common('fil') == 'sa'
assert get_most_common('fr') == 'de'
assert get_most_common('he') == 'את'
assert get_most_common('hi') == 'के'
assert get_most_common('hu') == 'a'
assert get_most_common('id') == 'yang'
assert get_most_common('is') == 'og'
assert get_most_common('it') == 'di'
assert get_most_common('ja') == 'の'
assert get_most_common('ko') == '이'
assert get_most_common('lt') == 'ir'
assert get_most_common('lv') == 'un'
assert get_most_common('mk') == 'на'
assert get_most_common('ms') == 'yang'
assert get_most_common('nb') == 'i'
assert get_most_common('nl') == 'de'
assert get_most_common('pl') == 'w'
assert get_most_common('pt') == 'de'
assert get_most_common('ro') == 'de'
assert get_most_common('ru') == 'в'
assert get_most_common('sh') == 'je'
assert get_most_common('sk') == 'a'
assert get_most_common('sl') == 'je'
assert get_most_common('sv') == 'är'
assert get_most_common('ta') == 'ஒரு'
assert get_most_common('tr') == 've'
assert get_most_common('uk') == 'в'
assert get_most_common('ur') == 'کے'
assert get_most_common('vi') == 'là'
assert get_most_common('zh') == '的'
assert get_most_common("ar") == "في"
assert get_most_common("bg") == "на"
assert get_most_common("bn") == "না"
assert get_most_common("ca") == "de"
assert get_most_common("cs") == "a"
assert get_most_common("da") == "i"
assert get_most_common("el") == "και"
assert get_most_common("de") == "die"
assert get_most_common("en") == "the"
assert get_most_common("es") == "de"
assert get_most_common("fi") == "ja"
assert get_most_common("fil") == "sa"
assert get_most_common("fr") == "de"
assert get_most_common("he") == "את"
assert get_most_common("hi") == "के"
assert get_most_common("hu") == "a"
assert get_most_common("id") == "yang"
assert get_most_common("is") == "og"
assert get_most_common("it") == "di"
assert get_most_common("ja") == "の"
assert get_most_common("ko") == "이"
assert get_most_common("lt") == "ir"
assert get_most_common("lv") == "un"
assert get_most_common("mk") == "на"
assert get_most_common("ms") == "yang"
assert get_most_common("nb") == "i"
assert get_most_common("nl") == "de"
assert get_most_common("pl") == "w"
assert get_most_common("pt") == "de"
assert get_most_common("ro") == "de"
assert get_most_common("ru") == "в"
assert get_most_common("sh") == "je"
assert get_most_common("sk") == "a"
assert get_most_common("sl") == "je"
assert get_most_common("sv") == "är"
assert get_most_common("ta") == "ஒரு"
assert get_most_common("tr") == "ve"
assert get_most_common("uk") == "в"
assert get_most_common("ur") == "کے"
assert get_most_common("vi") == "là"
assert get_most_common("zh") == "的"


def test_language_matching():
freq = word_frequency('的', 'zh')
assert word_frequency('的', 'zh-TW') == freq
assert word_frequency('的', 'zh-CN') == freq
assert word_frequency('的', 'zh-Hant') == freq
assert word_frequency('的', 'zh-Hans') == freq
assert word_frequency('的', 'yue-HK') == freq
assert word_frequency('的', 'cmn') == freq
freq = word_frequency("的", "zh")
assert word_frequency("的", "zh-TW") == freq
assert word_frequency("的", "zh-CN") == freq
assert word_frequency("的", "zh-Hant") == freq
assert word_frequency("的", "zh-Hans") == freq
assert word_frequency("的", "yue-CN") == freq
assert word_frequency("的", "cmn") == freq


def test_cB_conversion():
assert cB_to_freq(0) == 1.
assert cB_to_freq(0) == 1.0
assert cB_to_freq(-100) == pytest.approx(0.1)
assert cB_to_freq(-600) == pytest.approx(1e-6)
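The asserts above pin down wordfreq's centibel scale: 0 cB is a proportion of 1, and every additional -100 cB divides the proportion by 10. A minimal sketch of the conversion, consistent with these asserts but not copied from wordfreq's source:

    # Sketch only: the real cB_to_freq lives in wordfreq and may differ in detail.
    def cb_to_freq_sketch(cB: float) -> float:
        return 10.0 ** (cB / 100.0)

    assert cb_to_freq_sketch(0) == 1.0
    assert abs(cb_to_freq_sketch(-100) - 0.1) < 1e-12
    assert abs(cb_to_freq_sketch(-600) - 1e-6) < 1e-18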

@ -126,101 +132,125 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
assert (
tokenize("I don't split at apostrophes, you see.", 'en')
== ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
)
assert tokenize("I don't split at apostrophes, you see.", "en") == [
"i",
"don't",
"split",
"at",
"apostrophes",
"you",
"see",
]

assert (
tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
== ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
)
assert tokenize(
"I don't split at apostrophes, you see.", "en", include_punctuation=True
) == ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."]

# Certain punctuation does not inherently split a word.
assert (
tokenize("Anything is possible at zombo.com", 'en')
== ['anything', 'is', 'possible', 'at', 'zombo.com']
)
assert tokenize("Anything is possible at zombo.com", "en") == [
"anything",
"is",
"possible",
"at",
"zombo.com",
]

# Splits occur after symbols, and at splitting punctuation such as hyphens.
assert tokenize('😂test', 'en') == ['😂', 'test']
assert tokenize("flip-flop", 'en') == ['flip', 'flop']
assert (
tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
== ['this', 'text', 'has', '...', 'punctuation', ':)']
)
assert tokenize("😂test", "en") == ["😂", "test"]
assert tokenize("flip-flop", "en") == ["flip", "flop"]
assert tokenize(
"this text has... punctuation :)", "en", include_punctuation=True
) == ["this", "text", "has", "...", "punctuation", ":)"]

# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date
assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
assert (
tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
== ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀']
)
assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"]
assert tokenize(
"👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
) == [
"👨🎤",
"planet",
"earth",
"is",
"blue",
"and",
"there's",
"nothing",
"i",
"can",
"do",
"🌎",
"🚀",
]

# Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0)
assert tokenize("Surf's up 🌊🏄🏴'",'en') == ["surf's", "up", "🌊", "🏄", "🏴"]
assert tokenize("Surf's up 🌊🏄🏴'", "en") == ["surf's", "up", "🌊", "🏄", "🏴"]


def test_casefolding():
assert tokenize('WEISS', 'de') == ['weiss']
assert tokenize('weiß', 'de') == ['weiss']
assert tokenize('İstanbul', 'tr') == ['istanbul']
assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
assert tokenize("WEISS", "de") == ["weiss"]
assert tokenize("weiß", "de") == ["weiss"]
assert tokenize("İstanbul", "tr") == ["istanbul"]
assert tokenize("SIKISINCA", "tr") == ["sıkısınca"]


def test_number_smashing():
assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
assert (
lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
== ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
)
assert lossy_tokenize('1', 'en') == ['1']
assert lossy_tokenize('3.14', 'en') == ['0.00']
assert lossy_tokenize('24601', 'en') == ['00000']
assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_normalization():
assert tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
"715",
"crσσks",
"by",
"bon",
"iver",
]
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
"715",
"crσσks",
"by",
"bon",
"iver",
]


def test_uncurl_quotes():
assert lossy_tokenize("let’s", 'en') == ["let's"]
assert word_frequency("let’s", 'en') == word_frequency("let's", 'en')
assert lossy_tokenize("let’s", "en") == ["let's"]
assert word_frequency("let’s", "en") == word_frequency("let's", "en")


def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
ff = word_frequency("flip-flop", "en")
assert ff > 0
phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
phrase_freq = 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency(
"flop", "en"
)
assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
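test_phrase_freq encodes the rule wordfreq uses for multi-token phrases: the reciprocal of the phrase frequency is the sum of the reciprocals of the token frequencies. A tiny worked sketch with invented numbers (these are not the real frequencies of 'flip' and 'flop'):

    # Invented token frequencies, for illustration only.
    f_flip = 1e-5
    f_flop = 2e-5
    # Reciprocal-sum combination: 1 / f_phrase = 1 / f_flip + 1 / f_flop
    f_phrase = 1.0 / (1.0 / f_flip + 1.0 / f_flop)
    assert abs(f_phrase - 6.6667e-6) < 1e-9  # always smaller than either token's frequency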


def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't
# use it
assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
assert random_words(nwords=4, lang="en", bits_per_word=0) == "the the the the"

# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
assert random_ascii_words(nwords=4, lang="ja", bits_per_word=0) == "1 1 1 1"


def test_not_enough_ascii():
with pytest.raises(ValueError):
random_ascii_words(lang='zh', bits_per_word=16)
random_ascii_words(lang="zh", bits_per_word=16)


def test_arabic():
# Remove tatweels
assert tokenize('متــــــــعب', 'ar') == ['متعب']
assert tokenize("متــــــــعب", "ar") == ["متعب"]

# Remove combining marks
assert tokenize('حَرَكَات', 'ar') == ['حركات']
assert tokenize("حَرَكَات", "ar") == ["حركات"]

# An Arabic ligature that is affected by NFKC normalization
assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
assert tokenize("\ufefb", "ar") == ["\u0644\u0627"]


def test_ideographic_fallback():
@ -228,28 +258,33 @@ def test_ideographic_fallback():
#
# More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py.
assert tokenize('中国文字', 'en') == ['中国文字']
assert tokenize("中国文字", "en") == ["中国文字"]


def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
assert tokenize("การเล่นดนตรี", "th") == ["การเล่นดนตรี"]
assert tokenize('"การเล่นดนตรี" means "playing music"', "en") == [
"การเล่นดนตรี",
"means",
"playing",
"music",
]

# Test Khmer, a script similar to Thai
assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
assert tokenize("សូមស្វាគមន៍", "km") == ["សូមស្វាគមន៍"]

# Test Hindi -- tokens split where there are spaces, and not where there aren't
assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
assert tokenize("हिन्दी विक्षनरी", "hi") == ["हिन्दी", "विक्षनरी"]

# Remove vowel points in Hebrew
assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
assert tokenize("דֻּגְמָה", "he") == ["דגמה"]

# Deal with commas, cedillas, and I's in Turkish
assert tokenize('kișinin', 'tr') == ['kişinin']
assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
assert tokenize("kișinin", "tr") == ["kişinin"]
assert tokenize("KİȘİNİN", "tr") == ["kişinin"]

# Deal with cedillas that should be commas-below in Romanian
assert tokenize('acelaşi', 'ro') == ['același']
assert tokenize('ACELAŞI', 'ro') == ['același']
assert tokenize("acelaşi", "ro") == ["același"]
assert tokenize("ACELAŞI", "ro") == ["același"]

@ -3,7 +3,7 @@ import pytest


def test_tokens():
assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
assert tokenize("おはようございます", "ja") == ["おはよう", "ござい", "ます"]


def test_simple_tokenize():
@ -17,13 +17,12 @@ def test_simple_tokenize():
#
# We used to try to infer word boundaries between hiragana and katakana,
# but this leads to edge cases that are unsolvable without a dictionary.
ja_text = 'ひらがなカタカナromaji'
assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

ja_text = "ひらがなカタカナromaji"
assert simple_tokenize(ja_text) == ["ひらがなカタカナ", "romaji"]

# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
# but sticks together in simple_tokenize
assert simple_tokenize('おはようございます') == ['おはようございます']
assert simple_tokenize("おはようございます") == ["おはようございます"]

# Names that use the weird possessive marker ヶ, which is technically a
# katakana even though it's being used like a kanji, stay together as one
@ -43,17 +42,13 @@ def test_simple_tokenize():
assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]




def test_combination():
ohayou_freq = word_frequency('おはよう', 'ja')
gozai_freq = word_frequency('ござい', 'ja')
masu_freq = word_frequency('ます', 'ja')
ohayou_freq = word_frequency("おはよう", "ja")
gozai_freq = word_frequency("ござい", "ja")
masu_freq = word_frequency("ます", "ja")

assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)

assert (
1.0 / word_frequency('おはようございます', 'ja') ==
pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
assert word_frequency("おはようおはよう", "ja") == pytest.approx(ohayou_freq / 2, rel=0.01)

assert 1.0 / word_frequency("おはようございます", "ja") == pytest.approx(
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01
)


@ -3,16 +3,14 @@ import pytest


def test_tokens():
assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
assert tokenize("감사합니다", "ko") == ["감사", "합니다"]


def test_combination():
gamsa_freq = word_frequency('감사', 'ko')
habnida_freq = word_frequency('합니다', 'ko')
gamsa_freq = word_frequency("감사", "ko")
habnida_freq = word_frequency("합니다", "ko")

assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
assert (
1.0 / word_frequency('감사합니다', 'ko') ==
pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
assert word_frequency("감사감사", "ko") == pytest.approx(gamsa_freq / 2, rel=0.01)
assert 1.0 / word_frequency("감사합니다", "ko") == pytest.approx(
1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01
)


58
tests/test_numbers.py
Normal file
@ -0,0 +1,58 @@
from wordfreq import word_frequency
from wordfreq.numbers import digit_freq, smash_numbers
from pytest import approx


def test_number_smashing():
assert smash_numbers("1") == "1"
assert smash_numbers("3.14") == "0.00"
assert smash_numbers("24601") == "00000"


def test_decimals():
assert word_frequency("3.14", "el") > word_frequency("4.14", "el")
assert word_frequency("3.14", "el") == word_frequency("3.15", "el")
assert word_frequency("3,14", "de") > word_frequency("4,14", "de")
assert word_frequency("3,14", "de") == word_frequency("3,15", "de")


def test_year_distribution():
assert word_frequency("2010", "en") > word_frequency("1010", "en")
assert word_frequency("2010", "en") > word_frequency("3010", "en")


def test_boundaries():
assert word_frequency("9", "en") > word_frequency("10", "en")
assert word_frequency("99", "en") > word_frequency("100", "en")
assert word_frequency("999", "en") > word_frequency("1000", "en")
assert word_frequency("9999", "en") > word_frequency("10000", "en")


def test_multiple_words():
once = word_frequency("2015b", "en")
twice = word_frequency("2015b 2015b", "en")
assert once == approx(2 * twice)


def test_distribution():
assert word_frequency("24601", "en") > word_frequency("90210", "en")
assert word_frequency("7", "en") > word_frequency("007", "en")
assert word_frequency("404", "en") == word_frequency("418", "en")

def test_3digit_sum():
"""
Test that the probability distribution given you have a 3-digit sequence
adds up to approximately 1.
"""
three_digit_sum = sum(digit_freq(f"{num:03d}") for num in range(0, 1000))
assert three_digit_sum == approx(1.0)

def test_4digit_sum():
"""
Test that the probability distribution given you have a 4-digit sequence
adds up to approximately 1.
"""
four_digit_sum = sum(digit_freq(f"{num:04d}") for num in range(0, 10000))
assert 0.999 < four_digit_sum < 1.0
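One step in these new tests is worth spelling out: test_multiple_words expects "2015b 2015b" to be half as frequent as "2015b" because word_frequency combines tokens by summing reciprocals, so repeating a token doubles the reciprocal. A minimal sketch of that arithmetic (the frequency value is invented):

    # Invented stand-in for word_frequency("2015b", "en"); illustration only.
    f_once = 3e-7
    f_twice = 1.0 / (2 * (1.0 / f_once))  # reciprocal-sum over the repeated token
    assert abs(f_twice - f_once / 2) < 1e-20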

@ -5,14 +5,26 @@ from wordfreq.preprocess import preprocess_text
def test_transliteration():
# "Well, there's a lot of things you do not understand."
# (from somewhere in OpenSubtitles)
assert (
tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
assert (
tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
assert tokenize("Па, има ту много ствари које не схваташ.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]
assert tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]

# I don't have examples of complete sentences in Azerbaijani that are
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@ -20,14 +32,14 @@ def test_transliteration():
# So here are some individual words.

# 'library' in Azerbaijani Cyrillic
assert preprocess_text('китабхана', 'az') == 'kitabxana'
assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
assert preprocess_text("китабхана", "az") == "kitabxana"
assert preprocess_text("КИТАБХАНА", "az") == "kitabxana"
assert preprocess_text("KİTABXANA", "az") == "kitabxana"

# 'scream' in Azerbaijani Cyrillic
assert preprocess_text('бағырты', 'az') == 'bağırtı'
assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
assert preprocess_text("бағырты", "az") == "bağırtı"
assert preprocess_text("БАҒЫРТЫ", "az") == "bağırtı"
assert preprocess_text("BAĞIRTI", "az") == "bağırtı"


def test_actually_russian():
@ -38,13 +50,12 @@ def test_actually_russian():
# We make sure to handle this case so we don't end up with a mixed-script
# word like "pacanы".

assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
assert tokenize("культуры", 'sr') == ["kul'tury"]
assert tokenize("сто из ста, пацаны!", "sr") == ["sto", "iz", "sta", "pacany"]
assert tokenize("культуры", "sr") == ["kul'tury"]


def test_alternate_codes():
# Try language codes for Serbo-Croatian that have been split, and now
# are canonically mapped to Serbian
assert tokenize("культуры", 'sh') == ["kul'tury"]
assert tokenize("культуры", 'hbs') == ["kul'tury"]

assert tokenize("культуры", "sh") == ["kul'tury"]
assert tokenize("культуры", "hbs") == ["kul'tury"]

@ -13,7 +13,7 @@ import warnings

from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
from .numbers import digit_freq
from .numbers import digit_freq, has_digit_sequence, smash_numbers

logger = logging.getLogger(__name__)

@ -234,7 +234,7 @@ _wf_cache: Dict[Tuple[str, str, str, float], float] = {}

def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> float:
tokens = lossy_tokenize(word, lang)
dfreq = digit_freq(word)

if not tokens:
return minimum

@ -245,13 +245,20 @@ def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> float:
freqs = get_frequency_dict(lang, wordlist)
one_over_result = 0.0
for token in tokens:
if token not in freqs:
smashed = smash_numbers(token)
if smashed not in freqs:
# If any word is missing, just return the default value
return minimum
# spread the frequency of digits over all digit combinations
freq = freqs[token]
freq = freqs[smashed]
if smashed != token:
# If there is a digit sequence in the token, the digits are
# internally replaced by 0s to aggregate their probabilities
# together. We then assign a specific frequency to the digit
# sequence using the `digit_freq` distribution.
freq *= digit_freq(token)
one_over_result += 1.0 / freq

# Combine the frequencies of tokens we looked up.
freq = 1.0 / one_over_result

if get_language_info(lang)["tokenizer"] == "jieba":
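To make the new control flow concrete, here is a standalone sketch of the digit path through _word_frequency, using an invented wordlist entry in place of get_frequency_dict:

    # Sketch only: the aggregate frequency for "00000" below is made up.
    from wordfreq.numbers import digit_freq, smash_numbers

    fake_freqs = {"00000": 2e-5}
    token = "24601"
    smashed = smash_numbers(token)      # "00000"
    freq = fake_freqs[smashed]
    if smashed != token:
        freq *= digit_freq(token)       # this token's share of the 5-digit mass
    one_over_result = 1.0 / freq        # then combined with the other tokens' reciprocals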
@ -334,13 +341,18 @@ def top_n_list(
Return a frequency list of length `n` in descending order of frequency.
This list contains words from `wordlist`, of the given language.
If `ascii_only`, then only ascii words are considered.

The frequency list will not contain multi-digit sequences, because we
estimate the frequencies of those using the functions in `numbers.py`,
not using a wordlist that contains all of them.
"""
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= "~":
results.append(word)
if len(results) >= n:
break
if not has_digit_sequence(word):
results.append(word)
if len(results) >= n:
break
return results
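A quick way to see the effect of the has_digit_sequence filter, assuming a build of wordfreq that includes this commit:

    # Sketch: no top-list entry should contain a multi-digit sequence any more.
    from wordfreq import top_n_list
    from wordfreq.numbers import has_digit_sequence

    assert not any(has_digit_sequence(word) for word in top_n_list("en", 1000))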


@ -1,4 +1,4 @@
from .preprocess import MULTI_DIGIT_RE
import regex

# Frequencies of leading digits, according to Benford's law, sort of.
# Benford's law doesn't describe numbers with leading zeroes, because "007"
@ -11,23 +11,37 @@ DIGIT_FREQS = [0.009, 0.300, 0.175, 0.124, 0.096, 0.078, 0.066, 0.057, 0.050, 0.
#
# We do this with a piecewise exponential function whose peak is a plateau covering
# the years 2019 to 2039.
#
# YEAR_LOG_PEAK is chosen by experimentation to make this probability add up to about
# .994. Here, that represents P(token represents a year) | P(token is 4 digits).
# The other .006 represents P(token does not represent a year) | P(token is 4 digits).

YEAR_LOG_PEAK = -1.875
NOT_YEAR_PROB = 0.006
# Determined by experimentation: makes the probabilities of all years add up to 90%.
# The other 10% goes to NOT_YEAR_PROB. tests/test_numbers.py confirms that this
# probability distribution adds up to 1.
YEAR_LOG_PEAK = -1.9185
NOT_YEAR_PROB = 0.1
REFERENCE_YEAR = 2019
PLATEAU_WIDTH = 20

DIGIT_RE = regex.compile(r"\d")
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
PURE_DIGIT_RE = regex.compile(r"\d+")

def benford_freq(text: str) -> float:
"""
Estimate the frequency of a digit sequence according to Benford's law.
"""
first_digit = int(text[0])
return DIGIT_FREQS[first_digit] / 10 ** (len(text) - 1)
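Two of the new test assertions follow directly from this formula: "404" and "418" get the same estimate, because only the leading digit and the number of digits matter, and every extra digit divides the estimate by another factor of 10. A worked number, using the DIGIT_FREQS entry for a leading 4 shown in the hunk header above:

    # benford_freq("404") = DIGIT_FREQS[4] / 10 ** (3 - 1) = 0.096 / 100
    assert abs(0.096 / 10 ** (3 - 1) - 0.00096) < 1e-12
    # "418" has the same leading digit and length, so it gets exactly the same value.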


def year_freq(text: str) -> float:
"""
Estimate the relative frequency of a particular 4-digit sequence representing
a year.

For example, suppose text == "1985". We're estimating the probability that a
randomly-selected token from a large corpus will be "1985" and refer to the
year, _given_ that it is 4 digits. Tokens that are not 4 digits are not involved
in the probability distribution.
"""
year = int(text)

# Fitting a line to the curve seen at
@ -60,13 +74,38 @@ def year_freq(text: str) -> float:


def digit_freq(text: str) -> float:
"""
Get the relative frequency of a string of digits, using our estimates.
"""
freq = 1.0
for match in MULTI_DIGIT_RE.findall(text):
if len(match) == 4:
freq *= year_freq(match)
else:
freq *= benford_freq(match)
for submatch in PURE_DIGIT_RE.findall(match):
if len(submatch) == 4:
freq *= year_freq(submatch)
else:
freq *= benford_freq(submatch)
return freq
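A worked example of how the rewritten loop treats decimals, which is what test_decimals exercises: for "3.14", MULTI_DIGIT_RE matches the whole "3.14", and PURE_DIGIT_RE then splits it into the runs "3" and "14", each scored by Benford's law.

    # digit_freq("3.14") = benford_freq("3") * benford_freq("14")
    #                    = DIGIT_FREQS[3] * (DIGIT_FREQS[1] / 10)
    assert abs(0.124 * (0.300 / 10) - 0.00372) < 1e-12
    # "3.15" scores the same ("15" also starts with 1 and has two digits),
    # while "4.14" scores lower because DIGIT_FREQS[4] < DIGIT_FREQS[3].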


print(sum(digit_freq("%04d" % year) for year in range(0, 10000)))
def has_digit_sequence(text: str) -> bool:
"""
Returns True iff the text has a digit sequence that will be normalized out
and handled with `digit_freq`.
"""
return bool(MULTI_DIGIT_RE.match(text))


def _sub_zeroes(match: regex.Match) -> str:
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub("0", match.group(0))


def smash_numbers(text: str) -> str:
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
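Putting _sub_zeroes and smash_numbers together on a small mixed string, consistent with the smash_numbers asserts in tests/test_numbers.py:

    # Each multi-digit run is zeroed out; single digits and other text are untouched,
    # because MULTI_DIGIT_RE needs a digit followed by at least one more digit, '.' or ','.
    from wordfreq.numbers import smash_numbers

    assert smash_numbers("24601 copies of 3.14, take 1") == "00000 copies of 0.00, take 1"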

@ -7,10 +7,6 @@ from langcodes import Language

MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)

DIGIT_RE = regex.compile(r"\d")
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")


def preprocess_text(text: str, language: Language) -> str:
"""
This function applies pre-processing steps that convert forms of words
@ -251,19 +247,3 @@ def cedillas_to_commas(text: str) -> str:
"\N{LATIN SMALL LETTER T WITH CEDILLA}",
"\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
)


def _sub_zeroes(match: regex.Match) -> str:
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub("0", match.group(0))


def smash_numbers(text: str) -> str:
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(_sub_zeroes, text)

@ -10,7 +10,7 @@ from .language_info import (
SPACELESS_SCRIPTS,
EXTRA_JAPANESE_CHARACTERS,
)
from .preprocess import preprocess_text, smash_numbers
from .preprocess import preprocess_text

# Placeholders for CJK functions that we'll import on demand
_mecab_tokenize = None
@ -309,13 +309,6 @@ def lossy_tokenize(

In particular:

- Any sequence of 2 or more adjacent digits, possibly with intervening
punctuation such as a decimal point, will replace each digit with '0'
so that frequencies for numbers don't have to be counted separately.

This is similar to but not quite identical to the word2vec Google News
data, which replaces digits with '#' in tokens with more than one digit.

- In Chinese, unless Traditional Chinese is specifically requested using
'zh-Hant', all characters will be converted to Simplified Chinese.

@ -334,4 +327,4 @@ def lossy_tokenize(

tokens = [_simplify_chinese(token) for token in tokens]

return [uncurl_quotes(smash_numbers(token)) for token in tokens]
return [uncurl_quotes(token) for token in tokens]
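Read together with the _word_frequency changes above, the net effect of this last hunk is that lossy_tokenize no longer zeroes out digit sequences itself; going by this diff alone:

    # Sketch of the behavior change (inferred from this diff, not run against a release):
    # before: lossy_tokenize("24601", "en") == ["00000"]
    # after:  lossy_tokenize("24601", "en") == ["24601"]
    # The number handling now happens inside word_frequency(), via smash_numbers()
    # and digit_freq().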