estimate the freq distribution of numbers

Elia Robyn Lake 2022-03-10 18:33:42 -05:00
parent 4e373750e8
commit bf05b1b1dc
14 changed files with 552 additions and 405 deletions

poetry.lock (generated)

@@ -61,7 +61,7 @@ uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "click"
version = "8.0.3"
version = "8.0.4"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
@@ -103,17 +103,14 @@ pyflakes = ">=2.4.0,<2.5.0"
[[package]]
name = "ftfy"
version = "6.0.3"
description = "Fixes some problems with Unicode text after the fact"
version = "6.1.1"
description = "Fixes mojibake and other problems with Unicode, after the fact"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7,<4"
[package.dependencies]
wcwidth = "*"
[package.extras]
docs = ["furo", "sphinx"]
wcwidth = ">=0.2.5"
[[package]]
name = "importlib-metadata"
@@ -149,7 +146,7 @@ python-versions = "*"
[[package]]
name = "ipython"
version = "7.31.1"
version = "7.32.0"
description = "IPython: Productive Interactive Computing"
category = "dev"
optional = false
@@ -242,7 +239,7 @@ python-versions = "*"
[[package]]
name = "mecab-python3"
version = "1.0.4"
version = "1.0.5"
description = "Python wrapper for the MeCab morphological analyzer for Japanese"
category = "dev"
optional = false
@@ -338,7 +335,7 @@ python-versions = "*"
[[package]]
name = "platformdirs"
version = "2.5.0"
version = "2.5.1"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
@@ -365,7 +362,7 @@ testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "prompt-toolkit"
version = "3.0.27"
version = "3.0.28"
description = "Library for building powerful interactive command lines in Python"
category = "dev"
optional = false
@@ -449,11 +446,11 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm
[[package]]
name = "regex"
version = "2022.1.18"
version = "2022.3.2"
description = "Alternative regular expression module, to replace re."
category = "main"
optional = false
python-versions = "*"
python-versions = ">=3.6"
[[package]]
name = "toml"
@@ -492,7 +489,7 @@ python-versions = ">=3.6"
[[package]]
name = "types-setuptools"
version = "57.4.9"
version = "57.4.10"
description = "Typing stubs for setuptools"
category = "dev"
optional = false
@@ -500,7 +497,7 @@ python-versions = "*"
[[package]]
name = "typing-extensions"
version = "4.0.1"
version = "4.1.1"
description = "Backported and Experimental Type Hints for Python 3.6+"
category = "main"
optional = false
@@ -529,7 +526,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata]
lock-version = "1.1"
python-versions = "^3.7"
content-hash = "a3b1a9c3b80e338764f1907a77e31f59d6e1e231092b7813182e09e55d7c2f45"
content-hash = "8507a13e0c8c79c30e911cc5f32bdc35284304246ae50531917df6197d7dcab8"
[metadata.files]
appnope = [
@@ -574,8 +571,8 @@ black = [
{file = "black-22.1.0.tar.gz", hash = "sha256:a7c0192d35635f6fc1174be575cb7915e92e5dd629ee79fdaf0dcfa41a80afb5"},
]
click = [
{file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"},
{file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"},
{file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"},
{file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
@@ -590,7 +587,8 @@ flake8 = [
{file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"},
]
ftfy = [
{file = "ftfy-6.0.3.tar.gz", hash = "sha256:ba71121a9c8d7790d3e833c6c1021143f3e5c4118293ec3afb5d43ed9ca8e72b"},
{file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
{file = "ftfy-6.1.1.tar.gz", hash = "sha256:bfc2019f84fcd851419152320a6375604a0f1459c281b5b199b2cd0d2e727f8f"},
]
importlib-metadata = [
{file = "importlib_metadata-4.2.0-py3-none-any.whl", hash = "sha256:057e92c15bc8d9e8109738a48db0ccb31b4d9d5cfbee5a8670879a30be66304b"},
@@ -604,8 +602,8 @@ ipadic = [
{file = "ipadic-1.0.0.tar.gz", hash = "sha256:f5923d31eca6131acaaf18ed28d8998665b1347b640d3a6476f64650e9a71c07"},
]
ipython = [
{file = "ipython-7.31.1-py3-none-any.whl", hash = "sha256:55df3e0bd0f94e715abd968bedd89d4e8a7bce4bf498fb123fed4f5398fea874"},
{file = "ipython-7.31.1.tar.gz", hash = "sha256:b5548ec5329a4bcf054a5deed5099b0f9622eb9ea51aaa7104d215fece201d8c"},
{file = "ipython-7.32.0-py3-none-any.whl", hash = "sha256:86df2cf291c6c70b5be6a7b608650420e89180c8ec74f376a34e2dc15c3400e7"},
{file = "ipython-7.32.0.tar.gz", hash = "sha256:468abefc45c15419e3c8e8c0a6a5c115b2127bafa34d7c641b1d443658793909"},
]
jedi = [
{file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"},
@@ -630,23 +628,27 @@ mecab-ko-dic = [
{file = "mecab-ko-dic-1.0.0.tar.gz", hash = "sha256:3ba22858736e02e8a0e92f2a7f099528c733ae47701b29d12c75e982a85d1f11"},
]
mecab-python3 = [
{file = "mecab-python3-1.0.4.tar.gz", hash = "sha256:b150ad5fe4260539b4ef184657e552ef81307fbbe60ae1f258bc814549ea90f8"},
{file = "mecab_python3-1.0.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:3c7e87c65160e5e4edb08cb80dbce50f4e711c53f45063321aab72ab2566ffe4"},
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2fbed960ef82f4192b31efd88af1f3c24cd1692b62720ed70d7e314a50f581e"},
{file = "mecab_python3-1.0.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cb6eb6cc47e3937a2edfaa9595dc2d165ed9f025e3a53bd0a5033a12fa6bcdcf"},
{file = "mecab_python3-1.0.4-cp36-cp36m-win_amd64.whl", hash = "sha256:b149b51f0f62c9512d219c9e79c6db2eb66e70863a97eb412d8fc3ba7a25f351"},
{file = "mecab_python3-1.0.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:c1606b35df0136b3e9dc7add2e69d2c1151e69fd5675c0cde62d0b017b2319e7"},
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53b0b899ef03f364bfd7fa28f260ee1e893e4f47ff90a141a522709b892f0a4e"},
{file = "mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:52a789c708f8b89044236201eb03c7fe5517fad5210a9de2230c7d99a2a8c760"},
{file = "mecab_python3-1.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:d6ca73c0dec72038290faa6de17d57d771535eb47c22346e170dffcb82d696bb"},
{file = "mecab_python3-1.0.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:18e14dfe3d8c66cfa1c9f49e3bc8ac480b79a433ec9e5b5d2c1fb73f36ec7c3e"},
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:221256b84be0ee29dc8fa450210236b40707b9d63cfc70de5102d2531622d062"},
{file = "mecab_python3-1.0.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:de39b82f44d97fc0fd636644ad14c9662f51afcd73775379d5a8b1eb20ee85a6"},
{file = "mecab_python3-1.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:96d9e8c098401fb4b5bd32258f4952f3b22cdb30ab291f5ff82eae1d0941cbed"},
{file = "mecab_python3-1.0.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:dcd62ebf2eecde1263119b92ff5379a046bb8231cb999fafda00f0925dfcb67e"},
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178b632b717e3249054a7ad4c0fbc60ce8493d357afa7673d535ffa11e45eaba"},
{file = "mecab_python3-1.0.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fbfad60261ad3b9390b8615528fc013302a3e8febba220f799216c1a1154ee7e"},
{file = "mecab_python3-1.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:445b4f5ee5674d85f6de2726ec28991801844ff71eb096129da5f5ba077d5a87"},
{file = "mecab-python3-1.0.5.tar.gz", hash = "sha256:e703d78c88a671abb8170351644850015d9bbfab31530a3b40d12481a6779a11"},
{file = "mecab_python3-1.0.5-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:8a64bd228704ed9b24da5cbd6c4e325ef22310227153ef481f9037183351aa10"},
{file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf083884116fa05ca0394c4c8d62013a4954fbac414c33a1931906ddf0f3585a"},
{file = "mecab_python3-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fe020df27b249f43df3d38b84473d226e36d6d4a31f951cedbddabfcc450e36"},
{file = "mecab_python3-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:644f781de083311fcf81f7d55f21a756ceef7ebae7c111bd50a2c9d0855c1927"},
{file = "mecab_python3-1.0.5-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:4309a91f0d5b66d3f0e8c9ba5a4d3cf7dbac1334269338704599820e051d1d7f"},
{file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be2d1cd2ecd1f04b91eb0e26c906f21b50b8526e977f7f01f3901f9a6306944"},
{file = "mecab_python3-1.0.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:644bbde31ab1244ff18fb1dcac1e5fee8121f8b27a5c3e041c01ebc301df9266"},
{file = "mecab_python3-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:401a2d1608b6503cb755d7d864ad74b64a7a4346309235f84577de807bb29050"},
{file = "mecab_python3-1.0.5-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:5f91d5d8a9ac0ea7351e5e2423df98dd463b02013e006b18096cd365de37b2a9"},
{file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc8ce0151b973f4ca15e651619264442011568ebe48c6fce51d55e64f7e5c2e1"},
{file = "mecab_python3-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e381df4c55f3ec5bccbb5625c65c54ecf982c215574d1102aff2803ac1a24cd"},
{file = "mecab_python3-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8eaaa78227f470c4cf1d6c2a87b92889041f317517fbe65e635b86ea0c84a194"},
{file = "mecab_python3-1.0.5-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dd8601565dd1331ee5cd67bcc45f713cebc14b730ee2e956ed120a0ec6e4fd8a"},
{file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76a40f717f9592bd12edc7bcf1fa869f4c8058e5d0b80d4cc6c301435afb1f96"},
{file = "mecab_python3-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f299d6ef96495371f5a622a7004a205e303dabba1fc3a7f9a07e741e315ed2b"},
{file = "mecab_python3-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4cdb07edbbd508d9b98ac9529e0ff0b89d93e50a6beeb7b8b946439594bf5e01"},
{file = "mecab_python3-1.0.5-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:eb412a25e485e33d7ab69262b58f7365b727f8c447e4c9c1c56b5fd91414ecd2"},
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91e8ac11ef4440418312dd4f1f200f7957fdc0148bb49dc049264c5d07bed527"},
{file = "mecab_python3-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae1c126cf4982035794042280998066c8b6d26eb89136731078d9105a7070c13"},
{file = "mecab_python3-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:34a196c6a410e57f975ee077d075ac994b94bb6930b04e207e59e7c7521ecb58"},
]
msgpack = [
{file = "msgpack-1.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:96acc674bb9c9be63fa8b6dabc3248fdc575c4adc005c440ad02f87ca7edd079"},
@@ -731,16 +733,16 @@ pickleshare = [
{file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"},
]
platformdirs = [
{file = "platformdirs-2.5.0-py3-none-any.whl", hash = "sha256:30671902352e97b1eafd74ade8e4a694782bd3471685e78c32d0fdfd3aa7e7bb"},
{file = "platformdirs-2.5.0.tar.gz", hash = "sha256:8ec11dfba28ecc0715eb5fb0147a87b1bf325f349f3da9aab2cd6b50b96b692b"},
{file = "platformdirs-2.5.1-py3-none-any.whl", hash = "sha256:bcae7cab893c2d310a711b70b24efb93334febe65f8de776ee320b517471e227"},
{file = "platformdirs-2.5.1.tar.gz", hash = "sha256:7535e70dfa32e84d4b34996ea99c5e432fa29a708d0f4e394bbcb2a8faa4f16d"},
]
pluggy = [
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
]
prompt-toolkit = [
{file = "prompt_toolkit-3.0.27-py3-none-any.whl", hash = "sha256:cb7dae7d2c59188c85a1d6c944fad19aded6a26bd9c8ae115a4e1c20eb90b713"},
{file = "prompt_toolkit-3.0.27.tar.gz", hash = "sha256:f2b6a8067a4fb959d3677d1ed764cc4e63e0f6f565b9a4fc7edc2b18bf80217b"},
{file = "prompt_toolkit-3.0.28-py3-none-any.whl", hash = "sha256:30129d870dcb0b3b6a53efdc9d0a83ea96162ffd28ffe077e94215b233dc670c"},
{file = "prompt_toolkit-3.0.28.tar.gz", hash = "sha256:9f1cd16b1e86c2968f2519d7fb31dd9d669916f515612c269d14e9ed52b51650"},
]
ptyprocess = [
{file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
@@ -771,80 +773,80 @@ pytest = [
{file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"},
]
regex = [
{file = "regex-2022.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5"},
{file = "regex-2022.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4"},
{file = "regex-2022.1.18-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9"},
{file = "regex-2022.1.18-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7"},
{file = "regex-2022.1.18-cp310-cp310-win32.whl", hash = "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d"},
{file = "regex-2022.1.18-cp310-cp310-win_amd64.whl", hash = "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184"},
{file = "regex-2022.1.18-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3"},
{file = "regex-2022.1.18-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43"},
{file = "regex-2022.1.18-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a"},
{file = "regex-2022.1.18-cp36-cp36m-win32.whl", hash = "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3"},
{file = "regex-2022.1.18-cp36-cp36m-win_amd64.whl", hash = "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633"},
{file = "regex-2022.1.18-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8"},
{file = "regex-2022.1.18-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c"},
{file = "regex-2022.1.18-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a"},
{file = "regex-2022.1.18-cp37-cp37m-win32.whl", hash = "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6"},
{file = "regex-2022.1.18-cp37-cp37m-win_amd64.whl", hash = "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118"},
{file = "regex-2022.1.18-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899"},
{file = "regex-2022.1.18-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288"},
{file = "regex-2022.1.18-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c"},
{file = "regex-2022.1.18-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d"},
{file = "regex-2022.1.18-cp38-cp38-win32.whl", hash = "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02"},
{file = "regex-2022.1.18-cp38-cp38-win_amd64.whl", hash = "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e"},
{file = "regex-2022.1.18-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d"},
{file = "regex-2022.1.18-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d"},
{file = "regex-2022.1.18-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a"},
{file = "regex-2022.1.18-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093"},
{file = "regex-2022.1.18-cp39-cp39-win32.whl", hash = "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1"},
{file = "regex-2022.1.18-cp39-cp39-win_amd64.whl", hash = "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442"},
{file = "regex-2022.1.18.tar.gz", hash = "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916"},
{file = "regex-2022.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7"},
{file = "regex-2022.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c"},
{file = "regex-2022.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f"},
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d"},
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729"},
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2"},
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8"},
{file = "regex-2022.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd"},
{file = "regex-2022.3.2-cp310-cp310-win32.whl", hash = "sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834"},
{file = "regex-2022.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7"},
{file = "regex-2022.3.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204"},
{file = "regex-2022.3.2-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7"},
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30"},
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0"},
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede"},
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4"},
{file = "regex-2022.3.2-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29"},
{file = "regex-2022.3.2-cp36-cp36m-win32.whl", hash = "sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863"},
{file = "regex-2022.3.2-cp36-cp36m-win_amd64.whl", hash = "sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b"},
{file = "regex-2022.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1"},
{file = "regex-2022.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3"},
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231"},
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f"},
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960"},
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737"},
{file = "regex-2022.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec"},
{file = "regex-2022.3.2-cp37-cp37m-win32.whl", hash = "sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f"},
{file = "regex-2022.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0"},
{file = "regex-2022.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640"},
{file = "regex-2022.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66"},
{file = "regex-2022.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178"},
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571"},
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07"},
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f"},
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9"},
{file = "regex-2022.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056"},
{file = "regex-2022.3.2-cp38-cp38-win32.whl", hash = "sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0"},
{file = "regex-2022.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47"},
{file = "regex-2022.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3"},
{file = "regex-2022.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee"},
{file = "regex-2022.3.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b"},
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c"},
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5"},
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4"},
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447"},
{file = "regex-2022.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015"},
{file = "regex-2022.3.2-cp39-cp39-win32.whl", hash = "sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1"},
{file = "regex-2022.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7"},
{file = "regex-2022.3.2.tar.gz", hash = "sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b"},
]
toml = [
{file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
@@ -885,12 +887,12 @@ typed-ast = [
{file = "typed_ast-1.5.2.tar.gz", hash = "sha256:525a2d4088e70a9f75b08b3f87a51acc9cde640e19cc523c7e41aa355564ae27"},
]
types-setuptools = [
{file = "types-setuptools-57.4.9.tar.gz", hash = "sha256:536ef74744f8e1e4be4fc719887f886e74e4cf3c792b4a06984320be4df450b5"},
{file = "types_setuptools-57.4.9-py3-none-any.whl", hash = "sha256:948dc6863373750e2cd0b223a84f1fb608414cde5e55cf38ea657b93aeb411d2"},
{file = "types-setuptools-57.4.10.tar.gz", hash = "sha256:9a13513679c640f6616e2d9ab50d431c99ca8ae9848a97243f887c80fd5cf294"},
{file = "types_setuptools-57.4.10-py3-none-any.whl", hash = "sha256:ddc98da82c12e1208012d65276641a132d3aadc78ecfff68fd3e17d85933a3c1"},
]
typing-extensions = [
{file = "typing_extensions-4.0.1-py3-none-any.whl", hash = "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"},
{file = "typing_extensions-4.0.1.tar.gz", hash = "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e"},
{file = "typing_extensions-4.1.1-py3-none-any.whl", hash = "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2"},
{file = "typing_extensions-4.1.1.tar.gz", hash = "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42"},
]
wcwidth = [
{file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"},


@@ -4,6 +4,7 @@ version = "2.6.0"
description = "Look up the frequencies of words in many languages, based on many sources of data."
authors = ["Robyn Speer <rspeer@arborelia.net>"]
license = "MIT"
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.7"


@@ -3,17 +3,21 @@ from wordfreq import tokenize, word_frequency
def test_apostrophes():
# Test that we handle apostrophes in French reasonably.
assert tokenize("qu'un", 'fr') == ['qu', 'un']
assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
assert tokenize("l'heure", 'fr') == ['l', 'heure']
assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
assert tokenize("l'anima", 'it') == ['l', 'anima']
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
assert tokenize("qu'un", "fr") == ["qu", "un"]
assert tokenize("qu'un", "fr", include_punctuation=True) == ["qu'", "un"]
assert tokenize("langues d'oïl", "fr") == ["langues", "d", "oïl"]
assert tokenize("langues d'oïl", "fr", include_punctuation=True) == [
"langues",
"d'",
"oïl",
]
assert tokenize("l'heure", "fr") == ["l", "heure"]
assert tokenize("l'ànima", "ca") == ["l", "ànima"]
assert tokenize("l'anima", "it") == ["l", "anima"]
assert tokenize("l'heure", "fr", include_punctuation=True) == ["l'", "heure"]
assert tokenize("L'Hôpital", "fr", include_punctuation=True) == ["l'", "hôpital"]
assert tokenize("aujourd'hui", "fr") == ["aujourd'hui"]
assert tokenize("This isn't French", "en") == ["this", "isn't", "french"]
# This next behavior is not ideal -- we would prefer "dell'" to be handled
# the same as "l'" -- but this is the most consistent result we can get without
@@ -21,26 +25,28 @@ def test_apostrophes():
#
# Versions of regex from 2019 and earlier would give ['dell', 'anima'], which
# is better but inconsistent.
assert tokenize("dell'anima", 'it') == ["dell'anima"]
assert tokenize("dell'anima", "it") == ["dell'anima"]
# Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's
# an example of why we don't want the apostrophe-vowel fix to apply everywhere.
assert tokenize("hawai'i", 'en') == ["hawai'i"]
assert tokenize("hawai'i", "en") == ["hawai'i"]
def test_catastrophes():
# More apostrophes, but this time they're in Catalan, and there's other
# mid-word punctuation going on too.
assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
assert (
tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
["m'", 'acabo', "d'", 'instal·lar', '.']
)
assert tokenize("M'acabo d'instal·lar.", "ca") == ["m", "acabo", "d", "instal·lar"]
assert tokenize("M'acabo d'instal·lar.", "ca", include_punctuation=True) == [
"m'",
"acabo",
"d'",
"instal·lar",
".",
]
def test_alternate_codes():
# Try over-long language codes for French and Catalan
assert tokenize("qu'un", 'fra') == ['qu', 'un']
assert tokenize("qu'un", 'fre') == ['qu', 'un']
assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
assert tokenize("qu'un", "fra") == ["qu", "un"]
assert tokenize("qu'un", "fre") == ["qu", "un"]
assert tokenize("M'acabo d'instal·lar.", "cat") == ["m", "acabo", "d", "instal·lar"]


@@ -14,12 +14,12 @@ def test_gender_neutral_at():
"tod@s",
"l@s",
"trabajador@s",
"migrantes"
"migrantes",
]
text = "el distrito 22@ de Barcelona"
assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
assert lossy_tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
# It also appears in Portuguese
text = "direitos e deveres para @s membr@s da comunidade virtual"
@@ -32,7 +32,7 @@ def test_gender_neutral_at():
"membr@s",
"da",
"comunidade",
"virtual"
"virtual",
]
# Because this is part of our tokenization, the language code doesn't
@@ -43,10 +43,10 @@ def test_gender_neutral_at():
def test_at_in_corpus():
# We have a word frequency for "l@s"
assert word_frequency('l@s', 'es') > 0
assert word_frequency("l@s", "es") > 0
# It's not just treated as a word break
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
assert word_frequency("l@s", "es") < word_frequency("l s", "es")
def test_punctuation_at():
@@ -65,7 +65,7 @@ def test_punctuation_at():
"ao",
"lado",
"do",
"nick"
"nick",
]
assert tokenize(text, "pt", include_punctuation=True) == [
@@ -83,7 +83,7 @@ def test_punctuation_at():
"ao",
"lado",
"do",
"nick"
"nick",
]
# If the @ is not at the end of the word or part of the word ending '@s',
@@ -98,12 +98,9 @@ def test_punctuation_at():
"la",
"línea",
"all:all",
"all"
"all",
]
# Make sure not to catch e-mail addresses
text = "info@something.example"
assert tokenize(text, "en") == [
"info",
"something.example"
]
assert tokenize(text, "en") == ["info", "something.example"]


@@ -9,92 +9,112 @@ def test_tokens():
# (He was the Chinese Wikipedia's featured article of the day when I
# wrote this test.)
hobart = '加勒特·霍巴特' # Garret Hobart, or "jiā lè tè huò bā tè".
hobart = "加勒特·霍巴特" # Garret Hobart, or "jiā lè tè huò bā tè".
# He was the sixth American vice president to die in office.
fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'
fact_simplified = "他是历史上第六位在任期内去世的美国副总统。"
fact_traditional = "他是歷史上第六位在任期內去世的美國副總統。"
# His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token.
assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
assert tokenize(hobart, "zh") == ["加", "勒", "特", "霍", "巴特"]
assert tokenize(fact_simplified, 'zh') == [
assert tokenize(fact_simplified, "zh") == [
# he / is / history / in / #6 / counter for people
'他', '是', '历史', '上', '第六', '位',
"他",
"是",
"历史",
"上",
"第六",
"位",
# during / term of office / in / die
'在', '任期', '内', '去世',
"在",
"任期",
"内",
"去世",
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
"的",
"美国",
"副",
"总统",
]
# Jieba's original tokenizer knows a lot of names, it seems.
assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
assert tokenize(hobart, "zh", external_wordlist=True) == ["加勒特", "霍巴特"]
# We get almost the same tokens from the sentence using Jieba's own
# wordlist, but it tokenizes "in history" as two words and
# "sixth person" as one.
assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
assert tokenize(fact_simplified, "zh", external_wordlist=True) == [
# he / is / history / in / sixth person
'他', '是', '历史', '上', '第六位',
"他",
"是",
"历史",
"上",
"第六位",
# during / term of office / in / die
'在', '任期', '内', '去世',
"在",
"任期",
"内",
"去世",
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
"的",
"美国",
"副",
"总统",
]
# Check that Traditional Chinese works at all
assert word_frequency(fact_traditional, 'zh') > 0
assert word_frequency(fact_traditional, "zh") > 0
# You get the same token lengths if you look it up in Traditional Chinese,
# but the words are different
simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
assert ''.join(simp_tokens) == fact_simplified
assert ''.join(trad_tokens) == fact_traditional
simp_tokens = tokenize(fact_simplified, "zh", include_punctuation=True)
trad_tokens = tokenize(fact_traditional, "zh", include_punctuation=True)
assert "".join(simp_tokens) == fact_simplified
assert "".join(trad_tokens) == fact_traditional
simp_lengths = [len(token) for token in simp_tokens]
trad_lengths = [len(token) for token in trad_tokens]
assert simp_lengths == trad_lengths
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
xiexie_freq = word_frequency("谢谢", "zh") # "Thanks"
assert word_frequency("谢谢谢谢", "zh") == pytest.approx(xiexie_freq / 20, rel=0.01)
def test_alternate_codes():
# Tokenization of Chinese works when you use other language codes
# that are not equal to 'zh'.
tokens = ['谢谢', '谢谢']
tokens = ["谢谢", "谢谢"]
# Code with a region attached
assert tokenize('谢谢谢谢', 'zh-CN') == tokens
assert tokenize("谢谢谢谢", "zh-CN") == tokens
# Over-long codes for Chinese
assert tokenize('谢谢谢谢', 'chi') == tokens
assert tokenize('谢谢谢谢', 'zho') == tokens
assert tokenize("谢谢谢谢", "chi") == tokens
assert tokenize("谢谢谢谢", "zho") == tokens
# Separate codes for Mandarin and Cantonese
assert tokenize('谢谢谢谢', 'cmn') == tokens
assert tokenize('谢谢谢谢', 'yue') == tokens
assert tokenize("谢谢谢谢", "cmn") == tokens
assert tokenize("谢谢谢谢", "yue") == tokens
def test_unreasonably_long():
# This crashed earlier versions of wordfreq due to an overflow in
# exponentiation. We've now changed the sequence of operations so it
# will underflow instead.
lots_of_ls = 'l' * 800
assert word_frequency(lots_of_ls, 'zh') == 0.
assert zipf_frequency(lots_of_ls, 'zh') == 0.
lots_of_ls = "l" * 800
assert word_frequency(lots_of_ls, "zh") == 0.0
assert zipf_frequency(lots_of_ls, "zh") == 0.0
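The comment above describes the fix only as trading overflow for underflow. A small illustration of the difference, not wordfreq's actual code: combining hundreds of tiny per-token frequencies is safe when the large negative log sum is exponentiated at the end, because exponentiating a very negative number quietly underflows to 0.0, while a step that produces a very large positive argument raises OverflowError.

import math

log_freqs = [math.log(1e-8)] * 800      # hypothetical per-token log-frequencies
print(math.exp(sum(log_freqs)))          # 0.0 -- underflow is silent
try:
    math.exp(-sum(log_freqs))            # the opposite direction overflows
except OverflowError as err:
    print("OverflowError:", err)         # the crash the reordering avoids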
def test_hyphens():
# An edge case of Chinese tokenization that changed sometime around
# jieba 0.42.
tok = tokenize('--------', 'zh', include_punctuation=True)
assert tok == ['-'] * 8
tok = tokenize('--------', 'zh', include_punctuation=True, external_wordlist=True)
assert tok == ['--------']
tok = tokenize("--------", "zh", include_punctuation=True)
assert tok == ["-"] * 8
tok = tokenize("--------", "zh", include_punctuation=True, external_wordlist=True)
assert tok == ["--------"]


@@ -1,16 +1,22 @@
from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
word_frequency,
available_languages,
cB_to_freq,
top_n_list,
random_words,
random_ascii_words,
tokenize,
lossy_tokenize,
)
import pytest
def test_freq_examples():
# Stopwords are most common in the correct language
assert word_frequency('the', 'en') > word_frequency('de', 'en')
assert word_frequency('de', 'es') > word_frequency('the', 'es')
assert word_frequency("the", "en") > word_frequency("de", "en")
assert word_frequency("de", "es") > word_frequency("the", "es")
# We get word frequencies from the 'large' list when available
assert word_frequency('infrequency', 'en') > 0.
assert word_frequency("infrequency", "en") > 0.0
def test_languages():
@@ -20,33 +26,33 @@ def test_languages():
assert len(avail) >= 34
# 'small' covers the same languages, but with some different lists
avail_small = available_languages('small')
avail_small = available_languages("small")
assert len(avail_small) == len(avail)
assert avail_small != avail
# 'combined' is the same as 'small'
avail_old_name = available_languages('combined')
avail_old_name = available_languages("combined")
assert avail_old_name == avail_small
# 'large' covers fewer languages
avail_large = available_languages('large')
avail_large = available_languages("large")
assert len(avail_large) >= 14
assert len(avail) > len(avail_large)
# Look up the digit '2' in the main word list for each language
for lang in avail:
assert word_frequency('2', lang) > 0
assert word_frequency("2", lang) > 0
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert word_frequency('2', new_lang_code) > 0
new_lang_code = "%s-001-x-fake-ext" % lang.upper()
assert word_frequency("2", new_lang_code) > 0
def test_minimums():
assert word_frequency('esquivalience', 'en') == 0
assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
assert word_frequency('the', 'en', minimum=1) == 1
assert word_frequency("esquivalience", "en") == 0
assert word_frequency("esquivalience", "en", minimum=1e-6) == 1e-6
assert word_frequency("the", "en", minimum=1) == 1
def test_most_common_words():
@@ -59,61 +65,61 @@ def test_most_common_words():
"""
return top_n_list(lang, 1)[0]
assert get_most_common('ar') == 'في'
assert get_most_common('bg') == 'на'
assert get_most_common('bn') == 'না'
assert get_most_common('ca') == 'de'
assert get_most_common('cs') == 'a'
assert get_most_common('da') == 'i'
assert get_most_common('el') == 'και'
assert get_most_common('de') == 'die'
assert get_most_common('en') == 'the'
assert get_most_common('es') == 'de'
assert get_most_common('fi') == 'ja'
assert get_most_common('fil') == 'sa'
assert get_most_common('fr') == 'de'
assert get_most_common('he') == 'את'
assert get_most_common('hi') == 'के'
assert get_most_common('hu') == 'a'
assert get_most_common('id') == 'yang'
assert get_most_common('is') == 'og'
assert get_most_common('it') == 'di'
assert get_most_common('ja') == ''
assert get_most_common('ko') == ''
assert get_most_common('lt') == 'ir'
assert get_most_common('lv') == 'un'
assert get_most_common('mk') == 'на'
assert get_most_common('ms') == 'yang'
assert get_most_common('nb') == 'i'
assert get_most_common('nl') == 'de'
assert get_most_common('pl') == 'w'
assert get_most_common('pt') == 'de'
assert get_most_common('ro') == 'de'
assert get_most_common('ru') == 'в'
assert get_most_common('sh') == 'je'
assert get_most_common('sk') == 'a'
assert get_most_common('sl') == 'je'
assert get_most_common('sv') == 'är'
assert get_most_common('ta') == 'ஒரு'
assert get_most_common('tr') == 've'
assert get_most_common('uk') == 'в'
assert get_most_common('ur') == 'کے'
assert get_most_common('vi') == ''
assert get_most_common('zh') == ''
assert get_most_common("ar") == "في"
assert get_most_common("bg") == "на"
assert get_most_common("bn") == "না"
assert get_most_common("ca") == "de"
assert get_most_common("cs") == "a"
assert get_most_common("da") == "i"
assert get_most_common("el") == "και"
assert get_most_common("de") == "die"
assert get_most_common("en") == "the"
assert get_most_common("es") == "de"
assert get_most_common("fi") == "ja"
assert get_most_common("fil") == "sa"
assert get_most_common("fr") == "de"
assert get_most_common("he") == "את"
assert get_most_common("hi") == "के"
assert get_most_common("hu") == "a"
assert get_most_common("id") == "yang"
assert get_most_common("is") == "og"
assert get_most_common("it") == "di"
assert get_most_common("ja") == ""
assert get_most_common("ko") == ""
assert get_most_common("lt") == "ir"
assert get_most_common("lv") == "un"
assert get_most_common("mk") == "на"
assert get_most_common("ms") == "yang"
assert get_most_common("nb") == "i"
assert get_most_common("nl") == "de"
assert get_most_common("pl") == "w"
assert get_most_common("pt") == "de"
assert get_most_common("ro") == "de"
assert get_most_common("ru") == "в"
assert get_most_common("sh") == "je"
assert get_most_common("sk") == "a"
assert get_most_common("sl") == "je"
assert get_most_common("sv") == "är"
assert get_most_common("ta") == "ஒரு"
assert get_most_common("tr") == "ve"
assert get_most_common("uk") == "в"
assert get_most_common("ur") == "کے"
assert get_most_common("vi") == ""
assert get_most_common("zh") == ""
def test_language_matching():
freq = word_frequency('', 'zh')
assert word_frequency('', 'zh-TW') == freq
assert word_frequency('', 'zh-CN') == freq
assert word_frequency('', 'zh-Hant') == freq
assert word_frequency('', 'zh-Hans') == freq
assert word_frequency('', 'yue-HK') == freq
assert word_frequency('', 'cmn') == freq
freq = word_frequency("", "zh")
assert word_frequency("", "zh-TW") == freq
assert word_frequency("", "zh-CN") == freq
assert word_frequency("", "zh-Hant") == freq
assert word_frequency("", "zh-Hans") == freq
assert word_frequency("", "yue-CN") == freq
assert word_frequency("", "cmn") == freq
def test_cB_conversion():
assert cB_to_freq(0) == 1.
assert cB_to_freq(0) == 1.0
assert cB_to_freq(-100) == pytest.approx(0.1)
assert cB_to_freq(-600) == pytest.approx(1e-6)
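The asserted values all follow from treating centibels as hundredths of a base-10 logarithmic unit, so the conversion is freq = 10 ** (cB / 100). A re-derivation for illustration under that assumption; cb_to_freq_sketch is a hypothetical stand-in, not wordfreq's cB_to_freq (which, judging by the test_failed_cB_conversion mentioned in the next hunk, also rejects invalid input):

import math

def cb_to_freq_sketch(cB: float) -> float:
    # hypothetical helper mirroring the asserted behaviour
    return 10 ** (cB / 100)

assert cb_to_freq_sketch(0) == 1.0
assert math.isclose(cb_to_freq_sketch(-100), 0.1)
assert math.isclose(cb_to_freq_sketch(-600), 1e-6)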
@@ -126,101 +132,125 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
assert (
tokenize("I don't split at apostrophes, you see.", 'en')
== ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
)
assert tokenize("I don't split at apostrophes, you see.", "en") == [
"i",
"don't",
"split",
"at",
"apostrophes",
"you",
"see",
]
assert (
tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
== ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
)
assert tokenize(
"I don't split at apostrophes, you see.", "en", include_punctuation=True
) == ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."]
# Certain punctuation does not inherently split a word.
assert (
tokenize("Anything is possible at zombo.com", 'en')
== ['anything', 'is', 'possible', 'at', 'zombo.com']
)
assert tokenize("Anything is possible at zombo.com", "en") == [
"anything",
"is",
"possible",
"at",
"zombo.com",
]
# Splits occur after symbols, and at splitting punctuation such as hyphens.
assert tokenize('😂test', 'en') == ['😂', 'test']
assert tokenize("flip-flop", 'en') == ['flip', 'flop']
assert (
tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
== ['this', 'text', 'has', '...', 'punctuation', ':)']
)
assert tokenize("😂test", "en") == ["😂", "test"]
assert tokenize("flip-flop", "en") == ["flip", "flop"]
assert tokenize(
"this text has... punctuation :)", "en", include_punctuation=True
) == ["this", "text", "has", "...", "punctuation", ":)"]
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date
assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
assert (
tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
== ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀']
)
assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"]
assert tokenize(
"👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
) == [
"👨‍🎤",
"planet",
"earth",
"is",
"blue",
"and",
"there's",
"nothing",
"i",
"can",
"do",
"🌎",
"🚀",
]
# Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0)
assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en") == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
def test_casefolding():
assert tokenize('WEISS', 'de') == ['weiss']
assert tokenize('weiß', 'de') == ['weiss']
assert tokenize('İstanbul', 'tr') == ['istanbul']
assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
assert tokenize("WEISS", "de") == ["weiss"]
assert tokenize("weiß", "de") == ["weiss"]
assert tokenize("İstanbul", "tr") == ["istanbul"]
assert tokenize("SIKISINCA", "tr") == ["sıkısınca"]
def test_number_smashing():
assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
assert (
lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
== ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
)
assert lossy_tokenize('1', 'en') == ['1']
assert lossy_tokenize('3.14', 'en') == ['0.00']
assert lossy_tokenize('24601', 'en') == ['00000']
assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_normalization():
assert tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
"715",
"crσσks",
"by",
"bon",
"iver",
]
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
"715",
"crσσks",
"by",
"bon",
"iver",
]
def test_uncurl_quotes():
assert lossy_tokenize("lets", 'en') == ["let's"]
assert word_frequency("lets", 'en') == word_frequency("let's", 'en')
assert lossy_tokenize("lets", "en") == ["let's"]
assert word_frequency("lets", "en") == word_frequency("let's", "en")
def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
ff = word_frequency("flip-flop", "en")
assert ff > 0
phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
phrase_freq = 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency(
"flop", "en"
)
assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
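To make the rule these assertions check concrete: the reciprocal of a hyphenated phrase's frequency is the sum of the reciprocals of its parts. A small sketch with invented frequencies (not values from any wordlist):

# Illustration only: made-up component frequencies
flip, flop = 1e-5, 2e-5
phrase = 1.0 / (1.0 / flip + 1.0 / flop)   # about 6.7e-06
# The combined frequency is always lower than either component's frequency.
assert phrase < min(flip, flop)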
def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't
# use it
assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
assert random_words(nwords=4, lang="en", bits_per_word=0) == "the the the the"
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
assert random_ascii_words(nwords=4, lang="ja", bits_per_word=0) == "1 1 1 1"
def test_not_enough_ascii():
with pytest.raises(ValueError):
random_ascii_words(lang='zh', bits_per_word=16)
random_ascii_words(lang="zh", bits_per_word=16)
def test_arabic():
# Remove tatweels
assert tokenize('متــــــــعب', 'ar') == ['متعب']
assert tokenize("متــــــــعب", "ar") == ["متعب"]
# Remove combining marks
assert tokenize('حَرَكَات', 'ar') == ['حركات']
assert tokenize("حَرَكَات", "ar") == ["حركات"]
# An Arabic ligature that is affected by NFKC normalization
assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
assert tokenize("\ufefb", "ar") == ["\u0644\u0627"]
def test_ideographic_fallback():
@@ -228,28 +258,33 @@ def test_ideographic_fallback():
#
# More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py.
assert tokenize('中国文字', 'en') == ['中国文字']
assert tokenize("中国文字", "en") == ["中国文字"]
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
assert tokenize("การเล่นดนตรี", "th") == ["การเล่นดนตรี"]
assert tokenize('"การเล่นดนตรี" means "playing music"', "en") == [
"การเล่นดนตรี",
"means",
"playing",
"music",
]
# Test Khmer, a script similar to Thai
assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
assert tokenize("សូមស្វាគមន៍", "km") == ["សូមស្វាគមន៍"]
# Test Hindi -- tokens split where there are spaces, and not where there aren't
assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
assert tokenize("हिन्दी विक्षनरी", "hi") == ["हिन्दी", "विक्षनरी"]
# Remove vowel points in Hebrew
assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
assert tokenize("דֻּגְמָה", "he") == ["דגמה"]
# Deal with commas, cedillas, and I's in Turkish
assert tokenize('kișinin', 'tr') == ['kişinin']
assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
assert tokenize("kișinin", "tr") == ["kişinin"]
assert tokenize("KİȘİNİN", "tr") == ["kişinin"]
# Deal with cedillas that should be commas-below in Romanian
assert tokenize('acelaşi', 'ro') == ['același']
assert tokenize('ACELAŞI', 'ro') == ['același']
assert tokenize("acelaşi", "ro") == ["același"]
assert tokenize("ACELAŞI", "ro") == ["același"]


@@ -3,7 +3,7 @@ import pytest
def test_tokens():
assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
assert tokenize("おはようございます", "ja") == ["おはよう", "ござい", "ます"]
def test_simple_tokenize():
@@ -17,13 +17,12 @@ def test_simple_tokenize():
#
# We used to try to infer word boundaries between hiragana and katakana,
# but this leads to edge cases that are unsolvable without a dictionary.
ja_text = 'ひらがなカタカナromaji'
assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
ja_text = "ひらがなカタカナromaji"
assert simple_tokenize(ja_text) == ["ひらがなカタカナ", "romaji"]
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
# but sticks together in simple_tokenize
assert simple_tokenize('おはようございます') == ['おはようございます']
assert simple_tokenize("おはようございます") == ["おはようございます"]
# Names that use the weird possessive marker ヶ, which is technically a
# katakana even though it's being used like a kanji, stay together as one
@@ -43,17 +42,13 @@ def test_simple_tokenize():
assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
def test_combination():
ohayou_freq = word_frequency('おはよう', 'ja')
gozai_freq = word_frequency('ござい', 'ja')
masu_freq = word_frequency('ます', 'ja')
ohayou_freq = word_frequency("おはよう", "ja")
gozai_freq = word_frequency("ござい", "ja")
masu_freq = word_frequency("ます", "ja")
assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
assert word_frequency("おはようおはよう", "ja") == pytest.approx(ohayou_freq / 2, rel=0.01)
assert (
1.0 / word_frequency('おはようございます', 'ja') ==
pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
assert 1.0 / word_frequency("おはようございます", "ja") == pytest.approx(
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01
)


@@ -3,16 +3,14 @@ import pytest
def test_tokens():
assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
assert tokenize("감사합니다", "ko") == ["감사", "합니다"]
def test_combination():
gamsa_freq = word_frequency('감사', 'ko')
habnida_freq = word_frequency('합니다', 'ko')
gamsa_freq = word_frequency("감사", "ko")
habnida_freq = word_frequency("합니다", "ko")
assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
assert (
1.0 / word_frequency('감사합니다', 'ko') ==
pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
assert word_frequency("감사감사", "ko") == pytest.approx(gamsa_freq / 2, rel=0.01)
assert 1.0 / word_frequency("감사합니다", "ko") == pytest.approx(
1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01
)

58
tests/test_numbers.py Normal file

@@ -0,0 +1,58 @@
from wordfreq import word_frequency
from wordfreq.numbers import digit_freq, smash_numbers
from pytest import approx
def test_number_smashing():
assert smash_numbers("1") == "1"
assert smash_numbers("3.14") == "0.00"
assert smash_numbers("24601") == "00000"
def test_decimals():
assert word_frequency("3.14", "el") > word_frequency("4.14", "el")
assert word_frequency("3.14", "el") == word_frequency("3.15", "el")
assert word_frequency("3,14", "de") > word_frequency("4,14", "de")
assert word_frequency("3,14", "de") == word_frequency("3,15", "de")
def test_year_distribution():
assert word_frequency("2010", "en") > word_frequency("1010", "en")
assert word_frequency("2010", "en") > word_frequency("3010", "en")
def test_boundaries():
assert word_frequency("9", "en") > word_frequency("10", "en")
assert word_frequency("99", "en") > word_frequency("100", "en")
assert word_frequency("999", "en") > word_frequency("1000", "en")
assert word_frequency("9999", "en") > word_frequency("10000", "en")
def test_multiple_words():
once = word_frequency("2015b", "en")
twice = word_frequency("2015b 2015b", "en")
assert once == approx(2 * twice)
def test_distribution():
assert word_frequency("24601", "en") > word_frequency("90210", "en")
assert word_frequency("7", "en") > word_frequency("007", "en")
assert word_frequency("404", "en") == word_frequency("418", "en")
def test_3digit_sum():
"""
Test that the probability distribution given you have a 3-digit sequence
adds up to approximately 1.
"""
three_digit_sum = sum(digit_freq(f"{num:03d}") for num in range(0, 1000))
assert three_digit_sum == approx(1.0)
def test_4digit_sum():
"""
Test that the probability distribution given you have a 4-digit sequence
adds up to approximately 1.
"""
four_digit_sum = sum(digit_freq(f"{num:04d}") for num in range(0, 10000))
assert 0.999 < four_digit_sum < 1.0
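For anyone who wants to poke at the new number model directly, here is a minimal interactive sketch; it assumes only the wordfreq.numbers module introduced in this commit:

from wordfreq.numbers import digit_freq

print(digit_freq("2010"))   # four digits near the year plateau: relatively likely
print(digit_freq("3010"))   # same length but not a plausible year: less likely
print(sum(digit_freq(f"{n:03d}") for n in range(1000)))   # should be close to 1.0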


@@ -5,14 +5,26 @@ from wordfreq.preprocess import preprocess_text
def test_transliteration():
# "Well, there's a lot of things you do not understand."
# (from somewhere in OpenSubtitles)
assert (
tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
assert (
tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
)
assert tokenize("Па, има ту много ствари које не схваташ.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]
assert tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]
# I don't have examples of complete sentences in Azerbaijani that are
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@@ -20,14 +32,14 @@ def test_transliteration():
# So here are some individual words.
# 'library' in Azerbaijani Cyrillic
assert preprocess_text('китабхана', 'az') == 'kitabxana'
assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
assert preprocess_text("китабхана", "az") == "kitabxana"
assert preprocess_text("КИТАБХАНА", "az") == "kitabxana"
assert preprocess_text("KİTABXANA", "az") == "kitabxana"
# 'scream' in Azerbaijani Cyrillic
assert preprocess_text('бағырты', 'az') == 'bağırtı'
assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
assert preprocess_text("бағырты", "az") == "bağırtı"
assert preprocess_text("БАҒЫРТЫ", "az") == "bağırtı"
assert preprocess_text("BAĞIRTI", "az") == "bağırtı"
def test_actually_russian():
@@ -38,13 +50,12 @@ def test_actually_russian():
# We make sure to handle this case so we don't end up with a mixed-script
# word like "pacanы".
assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
assert tokenize("культуры", 'sr') == ["kul'tury"]
assert tokenize("сто из ста, пацаны!", "sr") == ["sto", "iz", "sta", "pacany"]
assert tokenize("культуры", "sr") == ["kul'tury"]
def test_alternate_codes():
# Try language codes for Serbo-Croatian that have been split, and now
# are canonically mapped to Serbian
assert tokenize("культуры", 'sh') == ["kul'tury"]
assert tokenize("культуры", 'hbs') == ["kul'tury"]
assert tokenize("культуры", "sh") == ["kul'tury"]
assert tokenize("культуры", "hbs") == ["kul'tury"]


@@ -13,7 +13,7 @@ import warnings
from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info
from .numbers import digit_freq
from .numbers import digit_freq, has_digit_sequence, smash_numbers
logger = logging.getLogger(__name__)
@@ -234,7 +234,7 @@ _wf_cache: Dict[Tuple[str, str, str, float], float] = {}
def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> float:
tokens = lossy_tokenize(word, lang)
dfreq = digit_freq(word)
if not tokens:
return minimum
@@ -245,13 +245,20 @@ def _word_frequency(word: str, lang: str, wordlist: str, minimum: float) -> floa
freqs = get_frequency_dict(lang, wordlist)
one_over_result = 0.0
for token in tokens:
if token not in freqs:
smashed = smash_numbers(token)
if smashed not in freqs:
# If any word is missing, just return the default value
return minimum
# spread the frequency of digits over all digit combinations
freq = freqs[token]
freq = freqs[smashed]
if smashed != token:
# If there is a digit sequence in the token, the digits are
# internally replaced by 0s to aggregate their probabilities
# together. We then assign a specific frequency to the digit
# sequence using the `digit_freq` distribution.
freq *= digit_freq(token)
one_over_result += 1.0 / freq
# Combine the frequencies of tokens we looked up.
freq = 1.0 / one_over_result
if get_language_info(lang)["tokenizer"] == "jieba":
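Stepping back from the hunk above: for a token that contains digits, the lookup is, roughly, the frequency of the smashed form times the share that digit_freq assigns to those specific digits. A sketch of that arithmetic (the wordlist frequency here is an invented stand-in for freqs[smashed]):

from wordfreq.numbers import smash_numbers, digit_freq

token = "24601"
smashed = smash_numbers(token)          # "00000"
wordlist_freq = 1e-4                    # invented; stands in for freqs[smashed]
estimate = wordlist_freq * digit_freq(token)
# "90210" smashes to the same "00000" but receives a smaller share, because a
# leading 9 is rarer than a leading 2 under the Benford-style DIGIT_FREQS table.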
@@ -334,10 +341,15 @@ def top_n_list(
Return a frequency list of length `n` in descending order of frequency.
This list contains words from `wordlist`, of the given language.
If `ascii_only`, then only ascii words are considered.
The frequency list will not contain multi-digit sequences, because we
estimate the frequencies of those using the functions in `numbers.py`,
not using a wordlist that contains all of them.
"""
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= "~":
if not has_digit_sequence(word):
results.append(word)
if len(results) >= n:
break
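A usage sketch of the documented behavior (top_n_list is the existing public function; the language and count are arbitrary):

from wordfreq import top_n_list

words = top_n_list("en", 20)
# Multi-digit tokens such as "2015" or "100" should not appear here, even though
# they are common in the source corpora; their frequencies come from numbers.py.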


@@ -1,4 +1,4 @@
from .preprocess import MULTI_DIGIT_RE
import regex
# Frequencies of leading digits, according to Benford's law, sort of.
# Benford's law doesn't describe numbers with leading zeroes, because "007"
@@ -11,23 +11,37 @@ DIGIT_FREQS = [0.009, 0.300, 0.175, 0.124, 0.096, 0.078, 0.066, 0.057, 0.050, 0.
#
# We do this with a piecewise exponential function whose peak is a plateau covering
# the years 2019 to 2039.
#
# YEAR_LOG_PEAK is chosen by experimentation to make this probability add up to about
# .994. Here, that represents P(token represents a year | token is 4 digits).
# The other .006 represents P(token does not represent a year | token is 4 digits).
YEAR_LOG_PEAK = -1.875
NOT_YEAR_PROB = 0.006
# Determined by experimentation: makes the probabilities of all years add up to 90%.
# The other 10% goes to NOT_YEAR_PROB. tests/test_numbers.py confirms that this
# probability distribution adds up to 1.
YEAR_LOG_PEAK = -1.9185
NOT_YEAR_PROB = 0.1
REFERENCE_YEAR = 2019
PLATEAU_WIDTH = 20
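A rough reading of these constants, assuming the plateau height is 10 ** YEAR_LOG_PEAK (the decay itself is defined in year_freq below and not reproduced here):

# 10 ** YEAR_LOG_PEAK == 10 ** -1.9185, which is about 0.012, so each year in the
# 2019-2039 plateau would account for roughly 1.2% of 4-digit tokens. The rest of
# the 90% tapers off for years outside the plateau, and NOT_YEAR_PROB = 0.1 covers
# 4-digit sequences that are not years at all.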
DIGIT_RE = regex.compile(r"\d")
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
PURE_DIGIT_RE = regex.compile(r"\d+")
def benford_freq(text: str) -> float:
"""
Estimate the frequency of a digit sequence according to Benford's law.
"""
first_digit = int(text[0])
return DIGIT_FREQS[first_digit] / 10 ** (len(text) - 1)
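A worked example of this estimate, using the DIGIT_FREQS values at the top of the file:

# benford_freq("24601"): first digit 2, five digits in total
# DIGIT_FREQS[2] / 10 ** (5 - 1) == 0.175 / 10000 == 1.75e-05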
def year_freq(text: str) -> float:
"""
Estimate the relative frequency of a particular 4-digit sequence representing
a year.
For example, suppose text == "1985". We're estimating the probability that a
randomly-selected token from a large corpus will be "1985" and refer to the
year, _given_ that it is 4 digits. Tokens that are not 4 digits are not involved
in the probability distribution.
"""
year = int(text)
# Fitting a line to the curve seen at
@@ -60,13 +74,38 @@ def year_freq(text: str) -> float:
def digit_freq(text: str) -> float:
"""
Get the relative frequency of a string of digits, using our estimates.
"""
freq = 1.0
for match in MULTI_DIGIT_RE.findall(text):
if len(match) == 4:
freq *= year_freq(match)
for submatch in PURE_DIGIT_RE.findall(match):
if len(submatch) == 4:
freq *= year_freq(submatch)
else:
freq *= benford_freq(match)
freq *= benford_freq(submatch)
return freq
print(sum(digit_freq("%04d" % year) for year in range(0, 10000)))
def has_digit_sequence(text: str) -> bool:
"""
Returns True iff the text has a digit sequence that will be normalized out
and handled with `digit_freq`.
"""
return bool(MULTI_DIGIT_RE.match(text))
def _sub_zeroes(match: regex.Match) -> str:
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub("0", match.group(0))
def smash_numbers(text: str) -> str:
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
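Example inputs and outputs; the first three match the assertions in tests/test_numbers.py above, and the last is a hypothetical extra case:

# smash_numbers("1")        == "1"        (a lone digit is left alone)
# smash_numbers("3.14")     == "0.00"
# smash_numbers("24601")    == "00000"
# smash_numbers("NCC-1701") == "NCC-0000"  (hypothetical, not from the tests)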


@@ -7,10 +7,6 @@ from langcodes import Language
MARK_RE = regex.compile(r"[\p{Mn}\N{ARABIC TATWEEL}]", regex.V1)
DIGIT_RE = regex.compile(r"\d")
MULTI_DIGIT_RE = regex.compile(r"\d[\d.,]+")
def preprocess_text(text: str, language: Language) -> str:
"""
This function applies pre-processing steps that convert forms of words
@@ -251,19 +247,3 @@ def cedillas_to_commas(text: str) -> str:
"\N{LATIN SMALL LETTER T WITH CEDILLA}",
"\N{LATIN SMALL LETTER T WITH COMMA BELOW}",
)
def _sub_zeroes(match: regex.Match) -> str:
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
"""
return DIGIT_RE.sub("0", match.group(0))
def smash_numbers(text: str) -> str:
"""
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
return MULTI_DIGIT_RE.sub(_sub_zeroes, text)


@@ -10,7 +10,7 @@ from .language_info import (
SPACELESS_SCRIPTS,
EXTRA_JAPANESE_CHARACTERS,
)
from .preprocess import preprocess_text, smash_numbers
from .preprocess import preprocess_text
# Placeholders for CJK functions that we'll import on demand
_mecab_tokenize = None
@@ -309,13 +309,6 @@ def lossy_tokenize(
In particular:
- Any sequence of 2 or more adjacent digits, possibly with intervening
punctuation such as a decimal point, will replace each digit with '0'
so that frequencies for numbers don't have to be counted separately.
This is similar to but not quite identical to the word2vec Google News
data, which replaces digits with '#' in tokens with more than one digit.
- In Chinese, unless Traditional Chinese is specifically requested using
'zh-Hant', all characters will be converted to Simplified Chinese.
@@ -334,4 +327,4 @@ def lossy_tokenize(
tokens = [_simplify_chinese(token) for token in tokens]
return [uncurl_quotes(smash_numbers(token)) for token in tokens]
return [uncurl_quotes(token) for token in tokens]
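The net effect of this last hunk, read together with the tests earlier in the diff: digit smashing has moved out of tokenization and into the frequency lookup. A sketch of the resulting behavior:

from wordfreq import lossy_tokenize, word_frequency

lossy_tokenize("24601", "en")    # now ["24601"]: digits survive tokenization
word_frequency("24601", "en")    # internally smashes to "00000", then scales by digit_freq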