mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
2a41d4dc5e
This changes the version from 1.4.2 to 1.5. Things done in this update include:
* include Common Crawl; support 11 more languages
* new frequency-merging strategy
* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list
* Remove kinda bad sources, i.e. Greek Twitter (too often kaomoji are detected as Greek) and Ukrainian Common Crawl. This results in dropping Ukrainian as an available language, and causing Greek to not be a 'large' language after all.
* Add Korean tokenization, and include MeCab files in data
* Remove marks from more languages
* Deal with commas and cedillas in Turkish and Romanian
Former-commit-id: e6a8f028e3
62 lines
1.9 KiB
Python
Executable File
62 lines
1.9 KiB
Python
Executable File
#!/usr/bin/env python
|
|
from setuptools import setup
|
|
import sys
|
|
import os
|
|
|
|
if sys.version_info[0] < 3:
|
|
print("Sorry, but wordfreq no longer supports Python 2.")
|
|
sys.exit(1)
|
|
|
|
|
|
classifiers = [
|
|
'Intended Audience :: Developers',
|
|
'Intended Audience :: Science/Research',
|
|
'License :: OSI Approved :: MIT License',
|
|
'Natural Language :: English',
|
|
'Operating System :: MacOS',
|
|
'Operating System :: Microsoft :: Windows',
|
|
'Operating System :: POSIX',
|
|
'Operating System :: Unix',
|
|
'Programming Language :: Python :: 3',
|
|
'Topic :: Scientific/Engineering',
|
|
'Topic :: Software Development',
|
|
'Topic :: Text Processing :: Linguistic',
|
|
]
|
|
|
|
current_dir = os.path.dirname(__file__)
|
|
README_contents = open(os.path.join(current_dir, 'README.md'),
|
|
encoding='utf-8').read()
|
|
doclines = README_contents.split("\n")
|
|
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
|
|
if sys.version_info < (3, 4):
|
|
dependencies.append('pathlib')
|
|
|
|
|
|
setup(
|
|
name="wordfreq",
|
|
version='1.5',
|
|
maintainer='Luminoso Technologies, Inc.',
|
|
maintainer_email='info@luminoso.com',
|
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
|
license = "MIT",
|
|
platforms = ["any"],
|
|
description = doclines[0],
|
|
classifiers = classifiers,
|
|
long_description = "\n".join(doclines[2:]),
|
|
packages=['wordfreq'],
|
|
include_package_data=True,
|
|
install_requires=dependencies,
|
|
|
|
# mecab-python3 is required for looking up Japanese or Korean word
|
|
# frequencies. In turn, it depends on libmecab-dev being installed on the
|
|
# system. It's not listed under 'install_requires' because wordfreq should
|
|
# be usable in other languages without it.
|
|
#
|
|
# Similarly, jieba is required for Chinese word frequencies.
|
|
extras_require={
|
|
'mecab': 'mecab-python3',
|
|
'jieba': 'jieba'
|
|
},
|
|
tests_require=['mecab-python3', 'jieba'],
|
|
)
|