Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
Tokenization in Korean, plus abjad languages (#38)
* Remove marks from more languages
* Add Korean tokenization, and include MeCab files in data
* add a Hebrew tokenization test
* fix terminology in docstrings about abjad scripts
* combine Japanese and Korean tokenization into the same function
Former-commit-id: fec6eddcc3
This commit is contained in:
parent ac24b8eab4
commit a0893af82e
@@ -1,3 +1,8 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt
recursive-include wordfreq/data *.bin
recursive-include wordfreq/data *.def
recursive-include wordfreq/data *.dic
recursive-include wordfreq/data dicrc
recursive-include wordfreq/data COPYING
@@ -276,7 +276,11 @@ The terms of use of this data are:
acknowledgement of Google Books Ngram Viewer as the source, and inclusion
of a link to http://books.google.com/ngrams, would be appreciated.

It also contains data derived from the following Creative Commons-licensed
`wordfreq` uses MeCab, by Taku Kudo, plus Korean data files by Yongwoon Lee and
Yungho Yu. The Korean data is under an Apache 2 license, a copy of which
appears in wordfreq/data/mecab-ko-dic/COPYING.

`wordfreq` also contains data derived from the following Creative Commons-licensed
sources:

- The Leeds Internet Corpus, from the University of Leeds Centre for Translation
10 setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):

setup(
    name="wordfreq",
    version='1.4.1',
    version='1.4.2',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',
@@ -47,10 +47,10 @@ setup(
    include_package_data=True,
    install_requires=dependencies,

    # mecab-python3 is required for looking up Japanese word frequencies. In
    # turn, it depends on libmecab-dev being installed on the system. It's not
    # listed under 'install_requires' because wordfreq should be usable in
    # other languages without it.
    # mecab-python3 is required for looking up Japanese or Korean word
    # frequencies. In turn, it depends on libmecab-dev being installed on the
    # system. It's not listed under 'install_requires' because wordfreq should
    # be usable in other languages without it.
    #
    # Similarly, jieba is required for Chinese word frequencies.
    extras_require={
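The hunk is cut off just as the extras_require mapping begins. Presumably it maps optional features to the packages named in the comment; the key names in this sketch are an assumption, not something shown in the diff:

    # Hypothetical continuation of the setup() call above; the extra names
    # 'mecab' and 'jieba' are assumptions for illustration.
    extras_require={
        'mecab': 'mecab-python3',   # Japanese and Korean tokenization
        'jieba': 'jieba'            # Chinese tokenization
    },

With keys like these, a user who wants CJK support would install, for example, pip install wordfreq[mecab].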
@@ -152,7 +152,7 @@ def test_not_enough_ascii():
    random_ascii_words(lang='zh')


def test_ar():
def test_arabic():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
@@ -183,6 +183,7 @@ def test_ideographic_fallback():
        ['ひらがな', 'カタカナ', 'romaji']
    )

def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
@@ -194,3 +195,7 @@ def test_ideographic_fallback():

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])

    # Remove vowel points in Hebrew
    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
22 tests/test_korean.py Normal file
@@ -0,0 +1,22 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency


def test_tokens():
    eq_(tokenize('감사합니다', 'ko'),
        ['감사', '합니다'])


def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert_almost_equal(
        word_frequency('감사감사', 'ko'),
        gamsa_freq / 2
    )
    assert_almost_equal(
        1.0 / word_frequency('감사합니다', 'ko'),
        1.0 / gamsa_freq + 1.0 / habnida_freq
    )
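The second test pins down how wordfreq combines the frequencies of the tokens in a multi-token phrase: the combined frequency is the reciprocal of the sum of the tokens' reciprocal frequencies, so a token repeated twice combines to half its own frequency. A minimal sketch of that arithmetic (illustrative only, not the function wordfreq uses internally):

    def combined_frequency(freqs):
        # Reciprocal of the sum of reciprocals, as the assertions above expect.
        return 1.0 / sum(1.0 / f for f in freqs)

    combined_frequency([1e-4, 1e-4])   # 5e-05: half the single-token frequency
    combined_frequency([1e-4, 1e-5])   # about 9.1e-06: dominated by the rarer token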
BIN wordfreq/data/mecab-ja-ipadic/char.bin Normal file
Binary file not shown.
29 wordfreq/data/mecab-ja-ipadic/dicrc Normal file
@@ -0,0 +1,29 @@
;
; Configuration file of IPADIC
;
; $Id: dicrc,v 1.4 2006/04/08 06:41:36 taku-ku Exp $;
;
cost-factor = 800
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
eval-size = 8
unk-eval-size = 4
config-charset = UTF-8

; yomi
node-format-yomi = %pS%f[7]
unk-format-yomi = %M
eos-format-yomi = \n

; simple
node-format-simple = %m\t%F-[0,1,2,3]\n
eos-format-simple = EOS\n

; ChaSen
node-format-chasen = %m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen = %m\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen = EOS\n

; ChaSen (include spaces)
node-format-chasen2 = %M\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen2 = %M\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen2 = EOS\n
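The *-format-* entries above define named output formats that MeCab selects with its -O option. A small sketch of exercising them through mecab-python3 (the dictionary path here is an assumption about where the bundled data ends up relative to the working directory):

    import MeCab

    # -d points at the bundled dictionary; -Ochasen selects the "chasen"
    # format defined by the node-format-chasen lines above.
    tagger = MeCab.Tagger('-d wordfreq/data/mecab-ja-ipadic -Ochasen')
    print(tagger.parse('カタカナ'))   # one tab-separated line per token, then EOS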
BIN wordfreq/data/mecab-ja-ipadic/matrix.bin Normal file
Binary file not shown.
1 wordfreq/data/mecab-ja-ipadic/sys.dic.REMOVED.git-id Normal file
@@ -0,0 +1 @@
c926154d533ccaef1515af6883056d69c34ca239
BIN wordfreq/data/mecab-ja-ipadic/unk.dic Normal file
Binary file not shown.
201 wordfreq/data/mecab-ko-dic/COPYING Normal file
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

(The remaining 198 lines of this file are the standard, unmodified text of the Apache License, Version 2.0.)
BIN wordfreq/data/mecab-ko-dic/char.bin Normal file
Binary file not shown.
25 wordfreq/data/mecab-ko-dic/dicrc Normal file
@@ -0,0 +1,25 @@
;
; Configuration file of mecab-ko-dic
;

# Scaling factor applied when converting to cost values. Anything from 700 to 800 works without problems.
cost-factor = 800
# Feature string for the beginning and end of a sentence, expressed as CSV.
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
# For known words, specifies how many leading features must match to count
# as correct. Known words generally only need part-of-speech and conjugation
# information to match, so the "reading" and "pronunciation" features are ignored.
# Here, three of them are evaluated.
eval-size = 4
# For unknown words, specifies how many leading features
# must match to count as correct.
unk-eval-size = 2
# Character set of the dicrc, char.def, unk.def, and pos-id.def files.
config-charset = UTF-8
# Setting that raises the connection cost of parts of speech preceded by a space.
# Used only by mecab-ko. It has the following format:
# <posid 1>,<posid 1 penalty cost>,<posid 2>,<posid 2 penalty cost>...
#
# e.g. 120,6000 => when a part of speech with posid 120 (a particle) has a space
#      on its left, raise its connection cost by 6000
left-space-penalty-factor = 100,3000,120,6000,172,3000,183,3000,184,3000,185,3000,200,3000,210,6000,220,3000,221,3000,222,3000,230,3000
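For what it's worth, the flat left-space-penalty-factor list pairs up exactly as the comment describes; a tiny sketch of reading it (this is just an illustration of the format, not code from mecab-ko):

    raw = '100,3000,120,6000,172,3000,183,3000'
    values = [int(x) for x in raw.split(',')]
    pairs = list(zip(values[0::2], values[1::2]))
    # [(100, 3000), (120, 6000), (172, 3000), (183, 3000)] -- (posid, penalty) pairs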
2691 wordfreq/data/mecab-ko-dic/left-id.def Normal file
File diff suppressed because it is too large
1 wordfreq/data/mecab-ko-dic/matrix.bin.REMOVED.git-id Normal file
@@ -0,0 +1 @@
2dbb57fe707d7dddd2392526aad7cbac77378bb3
1 wordfreq/data/mecab-ko-dic/model.bin.REMOVED.git-id Normal file
@@ -0,0 +1 @@
58619494b4f81190218b76d9d2090607830e51ec
66 wordfreq/data/mecab-ko-dic/pos-id.def Normal file
@@ -0,0 +1,66 @@
UNKNOWN,*,*,*,*,*,*,*,* 0
*,*,*,*,Compound,*,*,*,* 1
*,*,*,*,Inflect,EC,*,*,* 200
*,*,*,*,Inflect,EF,*,*,* 200
*,*,*,*,Inflect,EP,*,*,* 200
*,*,*,*,Inflect,ETM,*,*,* 200
*,*,*,*,Inflect,ETN,*,*,* 200
*,*,*,*,Inflect,JC,*,*,* 210
*,*,*,*,Inflect,JKB,*,*,* 210
*,*,*,*,Inflect,JKC,*,*,* 210
*,*,*,*,Inflect,JKG,*,*,* 210
*,*,*,*,Inflect,JKO,*,*,* 210
*,*,*,*,Inflect,JKQ,*,*,* 210
*,*,*,*,Inflect,JKS,*,*,* 210
*,*,*,*,Inflect,JKV,*,*,* 210
*,*,*,*,Inflect,JX,*,*,* 210
*,*,*,*,Inflect,XSA,*,*,* 220
*,*,*,*,Inflect,XSN,*,*,* 221
*,*,*,*,Inflect,XSV,*,*,* 222
*,*,*,*,Inflect,VCP,*,*,* 230
*,*,*,*,Inflect,*,*,*,* 2
*,*,*,*,Preanalysis,*,*,*,* 3
EC,*,*,*,*,*,*,*,* 100
EF,*,*,*,*,*,*,*,* 100
EP,*,*,*,*,*,*,*,* 100
ETM,*,*,*,*,*,*,*,* 100
ETN,*,*,*,*,*,*,*,* 100
IC,*,*,*,*,*,*,*,* 110
JC,*,*,*,*,*,*,*,* 120
JKB,*,*,*,*,*,*,*,* 120
JKC,*,*,*,*,*,*,*,* 120
JKG,*,*,*,*,*,*,*,* 120
JKO,*,*,*,*,*,*,*,* 120
JKQ,*,*,*,*,*,*,*,* 120
JKS,*,*,*,*,*,*,*,* 120
JKV,*,*,*,*,*,*,*,* 120
JX,*,*,*,*,*,*,*,* 120
MAG,*,*,*,*,*,*,*,* 130
MAJ,*,*,*,*,*,*,*,* 131
MM,*,*,*,*,*,*,*,* 140
NNG,*,*,*,*,*,*,*,* 150
NNP,*,*,*,*,*,*,*,* 150
NNB,*,*,*,*,*,*,*,* 150
NNBC,*,*,*,*,*,*,*,* 150
NP,*,*,*,*,*,*,*,* 150
NR,*,*,*,*,*,*,*,* 150
SF,*,*,*,*,*,*,*,* 160
SH,*,*,*,*,*,*,*,* 161
SL,*,*,*,*,*,*,*,* 162
SN,*,*,*,*,*,*,*,* 163
SP,*,*,*,*,*,*,*,* 164
SSC,*,*,*,*,*,*,*,* 165
SSO,*,*,*,*,*,*,*,* 166
SC,*,*,*,*,*,*,*,* 167
SY,*,*,*,*,*,*,*,* 168
SE,*,*,*,*,*,*,*,* 169
VA,*,*,*,*,*,*,*,* 170
VCN,*,*,*,*,*,*,*,* 171
VCP,*,*,*,*,*,*,*,* 172
VV,*,*,*,*,*,*,*,* 173
VX,*,*,*,*,*,*,*,* 174
XPN,*,*,*,*,*,*,*,* 181
XR,*,*,*,*,*,*,*,* 182
XSA,*,*,*,*,*,*,*,* 183
XSN,*,*,*,*,*,*,*,* 184
XSV,*,*,*,*,*,*,*,* 185
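Each line maps a pattern over the nine CSV feature fields to a numeric POS id, with the more specific Inflect rules listed before the generic ones. A rough sketch of the matching semantics as they appear to work (an assumption about first-match-wins behavior, not MeCab's actual implementation):

    def match_pos_id(feature, rules):
        # First matching pattern wins; '*' in a pattern field matches anything.
        values = feature.split(',')
        for pattern, pos_id in rules:
            if all(p == '*' or p == v for p, v in zip(pattern.split(','), values)):
                return pos_id
        return 0  # UNKNOWN

    rules = [('*,*,*,*,Inflect,EC,*,*,*', 200),
             ('*,*,*,*,Inflect,*,*,*,*', 2),
             ('NNG,*,*,*,*,*,*,*,*', 150)]
    match_pos_id('NNG,*,*,*,*,*,*,*,*', rules)          # -> 150
    match_pos_id('VV,*,*,*,Inflect,EC,*,*,*', rules)    # -> 200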
51 wordfreq/data/mecab-ko-dic/rewrite.def Normal file
@@ -0,0 +1,51 @@
# Feature(POS) to Internal State mapping
#
# Defines the mapping from feature (POS) strings to internal-state feature strings.
#
# The CRF computes its statistics from three kinds of information: unigrams,
# left-context bigrams, and right-context bigrams.
#
# Each section header is followed by one mapping rule per line:
#
# match-pattern    target
#
# The match pattern may use simple regular expressions:
#
# *          : matches any string
# (AB|CD|EF) : matches AB, CD, or EF
# AB         : matches only the exact string AB
#
# The target may use the macros $1, $2, $3, ... to refer to the contents
# of each element (CSV field) of the feature string.
#
# POS tag, final-consonant flag, surface form, type, first POS, last POS, base form
#
# Mapping for unigram internal states
[unigram rewrite]
*,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,$9

# Mapping for left-context bigram internal states
# By default, represent features up to the semantic class ($2).
# For particles and a few other parts of speech, represent POS, semantic class, and reading ($1,$2,*,$4).
# 하늘/NNG,T,*,*,*,*,* + 은/J,*,은,*,*,*,*
#
[left rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
*,*,*,*,Inflect,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,* $6,$2,*,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $6,$2,*,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,*,*,*,*,*,* $1,$2,*,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,*,*,*,*,*,*,*

# Mapping for right-context bigram internal states
# By default, represent features up to the final-consonant flag ($3).
# For particles and a few other parts of speech, represent POS, semantic class, final-consonant flag, and reading ($1,$2,$3,$4).
# ex) 하늘/NN,T,*,*,*,*,* + 은/J,T,은,*,*,*,*
[right rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SL,*,*,*,*,*,*,*,* NNG,$2,$3,*,*,*,*,*,*
*,*,*,*,Inflect,*,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,* $7,$2,$3,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $7,$2,$3,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,*,*,*,*,*,*,* $1,$2,$3,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,$3,*,*,*,*,*,*
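A minimal sketch of what the $N macros in these rewrite targets do, based only on the format described in the comments above (an illustration, not mecab-ko-dic's actual code):

    def apply_rewrite(feature_csv, target):
        # Replace each $N with the N-th field of the incoming feature string.
        fields = feature_csv.split(',')
        return ','.join(fields[int(part[1:]) - 1] if part.startswith('$') else part
                        for part in target.split(','))

    apply_rewrite('NNG,T,하늘,*,*,*,*,*,*', '$1,$2,*,*,*,*,*,*,*')
    # -> 'NNG,T,*,*,*,*,*,*,*'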
3818 wordfreq/data/mecab-ko-dic/right-id.def Normal file
File diff suppressed because it is too large
1 wordfreq/data/mecab-ko-dic/sys.dic.REMOVED.git-id Normal file
@@ -0,0 +1 @@
9655d23c3a0900764cbcdb8d8395d0f09ec098ed
BIN wordfreq/data/mecab-ko-dic/unk.dic Normal file
Binary file not shown.
@@ -1,21 +0,0 @@
import MeCab
import unicodedata


# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
# Tagger.
MECAB_ANALYZER = MeCab.Tagger()


def mecab_tokenize(text):
    """
    Use the mecab-python3 package to tokenize the given Japanese text.

    The simplest output from mecab-python3 is the single-string form, which
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
    text = unicodedata.normalize('NFKC', text.strip())
    return [line.split('\t')[0]
            for line in MECAB_ANALYZER.parse(text).split('\n')
            if line != '' and line != 'EOS']
28 wordfreq/mecab.py Normal file
@@ -0,0 +1,28 @@
from pkg_resources import resource_filename
import MeCab
import unicodedata


# Instantiate the MeCab analyzers for each language.
MECAB_ANALYZERS = {
    'ja': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ja-ipadic')),
    'ko': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ko-dic'))
}


def mecab_tokenize(text, lang):
    """
    Use the mecab-python3 package to tokenize the given text. The `lang`
    must be 'ja' for Japanese or 'ko' for Korean.

    The simplest output from mecab-python3 is the single-string form, which
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
    if lang not in MECAB_ANALYZERS:
        raise ValueError("Can't run MeCab on language %r" % lang)
    analyzer = MECAB_ANALYZERS[lang]
    text = unicodedata.normalize('NFKC', text.strip())
    return [line.split('\t')[0]
            for line in analyzer.parse(text).split('\n')
            if line != '' and line != 'EOS']
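A brief usage sketch, assuming mecab-python3 and the bundled dictionaries load correctly; the Korean result matches the new test above, while the Japanese segmentation is only indicative of what the IPADIC dictionary produces:

    from wordfreq.mecab import mecab_tokenize

    mecab_tokenize('감사합니다', 'ko')    # ['감사', '합니다']
    mecab_tokenize('カタカナです', 'ja')  # e.g. ['カタカナ', 'です']
    mecab_tokenize('hello', 'en')        # raises ValueError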
@@ -2,6 +2,9 @@ import regex
import unicodedata


mecab_tokenize = None
jieba_tokenize = None

# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
@@ -23,7 +26,6 @@ def _make_spaceless_expr():

SPACELESS_EXPR = _make_spaceless_expr()


TOKEN_RE = regex.compile(r"""
    # Case 1: a special case for non-spaced languages
    # -----------------------------------------------
@@ -74,7 +76,7 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    \S(?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


def simple_tokenize(text, include_punctuation=False):
@@ -98,6 +100,13 @@ def simple_tokenize(text, include_punctuation=False):
    tokens.

    - It breaks on all spaces, even the "non-breaking" ones.

    - It aims to keep marks together with words, so that they aren't erroneously
      split off as punctuation in languages such as Hindi.

    - It keeps Southeast Asian scripts, such as Thai, glued together. This yields
      tokens that are much too long, but the alternative is that every character
      would end up in its own token, which is worse.
    """
    text = unicodedata.normalize('NFC', text)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
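The Hindi and Thai behavior described in this docstring is exactly what the test file earlier in this commit asserts; as a quick illustration:

    from wordfreq import tokenize

    tokenize('हिन्दी विक्षनरी', 'hi')   # ['हिन्दी', 'विक्षनरी'] -- split on the space, marks kept with the word
    tokenize('การเล่นดนตรี', 'th')      # ['การเล่นดนตรี'] -- Thai stays glued together as one token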
@@ -114,20 +123,20 @@ def turkish_tokenize(text, include_punctuation=False):
    return [token.strip("'").casefold() for token in token_expr.findall(text)]


mecab_tokenize = None
def japanese_tokenize(text, include_punctuation=False):
def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese text, initializing the MeCab tokenizer if necessary.
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
    """
    global mecab_tokenize
    if lang not in {'ja', 'ko'}:
        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
    if mecab_tokenize is None:
        from wordfreq.japanese import mecab_tokenize
    tokens = mecab_tokenize(text)
        from wordfreq.mecab import mecab_tokenize
    tokens = mecab_tokenize(text, lang)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]


jieba_tokenize = None
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
@@ -140,16 +149,16 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    return [token.casefold() for token in tokens if token_expr.match(token)]


def remove_arabic_marks(text):
def remove_marks(text):
    """
    Remove decorations from Arabic words:
    Remove decorations from words in abjad scripts:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify a
      word.
    - Tatweels, horizontal segments that are used to extend or justify an
      Arabic word.
    """
    return ARABIC_MARK_RE.sub('', text)
    return MARK_RE.sub('', text)


def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
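To make the effect of MARK_RE concrete, here is a small self-contained sketch using the third-party regex module and the same pattern as above, applied to NFKC-normalized text the way tokenize() does for abjad languages; the Hebrew and Arabic examples come from the tests in this commit:

    import unicodedata
    import regex

    MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

    def strip_marks(text):
        # NFKC-normalize, then drop nonspacing marks and tatweels.
        return MARK_RE.sub('', unicodedata.normalize('NFKC', text))

    strip_marks('דֻּגְמָה')        # 'דגמה'  (Hebrew vowel points removed)
    strip_marks('متــــــــعب')   # 'متعب'  (tatweels removed)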
@@ -158,30 +167,68 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Here is what the tokenizer will do, depending on the language:
    Some of the processing steps are specific to one language, such as Chinese,
    but what broadly happens to the text depends on what general writing system
    the language uses, out of these categories:

    - Chinese will be mapped to Simplified Chinese characters and tokenized
      using the Jieba tokenizer, trained on a custom word list of words that
      can be looked up in wordfreq.
    - Alphabetic scripts: English, Spanish, Russian, etc.
    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
    - CJK scripts: Chinese, Japanese, Korean
    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

    - Japanese will be delegated to the external mecab-python module. It will
      be NFKC normalized, which is stronger than NFC normalization.

    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
    Alphabetic scripts
    ------------------

    - Arabic will be NFKC normalized, and will have Arabic-specific combining
      marks and tatweels removed.
    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
    European languages, which are relatively straightforward to tokenize.

    - Languages written in cased alphabets will be case-folded to lowercase.
    Text in these scripts will be normalized to NFC form, then passed
    through a regular expression that implements the Word Segmentation section
    of Unicode Annex #29, and then case-folded to lowercase.

    - Turkish will use a different case-folding procedure, so that capital
      I and İ map to ı and i respectively.
    The effect is mostly to split the text on spaces and punctuation. There are
    some subtleties involving apostrophes inside words, which the regex will
    only split when they occur before a vowel. ("Hasn't" is one token, but
    "l'enfant" is two.)

    - Languages besides Japanese and Chinese will be tokenized using a regex
      that mostly implements the Word Segmentation section of Unicode Annex
      #29. See `simple_tokenize` for details.
    If the language is Turkish, the case-folding rules will take this into
    account, so that capital I and İ map to ı and i respectively.


    Abjad scripts
    -------------

    Languages in the Arabic or Hebrew scripts are written with optional vowel
    marks, and sometimes other decorative markings and ligatures. In these
    languages:

    - The text will be NFKC-normalized, which is a stronger and lossier form
      than NFC. Here its purpose is to reduce ligatures to simpler characters.

    - Marks will be removed, as well as the Arabic tatweel (an extension of
      a word that is used for justification or decoration).

    After these steps, the text will go through the same process as the
    alphabetic scripts above.


    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling.

    - Chinese text first gets converted to a canonical representation we call
      "Oversimplified Chinese", where all characters are replaced by their
      Simplified Chinese form, no matter what, even when this misspells a word or
      a name. This representation is then tokenized using the Jieba tokenizer,
      trained on the list of Chinese words that can be looked up in wordfreq.

    - Japanese and Korean will be NFKC-normalized, then tokenized using the
      MeCab tokenizer, using dictionary files that are included in this
      package.

    The `external_wordlist` option only affects Chinese tokenization. If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
@@ -189,15 +236,36 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.


    Brahmic scripts and other languages
    -----------------------------------

    Any kind of language not previously mentioned will just go through the same
    tokenizer that alphabetic languages use.

    We've tweaked this tokenizer for the case of Indic languages in Brahmic
    scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
    languages where the default Unicode algorithm wouldn't quite work.

    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
    does not support these languages yet. It will split on spaces and
    punctuation, giving tokens that are far too long.
    """
    if lang == 'ja':
        return japanese_tokenize(text, include_punctuation)
    if lang == 'ja' or lang == 'ko':
        return tokenize_mecab_language(text, lang, include_punctuation)
    elif lang == 'zh':
        return chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        return turkish_tokenize(text, include_punctuation)
    elif lang == 'ar':
        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
    elif lang in {'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'}:
        # Abjad languages
        text = remove_marks(unicodedata.normalize('NFKC', text))
        return simple_tokenize(text, include_punctuation)
    else:
        return simple_tokenize(text, include_punctuation)
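Taken together, the dispatch above gives roughly this behavior; the Korean and Hebrew results mirror the tests in this commit, and the other outputs are indicative only:

    from wordfreq import tokenize

    tokenize('감사합니다', 'ko')     # ['감사', '합니다'] -- MeCab with the bundled Korean dictionary
    tokenize('カタカナ', 'ja')       # MeCab with the bundled Japanese dictionary
    tokenize('谢谢你', 'zh')         # Jieba, on the "Oversimplified Chinese" representation
    tokenize('דֻּגְמָה', 'he')        # ['דגמה'] -- NFKC + mark removal, then the regex tokenizer
    tokenize("Hasn't it?", 'en')    # ["hasn't", "it"] -- plain Unicode word segmentation, case-folded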