Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3
Authored by Rob Speer on 2016-07-15 15:10:25 -04:00, committed by Lance Nathan
parent ac24b8eab4
commit a0893af82e
24 changed files with 7058 additions and 62 deletions


@@ -1,3 +1,8 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt
recursive-include wordfreq/data *.bin
recursive-include wordfreq/data *.def
recursive-include wordfreq/data *.dic
recursive-include wordfreq/data dicrc
recursive-include wordfreq/data COPYING


@@ -276,7 +276,11 @@ The terms of use of this data are:
acknowledgement of Google Books Ngram Viewer as the source, and inclusion
of a link to http://books.google.com/ngrams, would be appreciated.
It also contains data derived from the following Creative Commons-licensed
`wordfreq` uses MeCab, by Taku Kudo, plus Korean data files by Yongwoon Lee and
Yungho Yu. The Korean data is under an Apache 2 license, a copy of which
appears in wordfreq/data/mecab-ko-dic/COPYING.
`wordfreq` also contains data derived from the following Creative Commons-licensed
sources:
- The Leeds Internet Corpus, from the University of Leeds Centre for Translation


@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.4.1',
version='1.4.2',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@@ -47,10 +47,10 @@ setup(
include_package_data=True,
install_requires=dependencies,
# mecab-python3 is required for looking up Japanese word frequencies. In
# turn, it depends on libmecab-dev being installed on the system. It's not
# listed under 'install_requires' because wordfreq should be usable in
# other languages without it.
# mecab-python3 is required for looking up Japanese or Korean word
# frequencies. In turn, it depends on libmecab-dev being installed on the
# system. It's not listed under 'install_requires' because wordfreq should
# be usable in other languages without it.
#
# Similarly, jieba is required for Chinese word frequencies.
extras_require={
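The extras_require block is cut off in this excerpt. As a rough illustration of the optional-dependency setup the comment above describes, here is a hypothetical sketch; the extra names 'mecab' and 'jieba' are assumptions for illustration, not read from this diff.

# Hypothetical sketch only: the extra names below are assumed, not taken from
# this diff. It illustrates how the optional CJK dependencies described in the
# comment above could be declared.
extras_require = {
    'mecab': 'mecab-python3',  # needed only for Japanese and Korean frequencies
    'jieba': 'jieba',          # needed only for Chinese frequencies
}

# With extras declared this way, a user who wants Korean support would run
# something like: pip install wordfreq[mecab]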


@@ -152,7 +152,7 @@ def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
def test_arabic():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
@@ -183,6 +183,7 @@ def test_ideographic_fallback():
['ひらがな', 'カタカナ', 'romaji']
)
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
@@ -194,3 +195,7 @@ def test_ideographic_fallback():
# Test Hindi -- tokens split where there are spaces, and not where there aren't
eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
# Remove vowel points in Hebrew
eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])

tests/test_korean.py (new file, 22 lines)

@@ -0,0 +1,22 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
def test_tokens():
eq_(tokenize('감사합니다', 'ko'),
['감사', '합니다'])
def test_combination():
gamsa_freq = word_frequency('감사', 'ko')
habnida_freq = word_frequency('합니다', 'ko')
assert_almost_equal(
word_frequency('감사감사', 'ko'),
gamsa_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('감사합니다', 'ko'),
1.0 / gamsa_freq + 1.0 / habnida_freq
)
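The test above encodes the rule wordfreq uses when it combines token frequencies into a phrase frequency: the reciprocal of the combined frequency is the sum of the reciprocals of the token frequencies, so repeating a token halves the frequency. A minimal standalone sketch of that arithmetic, using made-up frequency values:

# Standalone sketch of the combination rule asserted in test_combination.
# The frequency values are made up for illustration.

def combine_frequencies(freqs):
    # Reciprocals add: 1 / f_total = sum(1 / f_i)
    return 1.0 / sum(1.0 / f for f in freqs)

gamsa_freq = 2e-4    # hypothetical frequency of '감사'
habnida_freq = 6e-3  # hypothetical frequency of '합니다'

# '감사감사' is two copies of '감사', so its combined frequency is half of it.
assert abs(combine_frequencies([gamsa_freq, gamsa_freq]) - gamsa_freq / 2) < 1e-12

# '감사합니다' combines '감사' and '합니다'; the reciprocals add up.
combined = combine_frequencies([gamsa_freq, habnida_freq])
assert abs(1.0 / combined - (1.0 / gamsa_freq + 1.0 / habnida_freq)) < 1e-6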

Binary file not shown.


@@ -0,0 +1,29 @@
;
; Configuration file of IPADIC
;
; $Id: dicrc,v 1.4 2006/04/08 06:41:36 taku-ku Exp $;
;
cost-factor = 800
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
eval-size = 8
unk-eval-size = 4
config-charset = UTF-8
; yomi
node-format-yomi = %pS%f[7]
unk-format-yomi = %M
eos-format-yomi = \n
; simple
node-format-simple = %m\t%F-[0,1,2,3]\n
eos-format-simple = EOS\n
; ChaSen
node-format-chasen = %m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen = %m\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen = EOS\n
; ChaSen (include spaces)
node-format-chasen2 = %M\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen2 = %M\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen2 = EOS\n
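The output formats defined above (yomi, simple, chasen) can be selected at runtime with MeCab's -O option. A minimal sketch, assuming mecab-python3 and libmecab are installed and that this dicrc is the one bundled at wordfreq/data/mecab-ja-ipadic:

# Sketch: ask MeCab for the 'yomi' (reading) output format defined above.
# Assumes mecab-python3, libmecab, and the bundled Japanese dictionary.
from pkg_resources import resource_filename
import MeCab

dicdir = resource_filename('wordfreq', 'data/mecab-ja-ipadic')
tagger = MeCab.Tagger('-Oyomi -d %s' % dicdir)
print(tagger.parse('ひらがな'))  # expected to print the katakana reading, e.g. ヒラガナ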

Binary file not shown.


@@ -0,0 +1 @@
c926154d533ccaef1515af6883056d69c34ca239

Binary file not shown.


@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Binary file not shown.


@@ -0,0 +1,25 @@
;
; Configuration file of mecab-ko-dic
;
# Scale factor used when converting weights into cost values. Values between
# 700 and 800 cause no problems.
cost-factor = 800
# Features for the beginning and end of a sentence, expressed as CSV.
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
# For known words, specifies how many features (counting from the first) must
# match to count as correct. Known words generally only need to match
# part-of-speech and conjugation information, so the "reading" and
# "pronunciation" features are ignored. Three features are evaluated here.
eval-size = 4
# For unknown words, specifies how many features (counting from the first)
# must match to count as correct.
unk-eval-size = 2
# Character set of the dicrc, char.def, unk.def, and pos-id.def files.
config-charset = UTF-8
# Increases the connection cost of parts of speech that have a space on their
# left. This setting is used only by mecab-ko. It has the following format:
# <posid 1>,<posid 1 penalty cost>,<posid 2>,<posid 2 penalty cost>...
#
# Example: 120,6000 => when the part of speech with posid 120 (a particle) has
# a space on its left, its connection cost is increased by 6000.
left-space-penalty-factor = 100,3000,120,6000,172,3000,183,3000,184,3000,185,3000,200,3000,210,6000,220,3000,221,3000,222,3000,230,3000
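The left-space-penalty-factor value is a flat list of (posid, penalty) pairs, as the comment above describes. A small sketch that decodes it, purely to illustrate the format; mecab-ko does this parsing internally:

# Decode left-space-penalty-factor into {posid: penalty}. Illustration only.
value = ("100,3000,120,6000,172,3000,183,3000,184,3000,185,3000,"
         "200,3000,210,6000,220,3000,221,3000,222,3000,230,3000")
fields = [int(x) for x in value.split(',')]
penalties = dict(zip(fields[0::2], fields[1::2]))
print(penalties[120])  # 6000: extra cost when a particle (posid 120) follows a space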

File diff suppressed because it is too large.


@@ -0,0 +1 @@
2dbb57fe707d7dddd2392526aad7cbac77378bb3


@@ -0,0 +1 @@
58619494b4f81190218b76d9d2090607830e51ec


@@ -0,0 +1,66 @@
UNKNOWN,*,*,*,*,*,*,*,* 0
*,*,*,*,Compound,*,*,*,* 1
*,*,*,*,Inflect,EC,*,*,* 200
*,*,*,*,Inflect,EF,*,*,* 200
*,*,*,*,Inflect,EP,*,*,* 200
*,*,*,*,Inflect,ETM,*,*,* 200
*,*,*,*,Inflect,ETN,*,*,* 200
*,*,*,*,Inflect,JC,*,*,* 210
*,*,*,*,Inflect,JKB,*,*,* 210
*,*,*,*,Inflect,JKC,*,*,* 210
*,*,*,*,Inflect,JKG,*,*,* 210
*,*,*,*,Inflect,JKO,*,*,* 210
*,*,*,*,Inflect,JKQ,*,*,* 210
*,*,*,*,Inflect,JKS,*,*,* 210
*,*,*,*,Inflect,JKV,*,*,* 210
*,*,*,*,Inflect,JX,*,*,* 210
*,*,*,*,Inflect,XSA,*,*,* 220
*,*,*,*,Inflect,XSN,*,*,* 221
*,*,*,*,Inflect,XSV,*,*,* 222
*,*,*,*,Inflect,VCP,*,*,* 230
*,*,*,*,Inflect,*,*,*,* 2
*,*,*,*,Preanalysis,*,*,*,* 3
EC,*,*,*,*,*,*,*,* 100
EF,*,*,*,*,*,*,*,* 100
EP,*,*,*,*,*,*,*,* 100
ETM,*,*,*,*,*,*,*,* 100
ETN,*,*,*,*,*,*,*,* 100
IC,*,*,*,*,*,*,*,* 110
JC,*,*,*,*,*,*,*,* 120
JKB,*,*,*,*,*,*,*,* 120
JKC,*,*,*,*,*,*,*,* 120
JKG,*,*,*,*,*,*,*,* 120
JKO,*,*,*,*,*,*,*,* 120
JKQ,*,*,*,*,*,*,*,* 120
JKS,*,*,*,*,*,*,*,* 120
JKV,*,*,*,*,*,*,*,* 120
JX,*,*,*,*,*,*,*,* 120
MAG,*,*,*,*,*,*,*,* 130
MAJ,*,*,*,*,*,*,*,* 131
MM,*,*,*,*,*,*,*,* 140
NNG,*,*,*,*,*,*,*,* 150
NNP,*,*,*,*,*,*,*,* 150
NNB,*,*,*,*,*,*,*,* 150
NNBC,*,*,*,*,*,*,*,* 150
NP,*,*,*,*,*,*,*,* 150
NR,*,*,*,*,*,*,*,* 150
SF,*,*,*,*,*,*,*,* 160
SH,*,*,*,*,*,*,*,* 161
SL,*,*,*,*,*,*,*,* 162
SN,*,*,*,*,*,*,*,* 163
SP,*,*,*,*,*,*,*,* 164
SSC,*,*,*,*,*,*,*,* 165
SSO,*,*,*,*,*,*,*,* 166
SC,*,*,*,*,*,*,*,* 167
SY,*,*,*,*,*,*,*,* 168
SE,*,*,*,*,*,*,*,* 169
VA,*,*,*,*,*,*,*,* 170
VCN,*,*,*,*,*,*,*,* 171
VCP,*,*,*,*,*,*,*,* 172
VV,*,*,*,*,*,*,*,* 173
VX,*,*,*,*,*,*,*,* 174
XPN,*,*,*,*,*,*,*,* 181
XR,*,*,*,*,*,*,*,* 182
XSA,*,*,*,*,*,*,*,* 183
XSN,*,*,*,*,*,*,*,* 184
XSV,*,*,*,*,*,*,*,* 185


@@ -0,0 +1,51 @@
# Feature(POS) to Internal State mapping
#
# Maps each feature (part of speech, etc.) onto an internal-state feature.
#
# The CRF uses three kinds of features: unigram, left bigram, and right bigram.
#
# Pattern matching on the left-hand side of each rule:
#   *           : matches anything
#   (AB|CD|EF)  : matches AB, CD, or EF
#   AB          : matches AB only
#
# In the rewritten feature on the right-hand side, $1, $2, $3... refer to the
# 1st, 2nd, 3rd... fields of the original feature (expressed as CSV).
#
# Unigram
[unigram rewrite]
*,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,$9
# Left bigram: uses fields ($2) and ($1,$2,*,$4) of the feature.
# e.g. /NNG,T,*,*,*,*,* + /J,*,,*,*,*,*
#
[left rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
*,*,*,*,Inflect,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,* $6,$2,*,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $6,$2,*,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,*,*,*,*,*,* $1,$2,*,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,*,*,*,*,*,*,*
# Right bigram: uses fields ($3) and ($1,$2,$3,$4) of the feature.
# ex) /NN,T,*,*,*,*,* + /J,T,,*,*,*,*
[right rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SL,*,*,*,*,*,*,*,* NNG,$2,$3,*,*,*,*,*,*
*,*,*,*,Inflect,*,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,* $7,$2,$3,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $7,$2,$3,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,*,*,*,*,*,*,* $1,$2,$3,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,$3,*,*,*,*,*,*

File diff suppressed because it is too large.


@@ -0,0 +1 @@
9655d23c3a0900764cbcdb8d8395d0f09ec098ed

Binary file not shown.


@@ -1,21 +0,0 @@
import MeCab
import unicodedata
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
# Tagger.
MECAB_ANALYZER = MeCab.Tagger()
def mecab_tokenize(text):
"""
Use the mecab-python3 package to tokenize the given Japanese text.
The simplest output from mecab-python3 is the single-string form, which
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text).split('\n')
if line != '' and line != 'EOS']

wordfreq/mecab.py (new file, 28 lines)

@@ -0,0 +1,28 @@
from pkg_resources import resource_filename
import MeCab
import unicodedata
# Instantiate the MeCab analyzers for each language.
MECAB_ANALYZERS = {
'ja': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ja-ipadic')),
'ko': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ko-dic'))
}
def mecab_tokenize(text, lang):
"""
Use the mecab-python3 package to tokenize the given text. The `lang`
must be 'ja' for Japanese or 'ko' for Korean.
The simplest output from mecab-python3 is the single-string form, which
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
if lang not in MECAB_ANALYZERS:
raise ValueError("Can't run MeCab on language %r" % lang)
analyzer = MECAB_ANALYZERS[lang]
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0]
for line in analyzer.parse(text).split('\n')
if line != '' and line != 'EOS']
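A brief usage sketch for the new module; it assumes mecab-python3 and libmecab are installed so that the bundled dictionaries can load. The Korean output shown is the one asserted in tests/test_korean.py.

# Usage sketch for wordfreq.mecab (requires mecab-python3 and libmecab).
from wordfreq.mecab import mecab_tokenize

print(mecab_tokenize('감사합니다', 'ko'))  # ['감사', '합니다'], per tests/test_korean.py
print(mecab_tokenize('カタカナ', 'ja'))    # tokenized with the bundled Japanese dictionary

try:
    mecab_tokenize('hello', 'en')          # only 'ja' and 'ko' are supported
except ValueError as err:
    print(err)                             # "Can't run MeCab on language 'en'"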


@@ -2,6 +2,9 @@ import regex
import unicodedata
mecab_tokenize = None
jieba_tokenize = None
# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
@@ -23,7 +26,6 @@ def _make_spaceless_expr():
SPACELESS_EXPR = _make_spaceless_expr()
TOKEN_RE = regex.compile(r"""
# Case 1: a special case for non-spaced languages
# -----------------------------------------------
@@ -74,7 +76,7 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
\S(?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text, include_punctuation=False):
@@ -98,6 +100,13 @@ def simple_tokenize(text, include_punctuation=False):
tokens.
- It breaks on all spaces, even the "non-breaking" ones.
- It aims to keep marks together with words, so that they aren't erroneously
split off as punctuation in languages such as Hindi.
- It keeps Southeast Asian scripts, such as Thai, glued together. This yields
tokens that are much too long, but the alternative is that every character
would end up in its own token, which is worse.
"""
text = unicodedata.normalize('NFC', text)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
@@ -114,20 +123,20 @@ def turkish_tokenize(text, include_punctuation=False):
return [token.strip("'").casefold() for token in token_expr.findall(text)]
mecab_tokenize = None
def japanese_tokenize(text, include_punctuation=False):
def tokenize_mecab_language(text, lang, include_punctuation=False):
"""
Tokenize Japanese text, initializing the MeCab tokenizer if necessary.
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
"""
global mecab_tokenize
if lang not in {'ja', 'ko'}:
raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
tokens = mecab_tokenize(text)
from wordfreq.mecab import mecab_tokenize
tokens = mecab_tokenize(text, lang)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
jieba_tokenize = None
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
"""
Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
@@ -140,16 +149,16 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_arabic_marks(text):
def remove_marks(text):
"""
Remove decorations from Arabic words:
Remove decorations from words in abjad scripts:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify a
word.
- Tatweels, horizontal segments that are used to extend or justify an
Arabic word.
"""
return ARABIC_MARK_RE.sub('', text)
return MARK_RE.sub('', text)
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
@@ -158,30 +167,68 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
the language. Strings that are looked up in wordfreq will be run through
this function first, so that they can be expected to match the data.
Here is what the tokenizer will do, depending on the language:
Some of the processing steps are specific to one language, such as Chinese,
but what broadly happens to the text depends on what general writing system
the language uses, out of these categories:
- Chinese will be mapped to Simplified Chinese characters and tokenized
using the Jieba tokenizer, trained on a custom word list of words that
can be looked up in wordfreq.
- Alphabetic scripts: English, Spanish, Russian, etc.
- Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
- CJK scripts: Chinese, Japanese, Korean
- Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
- Japanese will be delegated to the external mecab-python module. It will
be NFKC normalized, which is stronger than NFC normalization.
- Chinese or Japanese texts that aren't identified as the appropriate
language will only split on punctuation and script boundaries, giving
you untokenized globs of characters that probably represent many words.
Alphabetic scripts
------------------
- Arabic will be NFKC normalized, and will have Arabic-specific combining
marks and tatweels removed.
The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
European languages, which are relatively straightforward to tokenize.
- Languages written in cased alphabets will be case-folded to lowercase.
Text in these scripts will be normalized to NFC form, then passed
through a regular expression that implements the Word Segmentation section
of Unicode Annex #29, and then case-folded to lowercase.
- Turkish will use a different case-folding procedure, so that capital
I and İ map to ı and i respectively.
The effect is mostly to split the text on spaces and punctuation. There are
some subtleties involving apostrophes inside words, which the regex will
only split when they occur before a vowel. ("Hasn't" is one token, but
"l'enfant" is two.)
- Languages besides Japanese and Chinese will be tokenized using a regex
that mostly implements the Word Segmentation section of Unicode Annex
#29. See `simple_tokenize` for details.
If the language is Turkish, the case-folding rules will take this into
account, so that capital I and İ map to ı and i respectively.
Abjad scripts
-------------
Languages in the Arabic or Hebrew scripts are written with optional vowel
marks, and sometimes other decorative markings and ligatures. In these
languages:
- The text will be NFKC-normalized, which is a stronger and lossier form
than NFC. Here its purpose is to reduce ligatures to simpler characters.
- Marks will be removed, as well as the Arabic tatweel (an extension of
a word that is used for justification or decoration).
After these steps, the text will go through the same process as the
alphabetic scripts above.
CJK scripts
-----------
In the CJK languages, word boundaries can't usually be identified by a
regular expression. Instead, there needs to be some language-specific
handling.
- Chinese text first gets converted to a canonical representation we call
"Oversimplified Chinese", where all characters are replaced by their
Simplified Chinese form, no matter what, even when this misspells a word or
a name. This representation is then tokenized using the Jieba tokenizer,
trained on the list of Chinese words that can be looked up in wordfreq.
- Japanese and Korean will be NFKC-normalized, then tokenized using the
MeCab tokenizer, using dictionary files that are included in this
package.
The `external_wordlist` option only affects Chinese tokenization. If it's
True, then wordfreq will not use its own Chinese wordlist for tokenization.
@@ -189,15 +236,36 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
and it will leave Traditional Chinese characters as is. This will probably
give more accurate tokenization, but the resulting tokens won't necessarily
have word frequencies that can be looked up.
If you end up seeing tokens that are entire phrases or sentences glued
together, that probably means you passed in CJK text with the wrong
language code.
Brahmic scripts and other languages
-----------------------------------
Any kind of language not previously mentioned will just go through the same
tokenizer that alphabetic languages use.
We've tweaked this tokenizer for the case of Indic languages in Brahmic
scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
languages where the default Unicode algorithm wouldn't quite work.
Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
does not support these languages yet. It will split on spaces and
punctuation, giving tokens that are far too long.
"""
if lang == 'ja':
return japanese_tokenize(text, include_punctuation)
if lang == 'ja' or lang == 'ko':
return tokenize_mecab_language(text, lang, include_punctuation)
elif lang == 'zh':
return chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr':
return turkish_tokenize(text, include_punctuation)
elif lang == 'ar':
text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
elif lang in {'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'}:
# Abjad languages
text = remove_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text, include_punctuation)
else:
return simple_tokenize(text, include_punctuation)
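To tie the branches above back to the tests added in this commit, a short usage sketch follows; the Korean line assumes the optional MeCab dependency is installed, and the expected outputs are the ones asserted in the test files.

from wordfreq import tokenize

# Abjad scripts: NFKC normalization, then marks and tatweels are removed.
print(tokenize('متــــــــعب', 'ar'))     # ['متعب'] (tatweels stripped)
print(tokenize('דֻּגְמָה', 'he'))            # ['דגמה'] (vowel points stripped)

# CJK: Japanese and Korean now share the MeCab path (requires mecab-python3).
print(tokenize('감사합니다', 'ko'))        # ['감사', '합니다']

# Brahmic scripts split on spaces; Thai is left glued together for now.
print(tokenize('हिन्दी विक्षनरी', 'hi'))   # ['हिन्दी', 'विक्षनरी']
print(tokenize('การเล่นดนตรี', 'th'))      # ['การเล่นดนตรี']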