Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* Add a Hebrew tokenization test

* Fix terminology in docstrings about abjad scripts

* Combine Japanese and Korean tokenization into the same function
Rob Speer 2016-07-15 15:10:25 -04:00 committed by Lance Nathan
parent 270f6c7ca6
commit fec6eddcc3
24 changed files with 7054 additions and 62 deletions


@ -1,3 +1,8 @@
recursive-include wordfreq/data *.gz
include README.md
recursive-include wordfreq/data *.txt
recursive-include wordfreq/data *.bin
recursive-include wordfreq/data *.def
recursive-include wordfreq/data *.dic
recursive-include wordfreq/data dicrc
recursive-include wordfreq/data COPYING


@ -276,7 +276,11 @@ The terms of use of this data are:
acknowledgement of Google Books Ngram Viewer as the source, and inclusion
of a link to http://books.google.com/ngrams, would be appreciated.
It also contains data derived from the following Creative Commons-licensed
`wordfreq` uses MeCab, by Taku Kudo, plus Korean data files by Yongwoon Lee and
Yungho Yu. The Korean data is under an Apache 2 license, a copy of which
appears in wordfreq/data/mecab-ko-dic/COPYING.
`wordfreq` also contains data derived from the following Creative Commons-licensed
sources:
- The Leeds Internet Corpus, from the University of Leeds Centre for Translation


@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.4.1',
version='1.4.2',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@ -47,10 +47,10 @@ setup(
include_package_data=True,
install_requires=dependencies,
# mecab-python3 is required for looking up Japanese word frequencies. In
# turn, it depends on libmecab-dev being installed on the system. It's not
# listed under 'install_requires' because wordfreq should be usable in
# other languages without it.
# mecab-python3 is required for looking up Japanese or Korean word
# frequencies. In turn, it depends on libmecab-dev being installed on the
# system. It's not listed under 'install_requires' because wordfreq should
# be usable in other languages without it.
#
# Similarly, jieba is required for Chinese word frequencies.
extras_require={


@ -152,7 +152,7 @@ def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
def test_arabic():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
@ -183,6 +183,7 @@ def test_ideographic_fallback():
['ひらがな', 'カタカナ', 'romaji']
)
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
@ -194,3 +195,7 @@ def test_ideographic_fallback():
# Test Hindi -- tokens split where there are spaces, and not where there aren't
eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
# Remove vowel points in Hebrew
eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])

tests/test_korean.py (new file, 22 lines)

@ -0,0 +1,22 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
def test_tokens():
eq_(tokenize('감사합니다', 'ko'),
['감사', '합니다'])
def test_combination():
gamsa_freq = word_frequency('감사', 'ko')
habnida_freq = word_frequency('합니다', 'ko')
assert_almost_equal(
word_frequency('감사감사', 'ko'),
gamsa_freq / 2
)
assert_almost_equal(
1.0 / word_frequency('감사합니다', 'ko'),
1.0 / gamsa_freq + 1.0 / habnida_freq
)
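
The second test encodes the rule wordfreq uses to combine the frequencies of a multi-token phrase: the combined frequency is the reciprocal of the sum of the tokens' reciprocal frequencies, so a phrase can never be more frequent than its rarest token. Below is a minimal sketch of that rule, assuming only what the test asserts; the helper name combined_frequency is hypothetical and not part of wordfreq.

from wordfreq import word_frequency

def combined_frequency(tokens, lang):
    # Hypothetical helper: combine token frequencies the way the test above
    # expects, i.e. 1 / f_combined = sum over tokens of 1 / f_token.
    return 1.0 / sum(1.0 / word_frequency(token, lang) for token in tokens)

# With the wordlist used in test_combination, combined_frequency(['감사', '합니다'], 'ko')
# is approximately word_frequency('감사합니다', 'ko'), and
# combined_frequency(['감사', '감사'], 'ko') equals word_frequency('감사', 'ko') / 2.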

Binary file not shown.


@ -0,0 +1,29 @@
;
; Configuration file of IPADIC
;
; $Id: dicrc,v 1.4 2006/04/08 06:41:36 taku-ku Exp $;
;
cost-factor = 800
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
eval-size = 8
unk-eval-size = 4
config-charset = UTF-8
; yomi
node-format-yomi = %pS%f[7]
unk-format-yomi = %M
eos-format-yomi = \n
; simple
node-format-simple = %m\t%F-[0,1,2,3]\n
eos-format-simple = EOS\n
; ChaSen
node-format-chasen = %m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen = %m\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen = EOS\n
; ChaSen (include spaces)
node-format-chasen2 = %M\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n
unk-format-chasen2 = %M\t%m\t%m\t%F-[0,1,2,3]\t\t\n
eos-format-chasen2 = EOS\n

Binary file not shown.

Binary file not shown.

Binary file not shown.


@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Binary file not shown.


@ -0,0 +1,25 @@
;
; Configuration file of mecab-ko-dic
;
# Scaling factor applied when converting scores to cost values. Values
# between 700 and 800 cause no problems.
cost-factor = 800
# Features for the beginning and the end of a sentence, expressed as CSV.
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
# For known words, specifies how many features, counted from the first one,
# must match to be considered correct. Known words generally only need to
# match information such as part of speech and conjugation, so the "reading"
# and "pronunciation" features are ignored. Here, 3 of them are evaluated.
eval-size = 4
# For unknown words, specifies how many features, counted from the first
# one, must match to be considered correct.
unk-eval-size = 2
# Character set of the dicrc, char.def, unk.def, and pos-id.def files.
config-charset = UTF-8
# Setting that increases the connection cost of parts of speech that have a
# space on their left. It is used only by mecab-ko and has the following
# format:
# <posid 1>,<posid 1 penalty cost>,<posid 2>,<posid 2 penalty cost>...
#
# e.g. 120,6000 => if the part of speech with posid 120 (a particle) has a
# space on its left, increase its connection cost by 6000
left-space-penalty-factor = 100,3000,120,6000,172,3000,183,3000,184,3000,185,3000,200,3000,210,6000,220,3000,221,3000,222,3000,230,3000

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.


@ -0,0 +1,66 @@
UNKNOWN,*,*,*,*,*,*,*,* 0
*,*,*,*,Compound,*,*,*,* 1
*,*,*,*,Inflect,EC,*,*,* 200
*,*,*,*,Inflect,EF,*,*,* 200
*,*,*,*,Inflect,EP,*,*,* 200
*,*,*,*,Inflect,ETM,*,*,* 200
*,*,*,*,Inflect,ETN,*,*,* 200
*,*,*,*,Inflect,JC,*,*,* 210
*,*,*,*,Inflect,JKB,*,*,* 210
*,*,*,*,Inflect,JKC,*,*,* 210
*,*,*,*,Inflect,JKG,*,*,* 210
*,*,*,*,Inflect,JKO,*,*,* 210
*,*,*,*,Inflect,JKQ,*,*,* 210
*,*,*,*,Inflect,JKS,*,*,* 210
*,*,*,*,Inflect,JKV,*,*,* 210
*,*,*,*,Inflect,JX,*,*,* 210
*,*,*,*,Inflect,XSA,*,*,* 220
*,*,*,*,Inflect,XSN,*,*,* 221
*,*,*,*,Inflect,XSV,*,*,* 222
*,*,*,*,Inflect,VCP,*,*,* 230
*,*,*,*,Inflect,*,*,*,* 2
*,*,*,*,Preanalysis,*,*,*,* 3
EC,*,*,*,*,*,*,*,* 100
EF,*,*,*,*,*,*,*,* 100
EP,*,*,*,*,*,*,*,* 100
ETM,*,*,*,*,*,*,*,* 100
ETN,*,*,*,*,*,*,*,* 100
IC,*,*,*,*,*,*,*,* 110
JC,*,*,*,*,*,*,*,* 120
JKB,*,*,*,*,*,*,*,* 120
JKC,*,*,*,*,*,*,*,* 120
JKG,*,*,*,*,*,*,*,* 120
JKO,*,*,*,*,*,*,*,* 120
JKQ,*,*,*,*,*,*,*,* 120
JKS,*,*,*,*,*,*,*,* 120
JKV,*,*,*,*,*,*,*,* 120
JX,*,*,*,*,*,*,*,* 120
MAG,*,*,*,*,*,*,*,* 130
MAJ,*,*,*,*,*,*,*,* 131
MM,*,*,*,*,*,*,*,* 140
NNG,*,*,*,*,*,*,*,* 150
NNP,*,*,*,*,*,*,*,* 150
NNB,*,*,*,*,*,*,*,* 150
NNBC,*,*,*,*,*,*,*,* 150
NP,*,*,*,*,*,*,*,* 150
NR,*,*,*,*,*,*,*,* 150
SF,*,*,*,*,*,*,*,* 160
SH,*,*,*,*,*,*,*,* 161
SL,*,*,*,*,*,*,*,* 162
SN,*,*,*,*,*,*,*,* 163
SP,*,*,*,*,*,*,*,* 164
SSC,*,*,*,*,*,*,*,* 165
SSO,*,*,*,*,*,*,*,* 166
SC,*,*,*,*,*,*,*,* 167
SY,*,*,*,*,*,*,*,* 168
SE,*,*,*,*,*,*,*,* 169
VA,*,*,*,*,*,*,*,* 170
VCN,*,*,*,*,*,*,*,* 171
VCP,*,*,*,*,*,*,*,* 172
VV,*,*,*,*,*,*,*,* 173
VX,*,*,*,*,*,*,*,* 174
XPN,*,*,*,*,*,*,*,* 181
XR,*,*,*,*,*,*,*,* 182
XSA,*,*,*,*,*,*,*,* 183
XSN,*,*,*,*,*,*,*,* 184
XSV,*,*,*,*,*,*,*,* 185


@ -0,0 +1,51 @@
# Feature(POS) to Internal State mapping
#
# Maps the feature (part-of-speech) representation of each entry to the
# internal states used by the CRF model. The CRF uses three kinds of
# feature templates: unigram, left-context bigram, and right-context
# bigram, corresponding to the three sections below.
#
# Pattern syntax on the left-hand side of a rule:
#   *           : matches any value
#   (AB|CD|EF)  : matches any of AB, CD, EF
#   AB          : matches only AB
#
# On the right-hand side, $1, $2, $3, ... copy the corresponding matched
# (CSV) feature fields.
#
# Unigram
[unigram rewrite]
*,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,$9
# bigram: left-context features
# Only some of the matched fields (e.g. $1,$2,*,$4) are kept in the
# left-context state.
#
[left rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
*,*,*,*,Inflect,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,* $6,$2,*,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $6,$2,*,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|VCP|ETM|XSN),*,*,*,*,*,*,*,* $1,$2,*,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,*,*,*,*,*,*,*
# bigram: right-context features
# Only some of the matched fields (e.g. $1,$2,$3,$4) are kept in the
# right-context state.
[right rewrite]
BOS/EOS,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SF,*,*,*,*,*,*,*,* $1,$2,$3,$4,$5,$6,$7,$8,BOS/EOS
SL,*,*,*,*,*,*,*,* NNG,$2,$3,*,*,*,*,*,*
*,*,*,*,Inflect,*,(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,* $7,$2,$3,$4,*,*,*,*,*
*,*,*,*,(Inflect|Preanalysis),*,*,*,* $7,$2,$3,*,*,*,*,*,*
(JC|JKB|JKC|JKG|JKO|JKQ|JKS|JKV|JX|NNB|NNBC|XSN),*,*,*,*,*,*,*,* $1,$2,$3,$4,*,*,*,*,*
*,*,*,*,*,*,*,*,* $1,$2,$3,*,*,*,*,*,*

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.


@ -1,21 +0,0 @@
import MeCab
import unicodedata
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
# Tagger.
MECAB_ANALYZER = MeCab.Tagger()
def mecab_tokenize(text):
"""
Use the mecab-python3 package to tokenize the given Japanese text.
The simplest output from mecab-python3 is the single-string form, which
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text).split('\n')
if line != '' and line != 'EOS']

wordfreq/mecab.py (new file, 28 lines)

@ -0,0 +1,28 @@
from pkg_resources import resource_filename
import MeCab
import unicodedata
# Instantiate the MeCab analyzers for each language.
MECAB_ANALYZERS = {
'ja': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ja-ipadic')),
'ko': MeCab.Tagger('-d %s' % resource_filename('wordfreq', 'data/mecab-ko-dic'))
}
def mecab_tokenize(text, lang):
"""
Use the mecab-python3 package to tokenize the given text. The `lang`
must be 'ja' for Japanese or 'ko' for Korean.
The simplest output from mecab-python3 is the single-string form, which
contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table.
"""
if lang not in MECAB_ANALYZERS:
raise ValueError("Can't run MeCab on language %r" % lang)
analyzer = MECAB_ANALYZERS[lang]
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0]
for line in analyzer.parse(text).split('\n')
if line != '' and line != 'EOS']
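
As a quick usage sketch of the new module: the Korean output matches tests/test_korean.py, and the ValueError comes from the language check above.

from wordfreq.mecab import mecab_tokenize

print(mecab_tokenize('감사합니다', 'ko'))   # ['감사', '합니다']
mecab_tokenize('hello', 'en')               # raises ValueError: Can't run MeCab on language 'en'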


@ -2,6 +2,9 @@ import regex
import unicodedata
mecab_tokenize = None
jieba_tokenize = None
# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
@ -23,7 +26,6 @@ def _make_spaceless_expr():
SPACELESS_EXPR = _make_spaceless_expr()
TOKEN_RE = regex.compile(r"""
# Case 1: a special case for non-spaced languages
# -----------------------------------------------
@ -74,7 +76,7 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
\S(?:\B\S|\p{M})*
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text, include_punctuation=False):
@ -98,6 +100,13 @@ def simple_tokenize(text, include_punctuation=False):
tokens.
- It breaks on all spaces, even the "non-breaking" ones.
- It aims to keep marks together with words, so that they aren't erroneously
split off as punctuation in languages such as Hindi.
- It keeps Southeast Asian scripts, such as Thai, glued together. This yields
tokens that are much too long, but the alternative is that every character
would end up in its own token, which is worse.
"""
text = unicodedata.normalize('NFC', text)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
@ -114,20 +123,20 @@ def turkish_tokenize(text, include_punctuation=False):
return [token.strip("'").casefold() for token in token_expr.findall(text)]
mecab_tokenize = None
def japanese_tokenize(text, include_punctuation=False):
def tokenize_mecab_language(text, lang, include_punctuation=False):
"""
Tokenize Japanese text, initializing the MeCab tokenizer if necessary.
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
"""
global mecab_tokenize
if lang not in {'ja', 'ko'}:
raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
tokens = mecab_tokenize(text)
from wordfreq.mecab import mecab_tokenize
tokens = mecab_tokenize(text, lang)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
jieba_tokenize = None
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
"""
Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
@ -140,16 +149,16 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_arabic_marks(text):
def remove_marks(text):
"""
Remove decorations from Arabic words:
Remove decorations from words in abjad scripts:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify a
word.
- Tatweels, horizontal segments that are used to extend or justify an
Arabic word.
"""
return ARABIC_MARK_RE.sub('', text)
return MARK_RE.sub('', text)
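
A brief illustration of what the renamed function does, shown through the public tokenize() entry point with the strings from the tests in this commit; the Arabic result is implied by the test's "Remove tatweels" comment rather than spelled out in the hunk above.

from wordfreq import tokenize

print(tokenize('דֻּגְמָה', 'he'))        # ['דגמה'] -- Hebrew vowel points removed
print(tokenize('متــــــــعب', 'ar'))    # ['متعب'] -- tatweels removed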
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
@ -158,30 +167,68 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
the language. Strings that are looked up in wordfreq will be run through
this function first, so that they can be expected to match the data.
Here is what the tokenizer will do, depending on the language:
Some of the processing steps are specific to one language, such as Chinese,
but what broadly happens to the text depends on what general writing system
the language uses, out of these categories:
- Chinese will be mapped to Simplified Chinese characters and tokenized
using the Jieba tokenizer, trained on a custom word list of words that
can be looked up in wordfreq.
- Alphabetic scripts: English, Spanish, Russian, etc.
- Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
- CJK scripts: Chinese, Japanese, Korean
- Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
- Japanese will be delegated to the external mecab-python module. It will
be NFKC normalized, which is stronger than NFC normalization.
- Chinese or Japanese texts that aren't identified as the appropriate
language will only split on punctuation and script boundaries, giving
you untokenized globs of characters that probably represent many words.
Alphabetic scripts
------------------
- Arabic will be NFKC normalized, and will have Arabic-specific combining
marks and tatweels removed.
The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
European languages, which are relatively straightforward to tokenize.
- Languages written in cased alphabets will be case-folded to lowercase.
Text in these scripts will be normalized to NFC form, then passed
through a regular expression that implements the Word Segmentation section
of Unicode Annex #29, and then case-folded to lowercase.
- Turkish will use a different case-folding procedure, so that capital
I and İ map to ı and i respectively.
The effect is mostly to split the text on spaces and punctuation. There are
some subtleties involving apostrophes inside words, which the regex will
only split when they occur before a vowel. ("Hasn't" is one token, but
"l'enfant" is two.)
- Languages besides Japanese and Chinese will be tokenized using a regex
that mostly implements the Word Segmentation section of Unicode Annex
#29. See `simple_tokenize` for details.
If the language is Turkish, the case-folding rules will take this into
account, so that capital I and İ map to ı and i respectively.
Abjad scripts
-------------
Languages in the Arabic or Hebrew scripts are written with optional vowel
marks, and sometimes other decorative markings and ligatures. In these
languages:
- The text will be NFKC-normalized, which is a stronger and lossier form
than NFC. Here its purpose is to reduce ligatures to simpler characters.
- Marks will be removed, as well as the Arabic tatweel (an extension of
a word that is used for justification or decoration).
After these steps, the text will go through the same process as the
alphabetic scripts above.
CJK scripts
-----------
In the CJK languages, word boundaries can't usually be identified by a
regular expression. Instead, there needs to be some language-specific
handling.
- Chinese text first gets converted to a canonical representation we call
"Oversimplified Chinese", where all characters are replaced by their
Simplified Chinese form, no matter what, even when this misspells a word or
a name. This representation is then tokenized using the Jieba tokenizer,
trained on the list of Chinese words that can be looked up in wordfreq.
- Japanese and Korean will be NFKC-normalized, then tokenized using the
MeCab tokenizer, using dictionary files that are included in this
package.
The `external_wordlist` option only affects Chinese tokenization. If it's
True, then wordfreq will not use its own Chinese wordlist for tokenization.
@ -189,15 +236,36 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
and it will leave Traditional Chinese characters as is. This will probably
give more accurate tokenization, but the resulting tokens won't necessarily
have word frequencies that can be looked up.
If you end up seeing tokens that are entire phrases or sentences glued
together, that probably means you passed in CJK text with the wrong
language code.
Brahmic scripts and other languages
-----------------------------------
Any kind of language not previously mentioned will just go through the same
tokenizer that alphabetic languages use.
We've tweaked this tokenizer for the case of Indic languages in Brahmic
scripts, such as Hindi, Tamil, and Telugu, so that we can handle these
languages where the default Unicode algorithm wouldn't quite work.
Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
does not support these languages yet. It will split on spaces and
punctuation, giving tokens that are far too long.
"""
if lang == 'ja':
return japanese_tokenize(text, include_punctuation)
if lang == 'ja' or lang == 'ko':
return tokenize_mecab_language(text, lang, include_punctuation)
elif lang == 'zh':
return chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr':
return turkish_tokenize(text, include_punctuation)
elif lang == 'ar':
text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
elif lang in {'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'}:
# Abjad languages
text = remove_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text, include_punctuation)
else:
return simple_tokenize(text, include_punctuation)
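
To round out the dispatch above, a short usage sketch. The Korean and Hebrew results come from the tests in this commit; the French, English, and Turkish lines restate behavior described in the docstring and should be read as assumptions rather than additional test cases.

from wordfreq import tokenize

tokenize('감사합니다', 'ko')    # ['감사', '합니다'] -- MeCab path, shared with Japanese
tokenize('דֻּגְמָה', 'he')       # ['דגמה'] -- abjad path: NFKC normalization plus remove_marks
tokenize("l'enfant", 'fr')      # ['l', 'enfant'] -- apostrophe splits before a vowel
tokenize("Hasn't", 'en')        # ["hasn't"] -- apostrophe kept inside the word
tokenize('İSTANBUL', 'tr')      # ['istanbul'] -- Turkish case-folding maps İ to i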