Merge pull request #28 from LuminosoInsight/chinese-external-wordlist

Add some tokenizer options

Former-commit-id: ca00dfa1d9
Lance Nathan 2015-10-19 18:21:52 -04:00
commit f4d865c0be
7 changed files with 349200 additions and 69 deletions

View File

@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:
python3 setup.py install
To handle word frequency lookups in Japanese, you need to additionally install
mecab-python3, which itself depends on libmecab-dev. These commands will
install them on Ubuntu:
Japanese and Chinese have additional external dependencies so that they can be
tokenized correctly.
To be able to look up word frequencies in Japanese, you need to additionally
install mecab-python3, which itself depends on libmecab-dev and its dictionary.
These commands will install them on Ubuntu:
sudo apt-get install mecab-ipadic-utf8 libmecab-dev
pip3 install mecab-python3
To be able to look up word frequencies in Chinese, you need Jieba, a
pure-Python Chinese tokenizer:
pip3 install jieba
These dependencies can also be requested as options when installing wordfreq.
For example:
pip3 install wordfreq[mecab,jieba]
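As a quick sanity check after installing these extras, you can look up a word in
each language. This is a minimal sketch; the example words are arbitrary and the
exact frequency values will vary by wordfreq version:

    from wordfreq import word_frequency

    # Both calls should return a small positive frequency if the optional
    # dependencies are installed correctly.
    print(word_frequency('谢谢', 'zh'))  # Chinese lookups need jieba
    print(word_frequency('今日', 'ja'))  # Japanese lookups need mecab-python3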
## Usage

View File

@ -21,17 +21,19 @@ def test_languages():
avail = available_languages()
assert_greater(len(avail), 15)
# Laughter is the universal language
# Laughter is the universal language. Look up either 'lol' or '笑' in each
# language and make sure it has a non-zero frequency.
for lang in avail:
if lang not in {'zh', 'ja'}:
# we do not have enough Chinese data
# Japanese people do not lol
assert_greater(word_frequency('lol', lang), 0)
if lang in {'zh', 'ja'}:
text = '笑'
else:
text = 'lol'
assert_greater(word_frequency(text, lang), 0)
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency('lol', new_lang_code), 0)
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency(text, new_lang_code), 0)
def test_twitter():
@ -98,6 +100,9 @@ def test_tokenization():
# data
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
# Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'),
@ -108,6 +113,9 @@ def test_tokenization():
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
['this', 'text', 'has', '...', 'punctuation', ':)'])
def test_casefolding():
eq_(tokenize('WEISS', 'de'), ['weiss'])

View File

@ -25,12 +25,33 @@ def test_tokens():
eq_(
tokenize(fact_simplified, 'zh'),
[
# he / is / in history / #6 / counter for people
'他', '是', '历史上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
# he / is / in history / #6 / counter for people
'他', '是', '历史上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)
# Jieba's original tokenizer knows a lot of names, it seems.
eq_(
tokenize(hobart, 'zh', external_wordlist=True),
['加勒特', '霍巴特']
)
# We get almost the same tokens from the sentence using Jieba's own
# wordlist, but it tokenizes "in history" as two words and
# "sixth person" as one.
eq_(
tokenize(fact_simplified, 'zh', external_wordlist=True),
[
# he / is / history / in / sixth person
'他', '是', '历史', '上', '第六位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)

View File

@ -4,9 +4,11 @@ import msgpack
import gzip
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
jieba_tokenizer = None
jieba_orig_tokenizer = None
def simplify_chinese(text):
@ -23,17 +25,28 @@ def simplify_chinese(text):
return text.translate(SIMPLIFIED_MAP).casefold()
def jieba_tokenize(text):
def jieba_tokenize(text, external_wordlist=False):
"""
Tokenize the given text into tokens whose word frequencies can probably
be looked up. This uses Jieba, a word-frequency-based tokenizer.
We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
to infer unknown words using a hidden Markov model. This ensures that the
multi-character tokens that it outputs will be ones whose word frequencies
we can look up.
If `external_wordlist` is False, we tell Jieba to default to using
wordfreq's own Chinese wordlist, and not to infer unknown words using a
hidden Markov model. This ensures that the multi-character tokens that it
outputs will be ones whose word frequencies we can look up.
If `external_wordlist` is True, this will use the largest version of
Jieba's original dictionary, with HMM enabled, so its results will be
independent of the data in wordfreq. These results will be better optimized
for purposes that aren't looking up word frequencies, such as general-
purpose tokenization, or collecting word frequencies in the first place.
"""
global jieba_tokenizer
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
global jieba_tokenizer, jieba_orig_tokenizer
if external_wordlist:
if jieba_orig_tokenizer is None:
jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
return jieba_orig_tokenizer.lcut(text)
else:
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
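As a hedged illustration of the two modes, the sketch below calls jieba_tokenize
directly. The example sentence is reconstructed from the test tokens above, and
the exact segmentations are expectations rather than guarantees:

    from wordfreq.chinese import jieba_tokenize

    # "The sixth U.S. vice president in history to die in office",
    # pieced together from the simplified-Chinese test tokens above.
    sentence = '他是历史上第六位在任期内去世的美国副总统'

    # Default mode: wordfreq's own wordlist with HMM disabled, so the
    # multi-character tokens should all have lookupable frequencies.
    print(jieba_tokenize(sentence))

    # External-wordlist mode: Jieba's original dictionary with HMM enabled,
    # better for general segmentation but independent of wordfreq's data.
    print(jieba_tokenize(sentence, external_wordlist=True))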

349046
wordfreq/data/jieba_zh_orig.txt Normal file

File diff suppressed because it is too large

View File

@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
(?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)
TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
[\p{IsIdeo}\p{Script=Hiragana}]+ |
[\p{punct}]+ |
\S(?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)
ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text):
def simple_tokenize(text, include_punctuation=False):
"""
Tokenize the given text using a straightforward, Unicode-aware token
expression.
@ -56,22 +62,46 @@ def simple_tokenize(text):
ideograms and hiragana) relatively untokenized, instead of splitting each
character into its own token.
- It outputs only the tokens that start with a word-like character, or
miscellaneous symbols such as emoji.
- If `include_punctuation` is False (the default), it outputs only the
tokens that start with a word-like character, or miscellaneous symbols
such as emoji. If `include_punctuation` is True, it outputs all non-space
tokens.
- It breaks on all spaces, even the "non-breaking" ones.
"""
text = unicodedata.normalize('NFC', text)
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.strip("'").casefold() for token in token_expr.findall(text)]
def turkish_tokenize(text):
def turkish_tokenize(text, include_punctuation=False):
"""
Like `simple_tokenize`, but modifies i's so that they case-fold correctly
in Turkish.
"""
text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.strip("'").casefold() for token in token_expr.findall(text)]
mecab_tokenize = None
def japanese_tokenize(text, include_punctuation=False):
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
tokens = mecab_tokenize(text)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
jieba_tokenize = None
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
return [token.casefold() for token in tokens if token_expr.match(token)]
def remove_arabic_marks(text):
@ -86,53 +116,53 @@ def remove_arabic_marks(text):
return ARABIC_MARK_RE.sub('', text)
mecab_tokenize = None
jieba_tokenize = None
def tokenize(text, lang):
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
"""
Tokenize this text in a way that's relatively simple but appropriate for
the language.
the language. Strings that are looked up in wordfreq will be run through
this function first, so that they can be expected to match the data.
So far, this means:
Here is what the tokenizer will do, depending on the language:
- Chinese will be mapped to Simplified Chinese characters and tokenized
using the Jieba tokenizer, trained on a custom word list of words that
can be looked up in wordfreq.
- Japanese will be delegated to the external mecab-python module. It will
be NFKC normalized, which is stronger than NFC normalization.
- Chinese is presumed to already be tokenized. (Sorry. It's hard.)
- Japanese will be delegated to the external mecab-python module.
- Chinese or Japanese texts that aren't identified as the appropriate
language will only split on punctuation and script boundaries, giving
you untokenized globs of characters that probably represent many words.
- Arabic will be NFKC normalized, and will have Arabic-specific combining
marks and tatweels removed.
- Languages written in cased alphabets will be case-folded to lowercase.
- Turkish will use a different case-folding procedure, so that capital
I and İ map to ı and i respectively.
- All other languages will be tokenized using a regex that mostly
implements the Word Segmentation section of Unicode Annex #29.
See `simple_tokenize` for details.
Additionally, the text will be case-folded to lowercase, and text marked
as Arabic will be normalized more strongly and have combining marks and
tatweels removed.
- Languages besides Japanese and Chinese will be tokenized using a regex
that mostly implements the Word Segmentation section of Unicode Annex
#29. See `simple_tokenize` for details.
Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data.
The `external_wordlist` option only affects Chinese tokenization. If it's
True, then wordfreq will not use its own Chinese wordlist for tokenization.
Instead, it will use the large wordlist packaged with the Jieba tokenizer,
and it will leave Traditional Chinese characters as is. This will probably
give more accurate tokenization, but the resulting tokens won't necessarily
have word frequencies that can be looked up.
"""
if lang == 'ja':
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
tokens = mecab_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'zh':
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
tokens = jieba_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'tr':
return turkish_tokenize(text)
if lang == 'ar':
return japanese_tokenize(text, include_punctuation)
elif lang == 'zh':
return chinese_tokenize(text, include_punctuation, external_wordlist)
elif lang == 'tr':
return turkish_tokenize(text, include_punctuation)
elif lang == 'ar':
text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
return simple_tokenize(text)
return simple_tokenize(text, include_punctuation)
else:
return simple_tokenize(text, include_punctuation)
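At the public tokenize level, the same keyword arguments are available for every
language, and the Chinese path additionally case-folds and filters the tokens
that jieba_tokenize returns. A hedged sketch (the Turkish result is the expected
case-folding; the Chinese segmentations are expectations based on the tests,
not guarantees):

    from wordfreq import tokenize

    # Turkish case-folding: capital İ maps to dotted i, so this should
    # come out as ['istanbul'].
    print(tokenize('İstanbul', 'tr'))

    # Chinese, default: mapped to Simplified characters and segmented with
    # wordfreq's own wordlist, so the tokens can be looked up.
    sentence = '他是历史上第六位在任期内去世的美国副总统'
    print(tokenize(sentence, 'zh'))

    # With external_wordlist=True: Jieba's own dictionary, HMM enabled,
    # characters left as-is; some tokens may have no frequency in wordfreq.
    print(tokenize(sentence, 'zh', external_wordlist=True))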

View File

@ -63,7 +63,7 @@ rule convert_opensubtitles
# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
# run it through ftfy, convert tabs to commas and spurious CSV formatting to
# and remove lines with unfixable half-mojibake.
# spaces, and remove lines with unfixable half-mojibake.
rule convert_subtlex
command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
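For reference, a rough Python sketch of what this pipeline does; the function
name and the handling of the tab character are assumptions for illustration,
and the real build uses the shell command above:

    import ftfy  # the same text-fixing library the pipeline invokes

    def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
        """Hypothetical re-implementation of the convert_subtlex rule."""
        with open(in_path, encoding='utf-8', errors='replace') as infile, \
             open(out_path, 'w', encoding='utf-8') as outfile:
            for lineno, line in enumerate(infile, 1):
                if lineno < startrow:
                    continue                  # tail -n +$startrow: skip header rows
                cols = line.rstrip('\n').split('\t')
                picked = '\t'.join([cols[textcol - 1], cols[freqcol - 1]])  # cut -f
                fixed = ftfy.fix_text(picked)  # ftfy: repair mojibake
                # tr: tabs become commas; stray quotes and commas become spaces
                fixed = fixed.translate(str.maketrans('\t",', ',  '))
                if 'â,' in fixed:
                    continue                  # grep -v 'â,': drop half-mojibake lines
                outfile.write(fixed + '\n')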