Merge pull request #28 from LuminosoInsight/chinese-external-wordlist

Add some tokenizer options
Lance Nathan 2015-10-19 18:21:52 -04:00
commit ca00dfa1d9
7 changed files with 349200 additions and 69 deletions

View File

@@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:

     python3 setup.py install

-To handle word frequency lookups in Japanese, you need to additionally install
-mecab-python3, which itself depends on libmecab-dev. These commands will
-install them on Ubuntu:
+Japanese and Chinese have additional external dependencies so that they can be
+tokenized correctly.
+
+To be able to look up word frequencies in Japanese, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+These commands will install them on Ubuntu:

     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3

+To be able to look up word frequencies in Chinese, you need Jieba, a
+pure-Python Chinese tokenizer:
+
+    pip3 install jieba
+
+These dependencies can also be requested as options when installing wordfreq.
+For example:
+
+    pip3 install wordfreq[mecab,jieba]
+
 ## Usage
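The setup.py change that registers these extras isn't shown in this view, but for the `wordfreq[mecab,jieba]` syntax above to work, the optional dependencies would presumably be declared with setuptools' `extras_require`, roughly as sketched below. The extra names match the pip command in the README diff; everything else in this sketch is an assumption, not taken from this diff.

```python
# Hypothetical sketch of the extras declaration in setup.py; the real file is
# presumably changed in this PR but is not shown in this view.
from setuptools import setup

setup(
    name='wordfreq',
    # ... other metadata and install_requires elided ...
    extras_require={
        'mecab': ['mecab-python3'],  # enables Japanese tokenization via MeCab
        'jieba': ['jieba'],          # enables Chinese tokenization via Jieba
    },
)
```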

View File

@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

         # Make up a weirdly verbose language code and make sure
         # we still get it
         new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('lol', new_lang_code), 0)
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),
@@ -108,6 +113,9 @@ def test_tokenization():
     eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])

View File

@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
+        ]
+    )
+
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
         ]
     )

View File

@@ -4,9 +4,11 @@ import msgpack
 import gzip

 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
 jieba_tokenizer = None
+jieba_orig_tokenizer = None


 def simplify_chinese(text):
@@ -23,17 +25,28 @@ def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
+def jieba_tokenize(text, external_wordlist=False):
     """
     Tokenize the given text into tokens whose word frequencies can probably
     be looked up. This uses Jieba, a word-frequency-based tokenizer.

-    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
-    to infer unknown words using a hidden Markov model. This ensures that the
-    multi-character tokens that it outputs will be ones whose word frequencies
-    we can look up.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
     """
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
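As a quick illustration of the two branches above, both code paths can be exercised as sketched below. The sample sentence is arbitrary, and the resulting token boundaries will differ between the two wordlists, as the test_chinese.py changes earlier in this diff demonstrate.

```python
# Illustrative only: call the function defined above through its package path.
from wordfreq.chinese import jieba_tokenize

text = '他是美国副总统'  # example sentence: "He is the U.S. vice president"

# Default branch: wordfreq's own wordlist, simplified characters, HMM off.
print(jieba_tokenize(text))

# external_wordlist branch: Jieba's original dictionary, HMM on, text as-is.
print(jieba_tokenize(text, external_wordlist=True))
```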

wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines added)

File diff suppressed because it is too large

View File

@@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -56,22 +62,46 @@ def simple_tokenize(text):
     ideograms and hiragana) relatively untokenized, instead of splitting each
     character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+mecab_tokenize = None
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+jieba_tokenize = None
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_arabic_marks(text):
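To make the `include_punctuation` switch concrete, here is a minimal sketch of calling `simple_tokenize` both ways. The module path `wordfreq.tokens` is assumed, since the diff view doesn't name the file, and the exact outputs depend on the regex module's word segmentation (the tokenization tests earlier in this diff show the expected shape).

```python
# Minimal sketch; the module path wordfreq.tokens is an assumption.
from wordfreq.tokens import simple_tokenize

text = 'this text has... punctuation :)'

# Default: tokens made only of punctuation are dropped.
print(simple_tokenize(text))

# include_punctuation=True: runs of punctuation become tokens of their own.
print(simple_tokenize(text, include_punctuation=True))
```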
@@ -86,53 +116,53 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)


-mecab_tokenize = None
-jieba_tokenize = None
-def tokenize(text, lang):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
-    the language.
+    the language. Strings that are looked up in wordfreq will be run through
+    this function first, so that they can be expected to match the data.

-    So far, this means:
+    Here is what the tokenizer will do, depending on the language:

-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
-    - Japanese will be delegated to the external mecab-python module.
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the Jieba tokenizer, trained on a custom word list of words that
+      can be looked up in wordfreq.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
+
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
     - Turkish will use a different case-folding procedure, so that capital
       I and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.

-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.

-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
+    The `external_wordlist` option only affects Chinese tokenization. If it's
+    True, then wordfreq will not use its own Chinese wordlist for tokenization.
+    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
+    and it will leave Traditional Chinese characters as is. This will probably
+    give more accurate tokenization, but the resulting tokens won't necessarily
+    have word frequencies that can be looked up.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)
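Putting the new keyword arguments together, the public entry point changed above can be called as in the sketch below. The top-level import is an assumption based on how the tests call `tokenize()`, and the exact tokens returned depend on the installed dictionaries.

```python
# Usage sketch for the updated tokenize() signature; the top-level import is
# assumed from how the tests use tokenize().
from wordfreq import tokenize

# English, keeping punctuation tokens such as ',' and '.'
print(tokenize("I don't split at apostrophes, you see.", 'en',
               include_punctuation=True))

# Chinese, tokenized with Jieba's own large dictionary instead of wordfreq's
# wordlist; tokens found this way may not have frequencies that can be looked up.
print(tokenize('他是美国副总统', 'zh', external_wordlist=True))
```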

View File

@@ -63,7 +63,7 @@ rule convert_opensubtitles

 # To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
 # run it through ftfy, convert tabs to commas and spurious CSV formatting to
-# and remove lines with unfixable half-mojibake.
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
     command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out