Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00

Merge pull request #28 from LuminosoInsight/chinese-external-wordlist: "Add some tokenizer options" (commit ca00dfa1d9)

README.md (19 lines changed)
@@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:

     python3 setup.py install

-To handle word frequency lookups in Japanese, you need to additionally install
-mecab-python3, which itself depends on libmecab-dev. These commands will
-install them on Ubuntu:
+Japanese and Chinese have additional external dependencies so that they can be
+tokenized correctly.
+
+To be able to look up word frequencies in Japanese, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+These commands will install them on Ubuntu:

     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3

+To be able to look up word frequencies in Chinese, you need Jieba, a
+pure-Python Chinese tokenizer:
+
+    pip3 install jieba
+
+These dependencies can also be requested as options when installing wordfreq.
+For example:
+
+    pip3 install wordfreq[mecab,jieba]
+

 ## Usage
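Not part of the diff: a minimal usage sketch of what these optional dependencies enable, assuming `word_frequency` and `tokenize` are importable from the top-level wordfreq package as in the tests below; the exact values depend on the packaged data.

    # Sketch only: assumes `pip3 install wordfreq[mecab,jieba]` has been run.
    from wordfreq import word_frequency, tokenize

    # Japanese lookups are tokenized by MeCab before the wordlist is consulted.
    print(word_frequency('おはよう', 'ja'))

    # Chinese lookups are tokenized by Jieba, with Traditional characters
    # mapped to the Simplified forms used in wordfreq's data.
    print(word_frequency('谢谢', 'zh'))
    print(tokenize('谢谢你', 'zh'))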
@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

-            # Make up a weirdly verbose language code and make sure
-            # we still get it
-            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),

@@ -108,6 +113,9 @@ def test_tokenization():

     eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
         ]
     )

+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
@@ -4,9 +4,11 @@ import msgpack
 import gzip

 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
 jieba_tokenizer = None
+jieba_orig_tokenizer = None


 def simplify_chinese(text):

@@ -23,17 +25,28 @@ def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
+def jieba_tokenize(text, external_wordlist=False):
     """
     Tokenize the given text into tokens whose word frequencies can probably
     be looked up. This uses Jieba, a word-frequency-based tokenizer.

-    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
-    to infer unknown words using a hidden Markov model. This ensures that the
-    multi-character tokens that it outputs will be ones whose word frequencies
-    we can look up.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
     """
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
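Not part of the diff: a minimal sketch of the two modes of `jieba_tokenize`, assuming it is imported from `wordfreq.chinese` as the tokenizer module does; token boundaries depend on the packaged dictionaries. Keeping two lazily created `jieba.Tokenizer` instances means neither dictionary is loaded until its mode is first used.

    # Sketch only: compare wordfreq's restricted wordlist with Jieba's own.
    from wordfreq.chinese import jieba_tokenize

    text = '他是在任期内去世的美国副总统'

    # Default mode: simplified mapping, wordfreq's wordlist, HMM disabled,
    # so multi-character tokens should be ones wordfreq can look up.
    print(jieba_tokenize(text))

    # External mode: Jieba's original dictionary with HMM enabled; tokens may
    # read more naturally but aren't guaranteed to be in wordfreq's data.
    print(jieba_tokenize(text, external_wordlist=True))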
wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines)
File diff suppressed because it is too large.
@@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
@@ -56,22 +62,46 @@ def simple_tokenize(text):
     ideograms and hiragana) relatively untokenized, instead of splitting each
     character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


+mecab_tokenize = None
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+jieba_tokenize = None
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_arabic_marks(text):
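Not part of the diff: a small sketch of what the punctuation-aware pattern matches, re-compiling the same pattern text as `TOKEN_RE_WITH_PUNCTUATION` above (requires the `regex` library). Note that `japanese_tokenize` and `chinese_tokenize` import mecab-python3 and Jieba only on first use, so those dependencies stay optional.

    import regex

    TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
        [\p{IsIdeo}\p{Script=Hiragana}]+ |
        [\p{punct}]+ |
        \S(?:\B\S)*
    """, regex.V1 | regex.WORD | regex.VERBOSE)

    # Punctuation runs become their own tokens instead of being dropped;
    # compare the test above expecting ['this', 'text', 'has', '...',
    # 'punctuation', ':)'] after case-folding.
    print(TOKEN_RE_WITH_PUNCTUATION.findall('this text has... punctuation :)'))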
@@ -86,53 +116,53 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)


-mecab_tokenize = None
-jieba_tokenize = None
-def tokenize(text, lang):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
-    the language.
-
-    So far, this means:
-
-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
-    - Japanese will be delegated to the external mecab-python module.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.
-
-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
+    the language. Strings that are looked up in wordfreq will be run through
+    this function first, so that they can be expected to match the data.
+
+    Here is what the tokenizer will do, depending on the language:
+
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the Jieba tokenizer, trained on a custom word list of words that
+      can be looked up in wordfreq.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
+
+    - Chinese or Japanese texts that aren't identified as the appropriate
+      language will only split on punctuation and script boundaries, giving
+      you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
+    - Turkish will use a different case-folding procedure, so that capital
+      I and İ map to ı and i respectively.
+
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.
+
+    The `external_wordlist` option only affects Chinese tokenization. If it's
+    True, then wordfreq will not use its own Chinese wordlist for tokenization.
+    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
+    and it will leave Traditional Chinese characters as is. This will probably
+    give more accurate tokenization, but the resulting tokens won't necessarily
+    have word frequencies that can be looked up.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)
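Not part of the diff: a short sketch of the new keyword options on the public `tokenize` entry point, drawn from the tests above; outputs depend on the installed tokenizers and dictionaries.

    from wordfreq import tokenize

    # include_punctuation keeps punctuation runs as tokens; the test above
    # expects ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'].
    print(tokenize("I don't split at apostrophes, you see.", 'en',
                   include_punctuation=True))

    # Sentence rebuilt from the tokens expected in the Chinese test above.
    fact = '他是历史上第六位在任期内去世的美国副总统'

    # With wordfreq's own wordlist the test expects '历史上' and '第六' / '位';
    # with external_wordlist=True it expects '历史' / '上' and '第六位'.
    print(tokenize(fact, 'zh'))
    print(tokenize(fact, 'zh', external_wordlist=True))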
@@ -63,7 +63,7 @@ rule convert_opensubtitles

 # To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
 # run it through ftfy, convert tabs to commas and spurious CSV formatting to
-# and remove lines with unfixable half-mojibake.
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
   command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out