Merge pull request #28 from LuminosoInsight/chinese-external-wordlist

Add some tokenizer options

Commit: ca00dfa1d9

README.md (19 lines changed):
@@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:

     python3 setup.py install

-To handle word frequency lookups in Japanese, you need to additionally install
-mecab-python3, which itself depends on libmecab-dev. These commands will
-install them on Ubuntu:
+Japanese and Chinese have additional external dependencies so that they can be
+tokenized correctly.
+
+To be able to look up word frequencies in Japanese, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+These commands will install them on Ubuntu:

     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3

+To be able to look up word frequencies in Chinese, you need Jieba, a
+pure-Python Chinese tokenizer:
+
+    pip3 install jieba
+
+These dependencies can also be requested as options when installing wordfreq.
+For example:
+
+    pip3 install wordfreq[mecab,jieba]
+
 ## Usage

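The bracketed extras above imply an `extras_require` entry in setup.py. The setup.py change itself is not shown in this diff, so the following is only a sketch of how such an entry is typically wired up; the extras names come from the README text, and everything else is an assumption.

    # Sketch only: how 'pip3 install wordfreq[mecab,jieba]' is usually made to work.
    # The extras names ('mecab', 'jieba') come from the README above; the dependency
    # lists and the rest of the setup() call are assumptions, not part of this diff.
    from setuptools import setup

    setup(
        name='wordfreq',
        packages=['wordfreq'],
        extras_require={
            'mecab': ['mecab-python3'],  # optional Japanese tokenization support
            'jieba': ['jieba'],          # optional Chinese tokenization support
        },
    )
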
Tests:

@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

         # Make up a weirdly verbose language code and make sure
         # we still get it
         new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('lol', new_lang_code), 0)
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@@ -98,6 +100,9 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),

@@ -108,6 +113,9 @@ def test_tokenization():

     eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
+        ]
+    )
+
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
         ]
     )

wordfreq/chinese.py:

@@ -4,9 +4,11 @@ import msgpack
 import gzip

 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
 jieba_tokenizer = None
+jieba_orig_tokenizer = None


 def simplify_chinese(text):

@@ -23,17 +25,28 @@ def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
+def jieba_tokenize(text, external_wordlist=False):
     """
     Tokenize the given text into tokens whose word frequencies can probably
     be looked up. This uses Jieba, a word-frequency-based tokenizer.

-    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
-    to infer unknown words using a hidden Markov model. This ensures that the
-    multi-character tokens that it outputs will be ones whose word frequencies
-    we can look up.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
    """
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
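For reference, a minimal usage sketch of the two branches added above. The sentence is reassembled from the expected tokens in the Chinese test earlier in this diff, and the commented results mirror what that test asserts for `tokenize(..., 'zh')`; they are not separate output captures.

    # Sketch: exercising both branches of jieba_tokenize from this diff.
    from wordfreq.chinese import jieba_tokenize

    # Simplified-Chinese sentence reassembled from the test's expected tokens.
    fact = '他是历史上第六位在任期内去世的美国副总统'

    # Default branch: wordfreq's own wordlist, HMM off, every token can be looked up.
    jieba_tokenize(fact)
    # e.g. ['他', '是', '历史上', '第六', '位', '在', '任期', '内', '去世', '的', '美国', '副', '总统']

    # external_wordlist branch: Jieba's original dictionary, HMM on, segmentation differs slightly.
    jieba_tokenize(fact, external_wordlist=True)
    # e.g. ['他', '是', '历史', '上', '第六位', '在', '任期', '内', '去世', '的', '美国', '副', '总统']
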
wordfreq/data/jieba_zh_orig.txt: new file, 349,046 lines (diff suppressed because it is too large).
wordfreq/tokens.py:

@@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.

@@ -56,22 +62,46 @@ def simple_tokenize(text):
     ideograms and hiragana) relatively untokenized, instead of splitting each
     character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+mecab_tokenize = None
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+jieba_tokenize = None
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_arabic_marks(text):
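The effect of the new `include_punctuation` flag is easiest to see with the English cases from the test changes earlier in this diff. A short sketch, assuming `tokenize` is importable from the top-level wordfreq package as the tests use it; the commented results are the ones those tests assert.

    # Sketch: the include_punctuation flag added in this diff, using the
    # inputs and expected outputs from the tests above.
    from wordfreq import tokenize

    tokenize("I don't split at apostrophes, you see.", 'en')
    # ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

    tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
    # ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']

    tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
    # ['this', 'text', 'has', '...', 'punctuation', ':)']
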
@@ -86,53 +116,53 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)


-mecab_tokenize = None
-jieba_tokenize = None
-
-def tokenize(text, lang):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
-    the language.
+    the language. Strings that are looked up in wordfreq will be run through
+    this function first, so that they can be expected to match the data.

-    So far, this means:
+    Here is what the tokenizer will do, depending on the language:

-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
-    - Japanese will be delegated to the external mecab-python module.
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the Jieba tokenizer, trained on a custom word list of words that
+      can be looked up in wordfreq.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
+
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
       you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
     - Turkish will use a different case-folding procedure, so that capital
       I and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.

-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.

-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
+    The `external_wordlist` option only affects Chinese tokenization. If it's
+    True, then wordfreq will not use its own Chinese wordlist for tokenization.
+    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
+    and it will leave Traditional Chinese characters as is. This will probably
+    give more accurate tokenization, but the resulting tokens won't necessarily
+    have word frequencies that can be looked up.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)

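Taken together, the dispatch above gives every language the same call signature. A brief sketch of the routing; the German line repeats a case asserted in the tests above, while the Turkish, Arabic, and Chinese inputs are purely illustrative and their exact outputs are not claimed here.

    # Sketch: the per-language dispatch in tokenize() after this change.
    from wordfreq import tokenize

    tokenize('WEISS', 'de')      # ['weiss']: cased alphabets are case-folded (asserted in the tests)
    tokenize('KIRMIZI', 'tr')    # Turkish-specific folding maps capital I to dotless ı (illustrative input)
    tokenize('مرحبا', 'ar')      # Arabic is NFKC-normalized and stripped of marks/tatweels, then simple_tokenize
    tokenize('谢谢你', 'zh', external_wordlist=True)   # Chinese routed through Jieba's own dictionary (illustrative)
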
Ninja build rules:

@@ -63,7 +63,7 @@ rule convert_opensubtitles

 # To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
 # run it through ftfy, convert tabs to commas and spurious CSV formatting to
-# and remove lines with unfixable half-mojibake.
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
   command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
