mirror of https://github.com/rspeer/wordfreq.git
commit 7d1c2e72e4 (parent e77c2dbca8)

README.md (20 lines changed)
@@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources
 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
-- **OpenSub**: OpenSubtitles
 - **SUBTLEX**: The SUBTLEX word frequency lists
+- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015
 
 The following 16 languages are well-supported, with reasonable tokenization and
 at least 3 different sources of word frequencies:
 
-    Language   Code     GBooks  SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    Language   Code     GBooks  SUBTLEX OpenSub LeedsIC Twitter Wikipedia
     ──────────────────┼───────────────────────────────────────────────────
     Arabic     ar     │ -       -       Yes     Yes     Yes     Yes
-    German     de     │ -       Yes     Yes     Yes     Yes[1]  Yes
+    German     de     │ -       Yes     -       Yes     Yes[1]  Yes
     Greek      el     │ -       -       Yes     Yes     Yes     Yes
     English    en     │ Yes     Yes     Yes     Yes     Yes     Yes
     Spanish    es     │ -       -       Yes     Yes     Yes     Yes
     French     fr     │ -       -       Yes     Yes     Yes     Yes
-    Indonesian id     │ -       -       -       Yes     Yes     Yes
+    Indonesian id     │ -       -       Yes     -       Yes     Yes
     Italian    it     │ -       -       Yes     Yes     Yes     Yes
-    Japanese   ja     │ -       -       Yes     -       Yes     Yes
-    Malay      ms     │ -       -       -       Yes     Yes     Yes
-    Dutch      nl     │ -       Yes     -       Yes     Yes     Yes
-    Polish     pl     │ -       -       -       Yes     Yes     Yes
+    Japanese   ja     │ -       -       -       Yes     Yes     Yes
+    Malay      ms     │ -       -       Yes     -       Yes     Yes
+    Dutch      nl     │ -       Yes     Yes     -       Yes     Yes
+    Polish     pl     │ -       -       Yes     -       Yes     Yes
     Portuguese pt     │ -       -       Yes     Yes     Yes     Yes
     Russian    ru     │ -       -       Yes     Yes     Yes     Yes
-    Swedish    sv     │ -       -       -       Yes     Yes     Yes
-    Turkish    tr     │ -       -       -       Yes     Yes     Yes
+    Swedish    sv     │ -       -       Yes     -       Yes     Yes
+    Turkish    tr     │ -       -       Yes     -       Yes     Yes
 
 These languages are only marginally supported so far. We have too few data
 sources in Korean (feel free to suggest some), and we are lacking
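
For a sense of how these sources are used downstream, here is a minimal sketch of
querying the combined frequencies (assuming wordfreq is installed; word_frequency
is the top-level lookup function this README documents):

    from wordfreq import word_frequency

    # Frequencies are proportions of the combined corpus: a very common
    # word comes out near 1e-2, a rare one near 1e-6 or below.
    print(word_frequency('the', 'en'))
    print(word_frequency('palavra', 'pt'))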

scripts/make_chinese_mapping.py (new file, 37 lines)
@@ -0,0 +1,37 @@
import unicodedata
import itertools
import os
import pprint


def make_hanzi_table(filename):
    # Write every assigned character from the CJK ideograph ranges
    # (Extension A through the main Unified Ideographs block, the
    # Compatibility Ideographs, and the supplementary plane) as
    # "hex-codepoint<TAB>character", one per line.
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):
            char = chr(codept)
            if unicodedata.category(char) != 'Cn':
                print('%5X\t%s' % (codept, char), file=out)


def make_hanzi_converter(table_in, python_out):
    # Read the table back after OpenCC has rewritten its characters, and
    # save the entries that changed as a dict literal named SIMPLIFIED_MAP.
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with open(python_out, 'w', encoding='utf-8') as outfile:
        print('SIMPLIFIED_MAP = ', end='', file=outfile)
        pprint.pprint(table, stream=outfile)


def build():
    make_hanzi_table('/tmp/han_in.txt')
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')


if __name__ == '__main__':
    build()
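
The generated SIMPLIFIED_MAP is a dict from integer code points to replacement
characters, which is exactly the table form that str.translate accepts. A toy
sketch of the same mechanism, using a made-up two-entry table rather than the
real OpenCC output:

    # Hypothetical miniature of SIMPLIFIED_MAP: 漢 (U+6F22) -> 汉, 語 (U+8A9E) -> 语.
    tiny_map = {0x6F22: '汉', 0x8A9E: '语'}
    print('漢語'.translate(tiny_map))  # prints '汉语'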

wordfreq/_chinese_mapping.py (new file, 3275 lines)
File diff suppressed because it is too large

wordfreq/chinese.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba


jieba_initialized = False


def simplify_chinese(text):
    # Convert Traditional characters to Simplified and case-fold, so that
    # every variant of a word maps to the same lookup key.
    return text.translate(SIMPLIFIED_MAP).casefold()


def jieba_tokenize(text):
    # Point jieba at wordfreq's bundled dictionary the first time it's used.
    global jieba_initialized
    if not jieba_initialized:
        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
        jieba_initialized = True
    return list(jieba.cut(simplify_chinese(text)))
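
A quick sketch of using this module directly (assuming jieba is installed and
wordfreq's bundled data/jieba.txt is present; the sample strings are mine, and
the exact segmentation depends on jieba's dictionary):

    from wordfreq.chinese import simplify_chinese, jieba_tokenize

    print(simplify_chinese('漢語'))   # should print '汉语'
    print(jieba_tokenize('谢谢你'))   # likely ['谢谢', '你']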

wordfreq/tokens.py
@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+from pkg_resources import resource_filename
 
 
 TOKEN_RE = regex.compile(r"""
@@ -87,6 +88,7 @@ def remove_arabic_marks(text):
 
 
 mecab_tokenize = None
+jieba_tokenize = None
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
@@ -115,9 +117,15 @@ def tokenize(text, lang):
     if lang == 'ja':
         global mecab_tokenize
         if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
+            from wordfreq.japanese import mecab_tokenize
         return mecab_tokenize(text)
 
+    if lang == 'zh':
+        global jieba_tokenize
+        if jieba_tokenize is None:
+            from wordfreq.chinese import jieba_tokenize
+        return jieba_tokenize(text)
+
     if lang == 'tr':
         return turkish_tokenize(text)
 
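
The lazy imports keep MeCab and jieba (and their dictionary loads) out of the
import path until a Japanese or Chinese text actually needs them. A usage
sketch of the dispatch above (token boundaries depend on jieba's dictionary):

    from wordfreq.tokens import tokenize

    print(tokenize('这是一个测试', 'zh'))  # e.g. ['这', '是', '一个', '测试']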

wordfreq_builder/rules.ninja
@@ -32,10 +32,15 @@ rule wiki2text
   command = bunzip2 -c $in | wiki2text > $out
 
 # To tokenize Japanese, we run it through Mecab and take the first column.
-# We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
   command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 
+# Process Chinese by converting all Traditional Chinese characters to
+# Simplified equivalents -- not because that's a good way to get readable
+# text, but because that's how we're going to look them up.
+rule simplify_chinese
+  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
+
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
 rule tokenize_twitter

wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from wordfreq.chinese import simplify_chinese
import sys


def main():
    # Run as a stdin-to-stdout filter, as the simplify_chinese ninja rule does.
    for line in sys.stdin:
        sys.stdout.write(simplify_chinese(line))


if __name__ == '__main__':
    main()
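
To sanity-check the filter end to end, one can pipe a line through the module
the same way the ninja rule does. A sketch using subprocess (assuming both
wordfreq and wordfreq_builder are importable; the sample text is mine):

    import subprocess

    # Equivalent to: echo '漢語' | python -m wordfreq_builder.cli.simplify_chinese
    result = subprocess.run(
        ['python', '-m', 'wordfreq_builder.cli.simplify_chinese'],
        input='漢語\n', capture_output=True, text=True,
    )
    print(result.stdout)  # expected: '汉语'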