mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
WIP: Traditional Chinese
This commit is contained in:
parent
3c3371a9ff
commit
7906a671ea
20
README.md
20
README.md
@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources
|
||||
|
||||
- **GBooks**: Google Books Ngrams 2013
|
||||
- **LeedsIC**: The Leeds Internet Corpus
|
||||
- **OpenSub**: OpenSubtitles
|
||||
- **SUBTLEX**: The SUBTLEX word frequency lists
|
||||
- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
|
||||
- **Twitter**: Messages sampled from Twitter's public stream
|
||||
- **Wikipedia**: The full text of Wikipedia in 2015
|
||||
|
||||
The following 16 languages are well-supported, with reasonable tokenization and
|
||||
at least 3 different sources of word frequencies:
|
||||
|
||||
Language Code GBooks SUBTLEX LeedsIC OpenSub Twitter Wikipedia
|
||||
Language Code GBooks SUBTLEX OpenSub LeedsIC Twitter Wikipedia
|
||||
──────────────────┼──────────────────────────────────────────────────
|
||||
Arabic ar │ - - Yes Yes Yes Yes
|
||||
German de │ - Yes Yes Yes Yes[1] Yes
|
||||
German de │ - Yes - Yes Yes[1] Yes
|
||||
Greek el │ - - Yes Yes Yes Yes
|
||||
English en │ Yes Yes Yes Yes Yes Yes
|
||||
Spanish es │ - - Yes Yes Yes Yes
|
||||
French fr │ - - Yes Yes Yes Yes
|
||||
Indonesian id │ - - - Yes Yes Yes
|
||||
Indonesian id │ - - Yes - Yes Yes
|
||||
Italian it │ - - Yes Yes Yes Yes
|
||||
Japanese ja │ - - Yes - Yes Yes
|
||||
Malay ms │ - - - Yes Yes Yes
|
||||
Dutch nl │ - Yes - Yes Yes Yes
|
||||
Polish pl │ - - - Yes Yes Yes
|
||||
Japanese ja │ - - - Yes Yes Yes
|
||||
Malay ms │ - - Yes - Yes Yes
|
||||
Dutch nl │ - Yes Yes - Yes Yes
|
||||
Polish pl │ - - Yes - Yes Yes
|
||||
Portuguese pt │ - - Yes Yes Yes Yes
|
||||
Russian ru │ - - Yes Yes Yes Yes
|
||||
Swedish sv │ - - - Yes Yes Yes
|
||||
Turkish tr │ - - - Yes Yes Yes
|
||||
Swedish sv │ - - Yes - Yes Yes
|
||||
Turkish tr │ - - Yes - Yes Yes
|
||||
|
||||
These languages are only marginally supported so far. We have too few data
|
||||
sources so far in Korean (feel free to suggest some), and we are lacking
|
||||
|
37
scripts/make_chinese_mapping.py
Normal file
37
scripts/make_chinese_mapping.py
Normal file
@ -0,0 +1,37 @@
|
||||
import unicodedata
|
||||
import itertools
|
||||
import os
|
||||
import pprint
|
||||
|
||||
|
||||
def make_hanzi_table(filename):
    """
    Write a table of Han-range characters to `filename`, one per line.

    Every code point in the three hard-coded ranges below is considered, and
    those whose Unicode general category is 'Cn' (unassigned) are left out.
    Each line is the code point in uppercase hex, right-aligned to five
    columns, followed by a tab and the character itself. The file is written
    as UTF-8.
    """
    han_ranges = (
        range(0x3400, 0xa000),
        range(0xf900, 0xfb00),
        range(0x20000, 0x30000),
    )
    with open(filename, 'w', encoding='utf-8') as out:
        for block in han_ranges:
            for codept in block:
                char = chr(codept)
                if unicodedata.category(char) == 'Cn':
                    # Unassigned code point: nothing to convert.
                    continue
                out.write('%5X\t%s\n' % (codept, char))
|
||||
|
||||
|
||||
def make_hanzi_converter(table_in, python_out):
    """
    Turn a converted character table into a Python module.

    `table_in` is a UTF-8 file of lines shaped like `<hex code point>\\t<char>`.
    For every line where the character differs from the one the code point
    names, the pair (code point -> character) is recorded. The resulting dict
    is pretty-printed to `python_out` as the value of `SIMPLIFIED_MAP`.

    Each line must hold exactly one character after the tab; anything else
    trips the assertion.
    """
    changed = {}
    with open(table_in, encoding='utf-8') as infile:
        for row in infile:
            fields = row.rstrip('\n').split('\t')
            hexcode, char = fields
            codept = int(hexcode, 16)
            assert len(char) == 1
            # Only characters that the conversion actually altered are kept.
            if char != chr(codept):
                changed[codept] = char

    with open(python_out, 'w', encoding='utf-8') as outfile:
        outfile.write('SIMPLIFIED_MAP = ')
        pprint.pprint(changed, stream=outfile)
|
||||
|
||||
|
||||
def build():
    """
    Regenerate `_chinese_mapping.py` in the current working directory.

    Steps:

    1. Dump the table of Han-range characters with `make_hanzi_table`.
    2. Pipe that table through the external `opencc` command, using the
       `zht2zhs.ini` configuration.
    3. Convert opencc's output into a Python module with
       `make_hanzi_converter`.

    Intermediate files live in a private temporary directory that is removed
    afterwards, rather than at fixed paths under /tmp.

    Raises:
        FileNotFoundError: if the `opencc` executable is not on the PATH.
        subprocess.CalledProcessError: if `opencc` exits with a non-zero
            status. Failing here prevents a truncated or empty mapping from
            being written silently.
    """
    # Local imports: only this function needs these modules.
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        han_in = os.path.join(workdir, 'han_in.txt')
        han_out = os.path.join(workdir, 'han_out.txt')

        make_hanzi_table(han_in)

        # Equivalent to `opencc -c zht2zhs.ini < han_in > han_out`, but with
        # no shell involved and with the exit status checked.
        with open(han_in, 'rb') as source, open(han_out, 'wb') as target:
            subprocess.check_call(
                ['opencc', '-c', 'zht2zhs.ini'],
                stdin=source,
                stdout=target,
            )

        make_hanzi_converter(han_out, '_chinese_mapping.py')


if __name__ == '__main__':
    build()
|
||||
|
3275
wordfreq/_chinese_mapping.py
Normal file
3275
wordfreq/_chinese_mapping.py
Normal file
File diff suppressed because it is too large
Load Diff
19
wordfreq/chinese.py
Normal file
19
wordfreq/chinese.py
Normal file
@ -0,0 +1,19 @@
|
||||
from pkg_resources import resource_filename
|
||||
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
|
||||
import jieba
|
||||
|
||||
|
||||
jieba_initialized = False
|
||||
|
||||
|
||||
def simplify_chinese(text):
    """
    Return `text` with its characters mapped through SIMPLIFIED_MAP and the
    result case-folded.

    Characters that have no entry in SIMPLIFIED_MAP pass through the
    translation step unchanged.
    """
    translated = text.translate(SIMPLIFIED_MAP)
    return translated.casefold()
|
||||
|
||||
|
||||
def chinese_tokenize(text):
    """
    Split Chinese `text` into a list of tokens using jieba.

    The text is first passed through `simplify_chinese`, so the tokens that
    come back are in that normalized form rather than the original spelling.

    On the first call, jieba is pointed at the dictionary file
    `data/jieba.txt` inside the `wordfreq` package; the module-level flag
    `jieba_initialized` records that this has been done so later calls skip
    the setup.
    """
    global jieba_initialized
    if not jieba_initialized:
        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
        jieba_initialized = True
    return list(jieba.cut(simplify_chinese(text)))


# The `tokenize(text, lang)` dispatcher does
# `from wordfreq.chinese import jieba_tokenize` for lang == 'zh'. Without this
# alias that import fails, because this module would only define
# `chinese_tokenize`.
jieba_tokenize = chinese_tokenize
|
||||
|
@ -1,5 +1,6 @@
|
||||
import regex
|
||||
import unicodedata
|
||||
from pkg_resources import resource_filename
|
||||
|
||||
|
||||
TOKEN_RE = regex.compile(r"""
|
||||
@ -87,6 +88,7 @@ def remove_arabic_marks(text):
|
||||
|
||||
|
||||
mecab_tokenize = None
|
||||
jieba_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
"""
|
||||
Tokenize this text in a way that's relatively simple but appropriate for
|
||||
@ -115,9 +117,15 @@ def tokenize(text, lang):
|
||||
if lang == 'ja':
|
||||
global mecab_tokenize
|
||||
if mecab_tokenize is None:
|
||||
from wordfreq.mecab import mecab_tokenize
|
||||
from wordfreq.japanese import mecab_tokenize
|
||||
return mecab_tokenize(text)
|
||||
|
||||
if lang == 'zh':
|
||||
global jieba_tokenize
|
||||
if jieba_tokenize is None:
|
||||
from wordfreq.chinese import jieba_tokenize
|
||||
return jieba_tokenize(text)
|
||||
|
||||
if lang == 'tr':
|
||||
return turkish_tokenize(text)
|
||||
|
||||
|
@ -32,10 +32,15 @@ rule wiki2text
|
||||
command = bunzip2 -c $in | wiki2text > $out
|
||||
|
||||
# To tokenize Japanese, we run it through Mecab and take the first column.
|
||||
# We don't have a plan for tokenizing Chinese yet.
|
||||
rule tokenize_japanese
|
||||
command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
|
||||
|
||||
# Process Chinese by converting all Traditional Chinese characters to
|
||||
# Simplified equivalents -- not because that's a good way to get readable
|
||||
# text, but because that's how we're going to look them up.
|
||||
rule simplify_chinese
|
||||
command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
|
||||
|
||||
# Tokenizing text from Twitter requires us to language-detect and tokenize
|
||||
# in the same step.
|
||||
rule tokenize_twitter
|
||||
|
11
wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py
Normal file
11
wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py
Normal file
@ -0,0 +1,11 @@
|
||||
from wordfreq.chinese import simplify_chinese
|
||||
import sys
|
||||
|
||||
|
||||
def main():
    """
    Act as a filter: read lines from standard input, pass each one through
    `simplify_chinese`, and write the results to standard output.
    """
    sys.stdout.writelines(map(simplify_chinese, sys.stdin))


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user