Merge pull request #4 from LuminosoInsight/tokenization-cleanup

remove wiki2tokens and tokenize_wikipedia
This commit is contained in:
Joshua Chin 2015-07-01 11:34:30 -04:00
commit 34a886feaa
5 changed files with 7 additions and 46 deletions

View File

@ -31,16 +31,6 @@ rule split
rule wiki2text rule wiki2text
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
# The wiki2tokens rule is the same as the wiki2text rule, but uses the -t
# flag to tell the Nim code to output one token per line (according to its
# language-agnostic tokenizer, which splits on punctuation and whitespace in
# basically the same way as wordfreq).
#
# The fact that this uses a language-agnostic tokenizer means it should not
# be applied to Chinese or Japanese.
rule wiki2tokens
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
# To tokenize Japanese, we run it through Mecab and take the first column. # To tokenize Japanese, we run it through Mecab and take the first column.
# We don't have a plan for tokenizing Chinese yet. # We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese rule tokenize_japanese

View File

@ -14,7 +14,6 @@ setup(
'console_scripts': [ 'console_scripts': [
'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
] ]
} }

View File

@ -1,30 +0,0 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, monolingual_tokenize_file
import argparse
def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    """
    Tokenize a file of Wikipedia text by delegating to
    `monolingual_tokenize_file`.

    `in_filename` and `out_filename` are passed through unchanged, as is
    `language`. `proportion` is forwarded as `sample_proportion`; the
    command-line help for this script describes it as "process 1/n of the
    lines".

    Each line is passed through `strip_headings` (as the `line_reader`) and
    tokens are produced by `cld2_surface_tokenizer`.
    """
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=cld2_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )
def strip_headings(text):
    """
    Remove surrounding whitespace from a line, then remove any run of '='
    characters from both ends of what remains.

    Only the outer whitespace is removed: whitespace that sat between the
    '=' markers and the heading text is kept, so '== Title ==' becomes
    ' Title '. Lines without '=' at their ends are returned whitespace-
    stripped and otherwise unchanged.
    """
    return text.strip().strip('=')
def main(argv=None):
    """
    Command-line entry point: parse the arguments and run
    `tokenize_wikipedia` on them.

    `argv` is the list of argument strings to parse. When it is None (the
    default, and what the console script and the `__main__` guard use),
    argparse reads the arguments from `sys.argv`.

    Positional arguments are the input filename, the output filename, and
    the language code. The optional `-p` / `--proportion` is an integer
    that defaults to 100.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    # Passing None makes argparse fall back to sys.argv[1:], so calling
    # main() with no arguments behaves as it always has.
    args = parser.parse_args(argv)
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()

View File

@ -96,16 +96,16 @@ def wikipedia_deps(dirname_in, languages):
input_file = max(path_in.glob( input_file = max(path_in.glob(
'{}wiki*.bz2'.format(language) '{}wiki*.bz2'.format(language)
)) ))
token_file = wordlist_filename('wikipedia', language, 'tokens.txt') plain_text_file = wordlist_filename('wikipedia', language, 'txt')
count_file = wordlist_filename('wikipedia', language, 'counts.txt') count_file = wordlist_filename('wikipedia', language, 'counts.txt')
add_dep(lines, 'wiki2tokens', input_file, token_file) add_dep(lines, 'wiki2text', input_file, plain_text_file)
if language == 'ja': if language == 'ja':
mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt') mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file) add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
add_dep(lines, 'count', mecab_token_file, count_file) add_dep(lines, 'count', mecab_token_file, count_file)
else: else:
add_dep(lines, 'count', token_file, count_file) add_dep(lines, 'count', plain_text_file, count_file)
return lines return lines

View File

@ -12,9 +12,11 @@ def count_tokens(filename):
""" """
Count tokens that appear in a file, running each line through our Count tokens that appear in a file, running each line through our
simple tokenizer. simple tokenizer.
Unicode errors in the input data will become token boundaries.
""" """
counts = defaultdict(int) counts = defaultdict(int)
with open(filename, encoding='utf-8') as infile: with open(filename, encoding='utf-8', errors='replace') as infile:
for line in infile: for line in infile:
for token in simple_tokenize(line.strip()): for token in simple_tokenize(line.strip()):
counts[token] += 1 counts[token] += 1