fixed bugs and removed unused code

Joshua Chin 2015-06-16 17:25:06 -04:00
parent aa0bef3fb7
commit 20bc34f224


@@ -11,7 +11,7 @@ TMPDIR = data_filename('tmp')
 # Set this to True to rebuild the Twitter tokenization (which takes days)
-PRETOKENIZE_TWITTER = False
+TOKENIZE_TWITTER = True
 def add_dep(lines, rule, input, output, extra=None, params=None):
@@ -31,7 +31,7 @@ def add_dep(lines, rule, input, output, extra=None, params=None):
     lines.append(build_rule)
     if params:
         for key, val in params.items():
-            lines.append(" {key} = {val}".format(locals()))
+            lines.append(" {key} = {val}".format(key=key, val=val))
     lines.append("")
@@ -49,9 +49,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
-    if PRETOKENIZE_TWITTER:
+    if TOKENIZE_TWITTER:
         lines.extend(
-            twitter_preprocess_deps(
+            twitter_deps(
                 data_filename('raw-input/twitter/all-2014.txt'),
                 slice_prefix=data_filename('slices/twitter/tweets-2014'),
                 combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
@@ -59,12 +59,6 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
                 languages=CONFIG['sources']['twitter']
             )
         )
-    lines.extend(
-        twitter_deps(
-            data_filename('intermediate/twitter/tweets-2014'),
-            languages=CONFIG['sources']['twitter']
-        )
-    )
     lines.extend(
         wikipedia_deps(
             data_filename('raw-input/wikipedia'),
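
The lists that these lines.extend(...) calls splice together are ninja build statements produced by add_dep. A simplified stand-in, not part of the diff, to show roughly what they contain; the 'build <outputs>: <rule> <inputs>' template is standard ninja syntax assumed for illustration, since the real template is not shown in this commit:

# Hypothetical simplified version of add_dep, for illustration only.
def add_dep_sketch(lines, rule, input, output, extra=None, params=None):
    outputs = output if isinstance(output, str) else ' '.join(output)
    inputs = input if isinstance(input, str) else ' '.join(input)
    lines.append('build {}: {} {}'.format(outputs, rule, inputs))
    if params:
        for key, val in params.items():
            lines.append(" {key} = {val}".format(key=key, val=val))
    lines.append("")

lines = []
add_dep_sketch(lines, 'split', 'all-2014.txt',
               ['tweets-2014.part00', 'tweets-2014.part01'],
               params={'prefix': 'tweets-2014.part', 'slices': 2})
print('\n'.join(lines))
# build tweets-2014.part00 tweets-2014.part01: split all-2014.txt
#  prefix = tweets-2014.part
#  slices = 2
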
@@ -131,7 +125,7 @@ def google_books_deps(dirname_in):
     return lines
-def twitter_preprocess_deps(input_filename, slice_prefix,
+def twitter_deps(input_filename, slice_prefix,
                             combined_prefix, slices, languages):
     lines = []
@@ -140,7 +134,7 @@ def twitter_preprocess_deps(input_filename, slice_prefix,
     # split the input into slices
     add_dep(lines,
             'split', input_filename, slice_files,
-            {'prefix': '{}.part'.format(slice_prefix),
+            params={'prefix': '{}.part'.format(slice_prefix),
              'slices': slices})
     for slicenum in range(slices):
@@ -150,7 +144,7 @@ def twitter_preprocess_deps(input_filename, slice_prefix,
             for language in languages
         ]
         add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
-                {'prefix': slice_file})
+                params={'prefix': slice_file})
     for language in languages:
         combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language)
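
The two params= fixes above matter because of add_dep's signature, shown in the first hunk: a dict passed as the fifth positional argument binds to extra, not params, so the old calls silently dropped the ninja variables. A quick demonstration, not part of the diff, using a throwaway function with the same signature and placeholder argument values:

# Throwaway function with add_dep's signature, to show where a positional dict lands.
def binding_demo(lines, rule, input, output, extra=None, params=None):
    return {'extra': extra, 'params': params}

old_call = binding_demo([], 'split', 'in.txt', 'out.txt',
                        {'prefix': 'tweets.part', 'slices': 2})
new_call = binding_demo([], 'split', 'in.txt', 'out.txt',
                        params={'prefix': 'tweets.part', 'slices': 2})
print(old_call)   # {'extra': {...}, 'params': None}  -> variables never emitted
print(new_call)   # {'extra': None, 'params': {...}}  -> fixed
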
@@ -163,26 +157,6 @@ def twitter_preprocess_deps(input_filename, slice_prefix,
     return lines
-def twitter_deps(prefix_in, languages):
-    lines = []
-    for language in languages:
-        input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language)
-        token_file = wordlist_filename('twitter', language, 'tokens.txt')
-        add_dep(lines,
-                'format_twitter', input_file, token_file,
-                extra='wordfreq_builder/tokenizers.py')
-        count_file = wordlist_filename('twitter', language, 'counts.txt')
-        if language == 'ja':
-            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-        else:
-            add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-    return lines
 def leeds_deps(dirname_in, languages):
     lines = []
     for language in languages: