Add 'twitter' as a final build, and add a new `dist` build directory

The `data/dist` directory is now a convenient place to find the final
built files that can be copied into wordfreq.
This commit is contained in:
Robyn Speer 2015-07-01 17:45:39 -04:00
parent 58c8bda21b
commit 3eb3e7c388
2 changed files with 14 additions and 2 deletions

View File

@@ -41,7 +41,9 @@ CONFIG = {
'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
'google-books': 'generated/google-books/google_books_{lang}.{ext}',
'combined': 'generated/combined/combined_{lang}.{ext}'
'combined': 'generated/combined/combined_{lang}.{ext}',
'combined-dist': 'dist/combined_{lang}.{ext}',
'twitter-dist': 'dist/twitter_{lang}.{ext}'
},
'min_sources': 2
}

View File

@@ -205,11 +205,21 @@ def combine_lists(languages):
add_dep(lines, 'merge', input_files, output_file,
extra='wordfreq_builder/word_counts.py')
output_cBpack = wordlist_filename('combined', language, 'msgpack.gz')
output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
lines.append('default {}'.format(output_cBpack))
# Write standalone lists for Twitter frequency
if language in CONFIG['sources']['twitter']:
input_file = wordlist_filename('twitter', language, 'counts.txt')
output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
lines.append('default {}'.format(output_cBpack))
return lines