diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index 8303663..0b7e57f 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -55,6 +55,17 @@ rule convert_leeds
 rule convert_opensubtitles
     command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
 
+# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
+# the input files, keep only the single words and their counts, and only keep
+# lines with counts of 100 or more.
+#
+# (These will still be repeated as the word appears in different grammatical
+# roles, information that the source data provides that we're discarding. The
+# source data was already filtered to only show words in roles with at least
+# two-digit counts of occurrences.)
+rule convert_google_syntactic_ngrams
+    command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
+
 rule count
     command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
 
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index b6af74d..72f5967 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -16,6 +16,7 @@ CONFIG = {
         'wikipedia': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru'
+            # many more can be added
         ],
         'opensubtitles': [
             # All languages where the most common word in OpenSubtitles
@@ -27,6 +28,11 @@ CONFIG = {
         ],
         'leeds': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
+        ],
+        'google-books': [
+            'en',
+            # Using the 2012 data, we could get French, German, Italian,
+            # Russian, Spanish, and (Simplified) Chinese.
         ]
     },
     'wordlist_paths': {
@@ -34,6 +40,7 @@ CONFIG = {
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
+        'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}'
     },
     'min_sources': 2
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 3dcc9ee..243b3ef 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -71,6 +71,11 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             CONFIG['sources']['wikipedia']
         )
     )
+    lines.extend(
+        google_books_deps(
+            data_filename('raw-input/google-books')
+        )
+    )
     lines.extend(
         leeds_deps(
             data_filename('source-lists/leeds'),
@@ -106,6 +111,22 @@ def wikipedia_deps(dirname_in, languages):
     return lines
 
 
+def google_books_deps(dirname_in):
+    # Get English data from the split-up files of the Google Syntactic N-grams
+    # 2013 corpus.
+    lines = []
+
+    # Yes, the files are numbered 00 through 98 of 99. This is not an
+    # off-by-one error. Not on my part, anyway.
+    input_files = [
+        '{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i)
+        for i in range(99)
+    ]
+    output_file = wordlist_filename('google-books', 'en', 'counts.txt')
+    add_dep(lines, 'convert_google_syntactic_ngrams', input_files, output_file)
+    return lines
+
+
 def twitter_preprocess_deps(input_filename, slice_prefix,
                             combined_prefix, slices, languages):
     lines = []
@@ -192,7 +213,7 @@ def combine_lists(languages):
         output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
         add_dep(lines, 'freqs2dB', output_file, output_dBpack,
                 extra='wordfreq_builder/word_counts.py')
-        
+
         lines.append('default {}'.format(output_dBpack))
 
     return lines