Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 09:51:38 +00:00)
add Google Books data for English
This commit is contained in:
parent
ed4f79b90e
commit
4513fed60c
@@ -55,6 +55,17 @@ rule convert_leeds
 rule convert_opensubtitles
   command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
 
+# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
+# the input files, keep only the single words and their counts, and only keep
+# lines with counts of 100 or more.
+#
+# (These will still be repeated as the word appears in different grammatical
+# roles, information that the source data provides that we're discarding. The
+# source data was already filtered to only show words in roles with at least
+# two-digit counts of occurences.)
+rule convert_google_syntactic_ngrams
+  command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
+
 rule count
   command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
 
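For reference, here is a rough Python sketch of what the convert_google_syntactic_ngrams pipeline does, assuming each line of the gzipped nodes files is tab-separated with the word in field 1 and its total count in field 3 (the layout implied by the cut -f 1,3 step). The function below is illustrative only and is not part of the commit:

    import gzip

    def convert_syntactic_ngrams(in_paths, out_file):
        """Approximate the zcat | cut | grep | sed pipeline in Python."""
        for path in in_paths:
            with gzip.open(path, 'rt', encoding='utf-8') as infile:
                for line in infile:
                    fields = line.rstrip('\n').split('\t')
                    if len(fields) < 3:
                        continue
                    word, count = fields[0], fields[2]
                    # grep -v '[,"]': drop entries that would break the CSV output
                    if any(c in word or c in count for c in ',"'):
                        continue
                    # sed '(...+)': keep only counts with three or more digits,
                    # i.e. counts of 100 or more
                    if len(count) >= 3:
                        out_file.write('{},{}\n'.format(word, count))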
@@ -16,6 +16,7 @@ CONFIG = {
         'wikipedia': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru'
+            # many more can be added
         ],
         'opensubtitles': [
             # All languages where the most common word in OpenSubtitles
@@ -27,6 +28,11 @@ CONFIG = {
         ],
         'leeds': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
+        ],
+        'google-books': [
+            'en',
+            # Using the 2012 data, we could get French, German, Italian,
+            # Russian, Spanish, and (Simplified) Chinese.
         ]
     },
     'wordlist_paths': {
@@ -34,6 +40,7 @@ CONFIG = {
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
+        'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}'
     },
     'min_sources': 2
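The new wordlist_paths entry is a template with {lang} and {ext} placeholders. The lookup itself is done by wordlist_filename in wordfreq_builder, which this commit only calls, so the helper below is just a minimal sketch of how the template presumably expands:

    GOOGLE_BOOKS_TEMPLATE = 'generated/google-books/google_books_{lang}.{ext}'

    def google_books_path(lang, ext):
        # e.g. google_books_path('en', 'counts.txt')
        #   -> 'generated/google-books/google_books_en.counts.txt'
        return GOOGLE_BOOKS_TEMPLATE.format(lang=lang, ext=ext)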
@@ -71,6 +71,11 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             CONFIG['sources']['wikipedia']
         )
     )
+    lines.extend(
+        google_books_deps(
+            data_filename('raw-input/google-books')
+        )
+    )
     lines.extend(
         leeds_deps(
             data_filename('source-lists/leeds'),
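add_dep and data_filename are existing wordfreq_builder helpers that are not part of this diff, so their exact output is not shown here. The sketch below only illustrates the general shape of the ninja build statement ("build <output>: <rule> <inputs>") that the new google-books dependency presumably becomes, using example paths rather than ones taken from the commit:

    def build_statement(rule, inputs, output):
        # Generic ninja syntax: "build <outputs>: <rule> <inputs>"
        return 'build {}: {} {}'.format(output, rule, ' '.join(inputs))

    # Example paths only; the real ones come from data_filename and
    # wordlist_filename, which are not shown in this diff.
    print(build_statement(
        'convert_google_syntactic_ngrams',
        ['raw-input/google-books/nodes.00-of-99.gz',
         'raw-input/google-books/nodes.01-of-99.gz'],
        'generated/google-books/google_books_en.counts.txt',
    ))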
@@ -106,6 +111,22 @@ def wikipedia_deps(dirname_in, languages):
     return lines
 
 
+def google_books_deps(dirname_in):
+    # Get English data from the split-up files of the Google Syntactic N-grams
+    # 2013 corpus.
+    lines = []
+
+    # Yes, the files are numbered 00 through 98 of 99. This is not an
+    # off-by-one error. Not on my part, anyway.
+    input_files = [
+        '{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i)
+        for i in range(99)
+    ]
+    output_file = wordlist_filename('google-books', 'en', 'counts.txt')
+    add_dep(lines, 'convert_google_syntactic_ngrams', input_files, output_file)
+    return lines
+
+
 def twitter_preprocess_deps(input_filename, slice_prefix,
                             combined_prefix, slices, languages):
     lines = []
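As a quick check of the shard naming in google_books_deps (using the raw-input/google-books directory passed in from make_ninja_deps above), the format string zero-pads the shard number, so the 99 inputs run from nodes.00-of-99.gz through nodes.98-of-99.gz:

    dirname_in = 'raw-input/google-books'
    names = ['{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i) for i in range(99)]
    assert names[0] == 'raw-input/google-books/nodes.00-of-99.gz'
    assert names[-1] == 'raw-input/google-books/nodes.98-of-99.gz'
    assert len(names) == 99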
@@ -192,7 +213,7 @@ def combine_lists(languages):
         output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
         add_dep(lines, 'freqs2dB', output_file, output_dBpack,
                 extra='wordfreq_builder/word_counts.py')
 
         lines.append('default {}'.format(output_dBpack))
     return lines
 