diff --git a/wordfreq/data/large_de.msgpack.gz b/wordfreq/data/large_de.msgpack.gz
new file mode 100644
index 0000000..fb717c2
Binary files /dev/null and b/wordfreq/data/large_de.msgpack.gz differ
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 595c191..4eac43b 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -41,9 +41,11 @@ CONFIG = {
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
         'jieba': ['zh'],
-        'reddit': [
-            'de', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'sv'
-        ]
+
+        # About 99.2% of Reddit is in English, but there are pockets of
+        # conversation in other languages. These are the languages that seem
+        # to have enough non-spam comments to actually learn from.
+        'reddit': ['de', 'en', 'es', 'sv']
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -63,7 +65,7 @@ CONFIG = {
         'jieba-dist': 'dist/jieba_{lang}.{ext}'
     },
     'min_sources': 2,
-    'big-lists': ['en', 'fr', 'es', 'pt']
+    'big-lists': ['en', 'fr', 'es', 'pt', 'de']
 }
 
 
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index cea3283..d8bfd12 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -116,19 +116,17 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     Produces output files that are separated by language, with spaces
     between the tokens.
     """
-    out_files = {}
+    out_files = {
+        language: open('%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8')
+        for language in KEEP_THESE_LANGUAGES
+    }
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
             text = line.split('\t')[-1].strip()
             language, tokens = tokenizer(text)
-            if language != 'un':
+            if language in KEEP_THESE_LANGUAGES:
+                out_file = out_files[language]
                 tokenized = ' '.join(tokens)
-                out_filename = '%s.%s.txt' % (out_prefix, language)
-                if out_filename in out_files:
-                    out_file = out_files[out_filename]
-                else:
-                    out_file = open(out_filename, 'w', encoding='utf-8')
-                    out_files[out_filename] = out_file
                 print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
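
For context on how the reworked tokenize_by_language is meant to be called: the tokenizer argument is any callable returning a (language, tokens) pair, and input lines are tab-separated with the comment text in the last field, both of which can be read off the diff above. The sketch below is a hedged usage example, not part of this change: toy_tokenizer and comments.txt are made up for illustration, and KEEP_THESE_LANGUAGES is the module-level set in tokenizers.py whose exact contents the diff does not show.

    from wordfreq_builder.tokenizers import tokenize_by_language

    def toy_tokenizer(text):
        # Stand-in for the real language-detecting tokenizer: label everything
        # as English and split on whitespace.
        return 'en', text.lower().split()

    # Each input line carries tab-separated fields with the text in the last
    # one, matching what tokenize_by_language expects.
    with open('comments.txt', 'w', encoding='utf-8') as f:
        f.write('t3_abc\tHello world\n')
        f.write('t3_def\tAnother comment\n')

    # Opens one output file per language in KEEP_THESE_LANGUAGES (for example
    # reddit.en.txt) and silently drops lines whose detected language is not
    # in that set, whereas the old code lazily created a file for every
    # detected language other than 'un'.
    tokenize_by_language('comments.txt', 'reddit', toy_tokenizer)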