Discard text detected as an uncommon language; add large German list

Rob Speer 2016-03-28 12:26:02 -04:00
parent 08130908c7
commit abbc295538
3 changed files with 12 additions and 12 deletions

Binary file not shown.


@@ -41,9 +41,11 @@ CONFIG = {
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
         'jieba': ['zh'],
-        'reddit': [
-            'de', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'sv'
-        ]
+
+        # About 99.2% of Reddit is in English, but there are pockets of
+        # conversation in other languages. These are the languages that seem
+        # to have enough non-spam comments to actually learn from.
+        'reddit': ['de', 'en', 'es', 'sv']
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -63,7 +65,7 @@ CONFIG = {
         'jieba-dist': 'dist/jieba_{lang}.{ext}'
     },
     'min_sources': 2,
-    'big-lists': ['en', 'fr', 'es', 'pt']
+    'big-lists': ['en', 'fr', 'es', 'pt', 'de']
 }
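Note: the tokenizer change below keys off a KEEP_THESE_LANGUAGES constant whose definition is outside these hunks. A minimal sketch of how it could be derived from the trimmed 'reddit' list, assuming the per-source lists above live under a key such as CONFIG['sources'] (the enclosing key is not visible in the hunk):

    # Hypothetical glue, not part of this commit: keep only the languages
    # that the reddit source is configured to learn from.
    KEEP_THESE_LANGUAGES = set(CONFIG['sources']['reddit'])
    # -> {'de', 'en', 'es', 'sv'} after this commit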


@@ -116,19 +116,17 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     Produces output files that are separated by language, with spaces
     between the tokens.
     """
-    out_files = {}
+    out_files = {
+        language: open('%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8')
+        for language in KEEP_THESE_LANGUAGES
+    }
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
             text = line.split('\t')[-1].strip()
             language, tokens = tokenizer(text)
-            if language != 'un':
+            if language in KEEP_THESE_LANGUAGES:
+                out_file = out_files[language]
                 tokenized = ' '.join(tokens)
-                out_filename = '%s.%s.txt' % (out_prefix, language)
-                if out_filename in out_files:
-                    out_file = out_files[out_filename]
-                else:
-                    out_file = open(out_filename, 'w', encoding='utf-8')
-                    out_files[out_filename] = out_file
                 print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
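
Note: a self-contained sketch of the revised code path. KEEP_THESE_LANGUAGES is assumed to mirror the post-commit reddit list, and demo_tokenizer is a hypothetical stand-in for the real language-detecting tokenizer, which (per the old != 'un' check) returns 'un' when no language is detected:

    # Hypothetical driver, not part of this commit.
    KEEP_THESE_LANGUAGES = {'de', 'en', 'es', 'sv'}  # assumed to match the config

    def demo_tokenizer(text):
        # Stand-in for the real tokenizer: returns (language_code, tokens);
        # 'un' would mean the language could not be detected.
        return ('en', text.lower().split())

    # Input lines are tab-separated, with the comment text in the last field;
    # output is one file per kept language: reddit.en.txt, reddit.de.txt, ...
    tokenize_by_language('reddit_comments.tsv', 'reddit', demo_tokenizer)

With the files opened up front in a dict comprehension, a language that never occurs still gets an (empty) output file, and the per-line open-or-reuse bookkeeping from the old version disappears.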