Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Discard text detected as an uncommon language; add large German list
Former-commit-id: abbc295538
Parent: aa7802b552
Commit: 0c7527140c
New file: wordfreq/data/large_de.msgpack.gz (binary file not shown)
@@ -41,9 +41,11 @@ CONFIG = {
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
         'jieba': ['zh'],
-        'reddit': [
-            'de', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'sv'
-        ]
+
+        # About 99.2% of Reddit is in English, but there are pockets of
+        # conversation in other languages. These are the languages that seem
+        # to have enough non-spam comments to actually learn from.
+        'reddit': ['de', 'en', 'es', 'sv']
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
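The 'sources' mapping pairs each data source with the languages it covers, and together with 'min_sources' it decides which languages get built at all, so trimming the Reddit list can remove a language's second source. Below is a minimal sketch of that interaction; the CONFIG shown is a toy subset and the helpers sources_for and buildable_languages are hypothetical names, not functions from this repository.

# Illustrative only: how a build step might consume CONFIG['sources']
# together with CONFIG['min_sources']. Toy config, hypothetical helpers.
CONFIG = {
    'sources': {
        'subtlex-en': ['en'],
        'subtlex-other': ['de', 'nl', 'zh'],
        'jieba': ['zh'],
        'reddit': ['de', 'en', 'es', 'sv'],
    },
    'min_sources': 2,
}

def sources_for(language):
    """Names of the data sources that cover the given language."""
    return sorted(name for name, langs in CONFIG['sources'].items()
                  if language in langs)

def buildable_languages():
    """Languages covered by at least CONFIG['min_sources'] sources."""
    seen = set()
    for langs in CONFIG['sources'].values():
        seen.update(langs)
    return [lang for lang in sorted(seen)
            if len(sources_for(lang)) >= CONFIG['min_sources']]

print(buildable_languages())  # with this toy config: ['de', 'en', 'zh']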
@@ -63,7 +65,7 @@ CONFIG = {
         'jieba-dist': 'dist/jieba_{lang}.{ext}'
     },
     'min_sources': 2,
-    'big-lists': ['en', 'fr', 'es', 'pt']
+    'big-lists': ['en', 'fr', 'es', 'pt', 'de']
 }
 
 
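Adding 'de' to 'big-lists' is what makes the build produce the wordfreq/data/large_de.msgpack.gz file added above. A minimal usage sketch, assuming a wordfreq release that ships that data file; wordlist='large' is how the bigger lists are requested through the public API.

# Usage sketch: querying the German 'large' list via wordfreq's public API.
# Assumes an installed wordfreq version that includes large_de.msgpack.gz.
from wordfreq import word_frequency

print(word_frequency('beispiel', 'de'))                     # default wordlist
print(word_frequency('beispiel', 'de', wordlist='large'))   # new large German list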
@@ -116,19 +116,17 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     Produces output files that are separated by language, with spaces
     between the tokens.
     """
-    out_files = {}
+    out_files = {
+        language: open('%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8')
+        for language in KEEP_THESE_LANGUAGES
+    }
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
             text = line.split('\t')[-1].strip()
             language, tokens = tokenizer(text)
-            if language != 'un':
+            if language in KEEP_THESE_LANGUAGES:
+                out_file = out_files[language]
                 tokenized = ' '.join(tokens)
-                out_filename = '%s.%s.txt' % (out_prefix, language)
-                if out_filename in out_files:
-                    out_file = out_files[out_filename]
-                else:
-                    out_file = open(out_filename, 'w', encoding='utf-8')
-                    out_files[out_filename] = out_file
                 print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
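Assembled from the hunk above, the function after this change reads roughly as follows. The docstring is shortened to the lines visible in the hunk, and KEEP_THESE_LANGUAGES is defined elsewhere in the builder code and is not part of this diff; the set shown here is only a placeholder.

# Reconstruction of tokenize_by_language after this commit, for readability.
# KEEP_THESE_LANGUAGES lives elsewhere in the module; the membership below
# is a placeholder, not the project's actual whitelist.
KEEP_THESE_LANGUAGES = {'de', 'en', 'es', 'sv'}


def tokenize_by_language(in_filename, out_prefix, tokenizer):
    """
    Produces output files that are separated by language, with spaces
    between the tokens.
    """
    # Open one output file per whitelisted language, up front.
    out_files = {
        language: open('%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8')
        for language in KEEP_THESE_LANGUAGES
    }
    with open(in_filename, encoding='utf-8') as in_file:
        for line in in_file:
            text = line.split('\t')[-1].strip()
            language, tokens = tokenizer(text)
            # Text detected as any language outside the whitelist
            # (including 'un' for unknown) is discarded.
            if language in KEEP_THESE_LANGUAGES:
                out_file = out_files[language]
                tokenized = ' '.join(tokens)
                print(tokenized, file=out_file)
    for out_file in out_files.values():
        out_file.close()

Pre-opening the whitelisted files also removes the old lazy-open bookkeeping, so the function no longer creates an output file for every language it happens to encounter.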