From 8277de2c7fb4bce057cf9a0e913f971dc9809f76 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Wed, 17 Jun 2015 14:19:28 -0400 Subject: [PATCH] changed tokenize_file: cld2 returns 'un' instead of None if it cannot recognize the language --- wordfreq_builder/wordfreq_builder/tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 653fa7e..6b9107d 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -66,7 +66,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): for line in in_file: text = line_reader(line) tokens, language = tokenizer(text) - if language is not None: + if language == 'un': tokenized = '\n'.join(tokens) out_filename = '%s.%s.txt' % (out_prefix, language) if out_filename in out_files: