diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 2ae66c4..93d957e 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -153,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, for language in languages ] add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, - params={'prefix': slice_file}) + params={'prefix': slice_file}, + extra='wordfreq_builder/tokenizers.py') for language in languages: combined_output = wordlist_filename('twitter', language, 'tokens.txt') diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 92d0714..1a75626 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( '\ufdd0-\ufdef', '\N{HANGUL FILLER}', '\N{HANGUL CHOSEONG FILLER}', - '\N{HANGUL JUNGSEONG FILLER}' + '\N{HANGUL JUNGSEONG FILLER}', + '<>' ] + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] )