diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index fa937cd..84c1818 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -123,7 +123,6 @@ def google_books_deps(dirname_in): def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, languages): - lines = [] slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 5815292..8e9f192 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -11,7 +11,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( '\x0e-\x1f', '\x7f-\x9f', '\ud800-\udfff', - '\ufdd0-\ufdef' + '\ufdd0-\ufdef', + '\N{HANGUL FILLER}', + '\N{HANGUL CHOSEONG FILLER}', + '\N{HANGUL JUNGSEONG FILLER}' ] + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] ) @@ -45,7 +48,7 @@ def cld2_detect_language(text): # Language code: str # Percent of text in this language: float # Confidence score: float)) - + text = CLD2_BAD_CHARS_RE.sub('', text) return pycld2.detect(text)[2][0][1]