Strip Hangul filler characters that confuse cld2 before language detection

2024-12-23 09:21:37 +00:00 · 2015-08-24 17:10:01 -04:00 · 2015-08-24 17:10:01 -04:00 · 140ca6c050
commit 140ca6c050
parent 102bc715ae
2 changed files with 5 additions and 3 deletions
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -123,7 +123,6 @@ def google_books_deps(dirname_in):

 def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
                 languages):
-
    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -11,7 +11,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
        '\x0e-\x1f',
        '\x7f-\x9f',
        '\ud800-\udfff',
-        '\ufdd0-\ufdef'
+        '\ufdd0-\ufdef',
+        '\N{HANGUL FILLER}',
+        '\N{HANGUL CHOSEONG FILLER}',
+        '\N{HANGUL JUNGSEONG FILLER}'
    ] +
    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
@@ -45,7 +48,7 @@ def cld2_detect_language(text):
    #       Language code: str
    #       Percent of text in this language: float
    #       Confidence score: float))
-    
+
    text = CLD2_BAD_CHARS_RE.sub('', text)
    return pycld2.detect(text)[2][0][1]