remove Hangul fillers that confuse cld2

This commit is contained in:
Rob Speer 2015-08-24 17:10:01 -04:00
parent 102bc715ae
commit 140ca6c050
2 changed files with 5 additions and 3 deletions

View File

@ -123,7 +123,6 @@ def google_books_deps(dirname_in):
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
languages): languages):
lines = [] lines = []
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,

View File

@ -11,7 +11,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
'\x0e-\x1f', '\x0e-\x1f',
'\x7f-\x9f', '\x7f-\x9f',
'\ud800-\udfff', '\ud800-\udfff',
'\ufdd0-\ufdef' '\ufdd0-\ufdef',
'\N{HANGUL FILLER}',
'\N{HANGUL CHOSEONG FILLER}',
'\N{HANGUL JUNGSEONG FILLER}'
] + ] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)] [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
) )
@ -45,7 +48,7 @@ def cld2_detect_language(text):
# Language code: str # Language code: str
# Percent of text in this language: float # Percent of text in this language: float
# Confidence score: float)) # Confidence score: float))
text = CLD2_BAD_CHARS_RE.sub('', text) text = CLD2_BAD_CHARS_RE.sub('', text)
return pycld2.detect(text)[2][0][1] return pycld2.detect(text)[2][0][1]