mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
remove Hangul fillers that confuse cld2
This commit is contained in:
parent
102bc715ae
commit
140ca6c050
@ -123,7 +123,6 @@ def google_books_deps(dirname_in):
|
|||||||
|
|
||||||
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
||||||
languages):
|
languages):
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
|
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
|
||||||
|
@ -11,7 +11,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
|||||||
'\x0e-\x1f',
|
'\x0e-\x1f',
|
||||||
'\x7f-\x9f',
|
'\x7f-\x9f',
|
||||||
'\ud800-\udfff',
|
'\ud800-\udfff',
|
||||||
'\ufdd0-\ufdef'
|
'\ufdd0-\ufdef',
|
||||||
|
'\N{HANGUL FILLER}',
|
||||||
|
'\N{HANGUL CHOSEONG FILLER}',
|
||||||
|
'\N{HANGUL JUNGSEONG FILLER}'
|
||||||
] +
|
] +
|
||||||
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
||||||
)
|
)
|
||||||
@ -45,7 +48,7 @@ def cld2_detect_language(text):
|
|||||||
# Language code: str
|
# Language code: str
|
||||||
# Percent of text in this language: float
|
# Percent of text in this language: float
|
||||||
# Confidence score: float))
|
# Confidence score: float))
|
||||||
|
|
||||||
text = CLD2_BAD_CHARS_RE.sub('', text)
|
text = CLD2_BAD_CHARS_RE.sub('', text)
|
||||||
return pycld2.detect(text)[2][0][1]
|
return pycld2.detect(text)[2][0][1]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user