Exclude angle brackets from CLD2 detection

This commit is contained in:
Rob Speer 2015-09-04 14:56:06 -04:00
parent 81bbe663fb
commit 0d3ee869c1
2 changed files with 4 additions and 2 deletions

View File

@@ -153,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
for language in languages
]
add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
-params={'prefix': slice_file})
+params={'prefix': slice_file},
+extra='wordfreq_builder/tokenizers.py')
for language in languages:
combined_output = wordlist_filename('twitter', language, 'tokens.txt')

View File

@@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
'\ufdd0-\ufdef',
'\N{HANGUL FILLER}',
'\N{HANGUL CHOSEONG FILLER}',
-'\N{HANGUL JUNGSEONG FILLER}'
+'\N{HANGUL JUNGSEONG FILLER}',
+'<>'
] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
)