mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Exclude angle brackets from CLD2 detection
This commit is contained in:
parent
81bbe663fb
commit
0d3ee869c1
@@ -153,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
||||
for language in languages
|
||||
]
|
||||
add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
|
||||
params={'prefix': slice_file})
|
||||
params={'prefix': slice_file},
|
||||
extra='wordfreq_builder/tokenizers.py')
|
||||
|
||||
for language in languages:
|
||||
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
|
||||
|
@@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||
'\ufdd0-\ufdef',
|
||||
'\N{HANGUL FILLER}',
|
||||
'\N{HANGUL CHOSEONG FILLER}',
|
||||
'\N{HANGUL JUNGSEONG FILLER}'
|
||||
'\N{HANGUL JUNGSEONG FILLER}',
|
||||
'<>'
|
||||
] +
|
||||
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user