mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Exclude angle brackets from CLD2 detection
This commit is contained in:
parent
81bbe663fb
commit
0d3ee869c1
@@ -153,7 +153,8 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
             for language in languages
         ]
         add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
-                params={'prefix': slice_file})
+                params={'prefix': slice_file},
+                extra='wordfreq_builder/tokenizers.py')
 
     for language in languages:
         combined_output = wordlist_filename('twitter', language, 'tokens.txt')
@@ -13,7 +13,8 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
         '\ufdd0-\ufdef',
         '\N{HANGUL FILLER}',
         '\N{HANGUL CHOSEONG FILLER}',
-        '\N{HANGUL JUNGSEONG FILLER}'
+        '\N{HANGUL JUNGSEONG FILLER}',
+        '<>'
     ] +
     [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
 )
Loading…
Reference in New Issue
Block a user