mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 18:01:38 +00:00
parent
c2f3928433
commit
71ff0c62d6
@ -1,5 +1,6 @@
|
|||||||
from html.entities import name2codepoint
|
from html.entities import name2codepoint
|
||||||
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
|
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
|
||||||
|
from ftfy.fixes import unescape_html
|
||||||
import re
|
import re
|
||||||
import pycld2
|
import pycld2
|
||||||
|
|
||||||
@ -24,7 +25,7 @@ def cld2_surface_tokenizer(text):
|
|||||||
"""
|
"""
|
||||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
|
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
|
||||||
"""
|
"""
|
||||||
text = fix_entities(text)
|
text = unescape_html(text)
|
||||||
text = TWITTER_HANDLE_RE.sub('', text)
|
text = TWITTER_HANDLE_RE.sub('', text)
|
||||||
text = TCO_RE.sub('', text)
|
text = TCO_RE.sub('', text)
|
||||||
lang = cld2_detect_language(text)
|
lang = cld2_detect_language(text)
|
||||||
@ -36,6 +37,15 @@ def cld2_detect_language(text):
|
|||||||
"""
|
"""
|
||||||
Uses CLD2 to detect the language
|
Uses CLD2 to detect the language
|
||||||
"""
|
"""
|
||||||
|
# Format of pycld2.detect:
|
||||||
|
# (Confident in result: bool,
|
||||||
|
# Number of bytes of text: int,
|
||||||
|
# Tuple of up to three detected languages in order of certainty, each:
|
||||||
|
# (Language name: str,
|
||||||
|
# Language code: str,
|
||||||
|
# Percent of text in this language: float
|
||||||
|
# Confidence score: float))
|
||||||
|
|
||||||
text = CLD2_BAD_CHARS_RE.sub('', text)
|
text = CLD2_BAD_CHARS_RE.sub('', text)
|
||||||
return pycld2.detect(text)[2][0][1]
|
return pycld2.detect(text)[2][0][1]
|
||||||
|
|
||||||
@ -62,14 +72,3 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
|||||||
print(tokenized, file=out_file)
|
print(tokenized, file=out_file)
|
||||||
for out_file in out_files.values():
|
for out_file in out_files.values():
|
||||||
out_file.close()
|
out_file.close()
|
||||||
|
|
||||||
|
|
||||||
ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
|
|
||||||
def fix_entities(text):
|
|
||||||
"""
|
|
||||||
Fix the few HTML entities that Twitter uses -- even if they've
|
|
||||||
already been tokenized.
|
|
||||||
"""
|
|
||||||
def replace_entity(match):
|
|
||||||
return chr(name2codepoint[match.group(1)])
|
|
||||||
return ENTITY_RE.sub(replace_entity, text)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user