added cld2 tokenizer comments

Former-commit-id: a44927e98e
2024-12-24 18:01:38 +00:00 · 2015-07-17 16:03:33 -04:00 · 2015-07-17 16:03:33 -04:00 · 71ff0c62d6
commit 71ff0c62d6
parent c2f3928433
1 changed files with 11 additions and 12 deletions
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -1,5 +1,6 @@
 from html.entities import name2codepoint
 from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
 from ftfy.fixes import unescape_html
 import re
 import pycld2
@ -24,7 +25,7 @@ def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
    """
-    text = fix_entities(text)
+    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
@ -36,6 +37,15 @@ def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language
    """
    # Format of the pycld2.detect return value:
    #   (Confident in result: bool,
    #    Number of bytes of text: int,
    #    Tuple of detected languages in order of certainty, each one:
    #       (Language name: str,
    #        Language code: str,
    #        Percent of text in this language: int,
    #        Confidence score: float))
    text = CLD2_BAD_CHARS_RE.sub('', text)
    return pycld2.detect(text)[2][0][1]
@ -62,14 +72,3 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
                print(tokenized, file=out_file)
    for out_file in out_files.values():
        out_file.close()
 # Matches the entities &amp; &quot; &lt; and &gt;, allowing one optional
 # space after the '&' and one before the ';' (e.g. "& amp ;"). Group 1
 # captures the bare entity name.
 ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
 def fix_entities(text):
    """
    Fix the few HTML entities that Twitter uses -- even if they've
    already been tokenized.

    Every match of ENTITY_RE in `text` is replaced by the single character
    that the entity names; all other text is returned unchanged.
    """
    def replace_entity(match):
        # Look up the captured entity name in html.entities.name2codepoint
        # and convert the resulting code point to its character.
        return chr(name2codepoint[match.group(1)])
    return ENTITY_RE.sub(replace_entity, text)