mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Reorganize and document some functions
This commit is contained in:
parent 04ad6720cc
commit 26b03392fe
@@ -1,7 +1,15 @@
-from wordfreq_builder.tokenizers import retokenize_file
+from wordfreq_builder.tokenizers import retokenize_rosette
 import argparse
 
 
+def retokenize_file(in_filename, out_filename):
+    with open(in_filename, encoding='utf-8') as in_file:
+        with open(out_filename, 'w', encoding='utf-8') as out_file:
+            for line in in_file:
+                for token in retokenize_rosette(line.strip()):
+                    print(token, file=out_file)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('in_filename', help='filename of input file containing one tweet per line')
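For illustration, not part of the commit: the hunk truncates main() after the first add_argument call, so the wiring below is an assumption. The out_filename argument and the parse_args() call are hypothetical; only in_filename is confirmed above.

# Hypothetical completion of main(); everything past the first
# add_argument call is assumed, not taken from the diff.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file containing one tweet per line')
    parser.add_argument('out_filename', help='filename of the output file, one token per line')
    args = parser.parse_args()
    retokenize_file(args.in_filename, args.out_filename)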
@@ -21,6 +21,24 @@ EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
 ROSETTE_RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
 
 
+def rosette_surface_tokenizer(text):
+    """
+    Use Rosette to both detect the language of the given text and split it
+    into tokens.
+    """
+    try:
+        analysis, lang = ROSETTE.rosette.analyze(text)
+    except (RuntimeError, UnicodeError):
+        # Our Rosette interface throws errors given arbitrary data. :(
+        return text, None
+    language = ROSETTE_LANG_MAP.get(lang, lang)
+    tokens = []
+    for (stem, pos, span) in analysis:
+        surface_text = text[span[0]:span[1]]
+        tokens.append(surface_text)
+    return tokens, language
+
+
 def last_tab(line):
     """
     Read lines by keeping only the last tab-separated value.
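As a sanity check on the regex above (the pattern is verbatim from the hunk; the sample string is invented): emoticons, '#', '@', and '/' each match as single-character tokens, while any other run of non-space characters groups into one token.

import re

EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
ROSETTE_RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))

# '@' and '/' come out alone; 'http:' stops at the first '/'.
print(ROSETTE_RETOKENIZE_RE.findall('@ user saw http://example.com ☀'))
# ['@', 'user', 'saw', 'http:', '/', '/', 'example.com', '☀']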
@@ -29,6 +47,10 @@ def last_tab(line):
 
 
 def lowercase_text_filter(token):
+    """
+    If this looks like a token that we want to count, return it, lowercased.
+    If not, filter it out by returning None.
+    """
     if TOKEN_RE.search(token):
         return token.lower()
     else:
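TOKEN_RE is defined elsewhere in the module and does not appear in this diff; assuming it requires at least one word character, the filter behaves as in this self-contained sketch.

import re

# Stand-in for the module's real TOKEN_RE, which the diff does not show.
TOKEN_RE = re.compile(r'\w')

def lowercase_text_filter(token):
    if TOKEN_RE.search(token):
        return token.lower()
    return None

print(lowercase_text_filter('Hello'))  # 'hello'
print(lowercase_text_filter('!!!'))    # None: punctuation-only, filtered out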
@@ -78,6 +100,11 @@ def fix_entities(text):
 
 
 def retokenize_rosette(text):
+    """
+    Given text that has had spaces inserted between tokens by Rosette,
+    apply some transformations that help us avoid counting the frequency
+    of URLs and usernames.
+    """
     text = fix_entities(text)
     tokens = ROSETTE_RETOKENIZE_RE.findall(text)
     skip_next = False
@@ -95,30 +122,14 @@ def retokenize_rosette(text):
             yield filtered
 
 
-def retokenize_file(in_filename, out_filename):
-    """
-    Process a file that has been tokenized (by inserting spaces) in a
-    language-specific way by Rosette.
-    """
-    with open(in_filename, encoding='utf-8') as in_file:
-        with open(out_filename, 'w', encoding='utf-8') as out_file:
-            for line in in_file:
-                skip_next = False
-                for token in retokenize_rosette(line.strip()):
-                    if skip_next:
-                        skip_next = False
-                    elif token == '/' or token == '@':
-                        # Avoid idiosyncratic tokens such as URLs and
-                        # usernames
-                        skip_next = True
-                    elif lowercase_text_filter(token):
-                        print(token, file=out_file)
-
-
 def monolingual_tokenize_file(in_filename, out_filename, language,
                               tokenizer, line_reader=last_tab,
                               token_filter=lowercase_text_filter,
                               sample_proportion=100):
+    """
+    Apply a tokenizer that can distinguish different languages, but only
+    keep the lines that are in the language we're asking for.
+    """
     with open(in_filename, encoding='utf-8', errors='replace') as in_file:
         with open(out_filename, 'w', encoding='utf-8') as out_file:
             for i, line in enumerate(in_file):
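The skip_next logic deleted here (it already lives in retokenize_rosette) is easy to misread, so here is a hand-traceable sketch with an invented token list: each '@' or '/' drops both itself and exactly one following token, which is how '@ user' mentions stay out of the counts.

tokens = ['@', 'user', 'saw', 'the', 'sun']  # as the retokenizing regex would split them
skip_next = False
kept = []
for token in tokens:
    if skip_next:
        skip_next = False      # consume the one token after '@' or '/'
    elif token in ('/', '@'):
        skip_next = True       # the marker itself is never kept either
    else:
        kept.append(token)
print(kept)  # ['saw', 'the', 'sun']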
@@ -128,17 +139,3 @@ def monolingual_tokenize_file(in_filename, out_filename, language,
                 if line_language == language:
                     for token in tokens:
                         print(token, file=out_file)
-
-
-def rosette_surface_tokenizer(text):
-    try:
-        analysis, lang = ROSETTE.rosette.analyze(text)
-    except (RuntimeError, UnicodeError):
-        # Our Rosette interface throws errors given arbitrary data. :(
-        return text, None
-    language = ROSETTE_LANG_MAP.get(lang, lang)
-    tokens = []
-    for (stem, pos, span) in analysis:
-        surface_text = text[span[0]:span[1]]
-        tokens.append(surface_text)
-    return tokens, language
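A hypothetical call, with invented filenames, showing how the relocated rosette_surface_tokenizer plugs into monolingual_tokenize_file: tokenize a tab-separated tweet file and keep only the lines Rosette identifies as English.

monolingual_tokenize_file('tweets.tsv', 'tweets.en.tokens', 'en',
                          tokenizer=rosette_surface_tokenizer)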
@@ -92,6 +92,10 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
     """
     Write a dictionary of either raw counts or frequencies to a file of
     comma-separated values.
+
+    Keep the CSV format simple by explicitly skipping words containing
+    commas or quotation marks. We don't believe we want those in our tokens
+    anyway.
     """
     with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
         writer = csv.writer(outfile)