move some functions to the wordfreq package

Robyn Speer 2015-05-11 17:02:52 -04:00
parent 414c9ac1f0
commit ed4f79b90e
3 changed files with 19 additions and 73 deletions

View File

@@ -148,7 +148,7 @@ def twitter_deps(prefix_in, languages):
extra='wordfreq_builder/tokenizers.py')
count_file = wordlist_filename('twitter', language, 'counts.txt')
add_dep(lines, 'count', token_file, count_file)
add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
return lines

View File

@@ -1,5 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE
import re
@@ -16,10 +17,6 @@ ROSETTE_LANG_MAP = {
}
NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff---\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}('{0})+".format(NON_PUNCT_RANGE))
EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
ROSETTE_RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
@@ -80,10 +77,6 @@ def fix_entities(text):
return ENTITY_RE.sub(replace_entity, text)
def tokenize(text):
return TOKEN_RE.findall(text)
def retokenize_rosette(text):
text = fix_entities(text)
tokens = ROSETTE_RETOKENIZE_RE.findall(text)
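The tokenize function removed above is just a findall over TOKEN_RE, and it now ships with the wordfreq package instead. As a rough illustration of the same technique, here is a self-contained sketch with a deliberately simplified character range (not wordfreq's actual pattern):

import re

# Illustrative stand-in for NON_PUNCT_RANGE: ASCII letters and digits only.
WORDISH = "[0-9A-Za-z]"

# A token is a run of word characters, optionally joined by apostrophes,
# so "can't" stays in one piece. This mirrors the intent of the removed TOKEN_RE.
SIMPLE_TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(WORDISH))

def simple_tokenize(text):
    # Same shape as the removed tokenize(): every non-overlapping match, in order.
    return SIMPLE_TOKEN_RE.findall(text)

# simple_tokenize("I can't stop!") -> ['I', "can't", 'stop']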
@@ -133,9 +126,7 @@ def monolingual_tokenize_file(in_filename, out_filename, language,
text = line_reader(line)
tokens, line_language = tokenizer(text)
if line_language == language:
filtered = [token_filter(t) for t in tokens]
filtered = [t for t in filtered if t is not None]
for token in filtered:
for token in tokens:
print(token, file=out_file)
@@ -151,63 +142,3 @@ def rosette_surface_tokenizer(text):
surface_text = text[span[0]:span[1]]
tokens.append(surface_text)
return tokens, language
def treebank_surface_tokenizer(text, language='en'):
"""
This is a simplified version of the Treebank tokenizer in NLTK.
NLTK's version depends on the text first having been sentence-tokenized
using Punkt, which is a statistical model that we'd rather not implement
here. The main reason to use Punkt first is to disambiguate periods that
are sentence-ending from those that are part of abbreviations.
NLTK's tokenizer thus assumes that any periods that appear in the middle
of the text are meant to be there, and leaves them attached to words. We
can skip the complication of Punkt at the cost of altering abbreviations
such as "U.S.".
NLTK also splits contractions that lack apostrophes, giving pseudo-words
as a result -- for example, it splits "wanna" into "wan" and "na", which
are supposed to be considered unusual surface forms of "want" and "to".
We just leave it as the word "wanna".
The language will just be returned, as this function isn't doing any
language detection. It defaults to 'en', as English is the language that
Treebank tokenization is designed for.
"""
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
# The following rule was modified from NLTK, which only separated periods
# at the end of the text. We simply made whitespace an alternative to the
# text-ending symbol $.
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
#contractions
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
return text.split(), language
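For a sense of what this simplified tokenizer produces, here is a hand-traced example of the substitution rules above (illustrative, not a recorded test case):

tokens, lang = treebank_surface_tokenizer('Hello, world. "Good," she said.')
# tokens -> ['Hello', ',', 'world', '.', '``', 'Good', ',', "''", 'she', 'said', '.']
# lang -> 'en', the default, since no language detection is performed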

View File

@@ -1,4 +1,4 @@
from wordfreq_builder.tokenizers import tokenize
from wordfreq import tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@@ -9,6 +9,10 @@ import gzip
def count_tokens(filename):
"""
Count tokens that appear in a file, running each line through our
simple tokenizer.
"""
counts = defaultdict(int)
with open(filename, encoding='utf-8') as infile:
for line in infile:
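The rest of the loop is cut off by this hunk, but the docstring describes the whole pattern: tokenize each line and increment a counter per token. A self-contained sketch of that pattern, taking the tokenizer as a parameter rather than assuming wordfreq's exact interface:

from collections import defaultdict

def count_tokens_sketch(filename, tokenize):
    # Count how many times each token appears in a file, one line at a time.
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize(line.strip()):
                counts[token] += 1
    return counts

# e.g. count_tokens_sketch('some_corpus.txt', simple_tokenize), where both the
# filename and the tokenizer are the hypothetical ones sketched earlier in these notes.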
@@ -18,6 +22,9 @@ def count_tokens(filename):
def read_freqs(filename, cutoff=0):
"""
Read words and their frequencies from a CSV file.
"""
raw_counts = defaultdict(float)
total = 0.
with open(filename, encoding='utf-8', newline='') as infile:
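The body of read_freqs is also truncated, but the accumulators shown (raw_counts and total) suggest the usual shape: sum counts per word, normalize by the grand total, and drop anything under the cutoff. A hedged sketch of that shape; the real column layout and cutoff semantics may differ:

import csv
from collections import defaultdict

def read_freqs_sketch(filename, cutoff=0):
    # Accumulate raw counts per word from word,count rows, then normalize.
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strcount in csv.reader(infile):
            count = float(strcount)
            raw_counts[word] += count
            total += count
    # Keep only words whose normalized frequency clears the cutoff.
    return {word: count / total
            for word, count in raw_counts.items()
            if count / total >= cutoff}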
@@ -39,6 +46,10 @@ def read_freqs(filename, cutoff=0):
def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
"""
Convert a dictionary of word frequencies to a file in the idiosyncratic
'dBpack' format.
"""
freq_cutoff = 10 ** (cutoff / 10.)
freqs = read_freqs(in_filename, freq_cutoff)
dBpack = []
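The cutoff here is in decibels: a frequency f corresponds to 10 * log10(f) dB, so the default cutoff of -60 corresponds to a frequency of 10 ** -6, exactly as freq_cutoff computes. The on-disk 'dBpack' layout isn't visible in this hunk, so only the conversion itself is sketched:

import math

def freq_to_dB(freq):
    # e.g. freq_to_dB(1e-6) is -60 (to within float rounding)
    return 10 * math.log10(freq)

def dB_to_freq(dB):
    # Inverse of the above; matches freq_cutoff = 10 ** (cutoff / 10.)
    return 10 ** (dB / 10.)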
@@ -58,6 +69,10 @@ def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
def merge_freqs(freq_dicts):
"""
Merge multiple dictionaries of frequencies, representing each word with
the word's average frequency over all sources.
"""
vocab = set()
for freq_dict in freq_dicts:
vocab |= set(freq_dict)
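The averaging step falls past the end of the hunk; a minimal sketch of what the docstring describes, assuming a word absent from one source contributes 0 to that source's share of the average:

def merge_freqs_sketch(freq_dicts):
    # Build the union vocabulary, then average each word over every source,
    # treating absence from a source as a frequency of 0.
    vocab = set()
    for freq_dict in freq_dicts:
        vocab |= set(freq_dict)
    n = len(freq_dicts)
    return {word: sum(fd.get(word, 0.) for fd in freq_dicts) / n
            for word in vocab}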