move some functions to the wordfreq package

Robyn Speer 2015-05-11 17:02:52 -04:00
parent 414c9ac1f0
commit ed4f79b90e
3 changed files with 19 additions and 73 deletions

View File

@@ -148,7 +148,7 @@ def twitter_deps(prefix_in, languages):
extra='wordfreq_builder/tokenizers.py')
count_file = wordlist_filename('twitter', language, 'counts.txt')
add_dep(lines, 'count', token_file, count_file)
add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
return lines

View File

@@ -1,5 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE
import re
@@ -16,10 +17,6 @@ ROSETTE_LANG_MAP = {
}
NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff---\uff66-\U0002ffff]'
NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
TOKEN_RE = re.compile("{0}('{0})+".format(NON_PUNCT_RANGE))
EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
ROSETTE_RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
@@ -80,10 +77,6 @@ def fix_entities(text):
return ENTITY_RE.sub(replace_entity, text)
def tokenize(text):
return TOKEN_RE.findall(text)
def retokenize_rosette(text):
text = fix_entities(text)
tokens = ROSETTE_RETOKENIZE_RE.findall(text)
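The tokenize function removed above is just a findall over TOKEN_RE, and it now ships with the wordfreq package instead. As a rough illustration of the same technique, here is a self-contained sketch with a deliberately simplified character range (not wordfreq's actual pattern):

import re

# Illustrative stand-in for NON_PUNCT_RANGE: ASCII letters and digits only.
WORDISH = "[0-9A-Za-z]"

# A token is a run of word characters, optionally joined by apostrophes,
# so "can't" stays in one piece. This mirrors the intent of the removed TOKEN_RE.
SIMPLE_TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(WORDISH))

def simple_tokenize(text):
    # Same shape as the removed tokenize(): every non-overlapping match, in order.
    return SIMPLE_TOKEN_RE.findall(text)

# simple_tokenize("I can't stop!") -> ['I', "can't", 'stop']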
@@ -133,9 +126,7 @@ def monolingual_tokenize_file(in_filename, out_filename, language,
text = line_reader(line)
tokens, line_language = tokenizer(text)
if line_language == language:
filtered = [token_filter(t) for t in tokens]
filtered = [t for t in filtered if t is not None]
for token in filtered:
for token in tokens:
print(token, file=out_file)
@@ -151,63 +142,3 @@ def rosette_surface_tokenizer(text):
surface_text = text[span[0]:span[1]]
tokens.append(surface_text)
return tokens, language
def treebank_surface_tokenizer(text, language='en'):
"""
This is a simplified version of the Treebank tokenizer in NLTK.
NLTK's version depends on the text first having been sentence-tokenized
using Punkt, which is a statistical model that we'd rather not implement
here. The main reason to use Punkt first is to disambiguate periods that
are sentence-ending from those that are part of abbreviations.
NLTK's tokenizer thus assumes that any periods that appear in the middle
of the text are meant to be there, and leaves them attached to words. We
can skip the complication of Punkt at the cost of altering abbreviations
such as "U.S.".
NLTK also splits contractions that lack apostrophes, giving pseudo-words
as a result -- for example, it splits "wanna" into "wan" and "na", which
are supposed to be considered unusual surface forms of "want" and "to".
We just leave it as the word "wanna".
The language will just be returned, as this function isn't doing any
language detection. It defaults to 'en', as English is the language that
Treebank tokenization is designed for.
"""
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
# The following rule was modified from NLTK, which only separated periods
# at the end of the text. We simply made whitespace an alternative to the
# text-ending symbol $.
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
#contractions
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
return text.split(), language
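For a sense of what this simplified tokenizer produces, here is a hand-traced example of the substitution rules above (illustrative, not a recorded test case):

tokens, lang = treebank_surface_tokenizer('Hello, world. "Good," she said.')
# tokens -> ['Hello', ',', 'world', '.', '``', 'Good', ',', "''", 'she', 'said', '.']
# lang -> 'en', the default, since no language detection is performed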

View File

@@ -1,4 +1,4 @@
from wordfreq_builder.tokenizers import tokenize
from wordfreq import tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@@ -9,6 +9,10 @@ import gzip
def count_tokens(filename):
"""
Count tokens that appear in a file, running each line through our
simple tokenizer.
"""
counts = defaultdict(int)
with open(filename, encoding='utf-8') as infile:
for line in infile:
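The rest of the loop is cut off by this hunk, but the docstring describes the whole pattern: tokenize each line and increment a counter per token. A self-contained sketch of that pattern, taking the tokenizer as a parameter rather than assuming wordfreq's exact interface:

from collections import defaultdict

def count_tokens_sketch(filename, tokenize):
    # Count how many times each token appears in a file, one line at a time.
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize(line.strip()):
                counts[token] += 1
    return counts

# e.g. count_tokens_sketch('some_corpus.txt', simple_tokenize), where both the
# filename and the tokenizer are the hypothetical ones sketched earlier in these notes.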
@@ -18,6 +22,9 @@ def count_tokens(filename):
def read_freqs(filename, cutoff=0):
"""
Read words and their frequencies from a CSV file.
"""
raw_counts = defaultdict(float)
total = 0.
with open(filename, encoding='utf-8', newline='') as infile:
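The body of read_freqs is also truncated, but the accumulators shown (raw_counts and total) suggest the usual shape: sum counts per word, normalize by the grand total, and drop anything under the cutoff. A hedged sketch of that shape; the real column layout and cutoff semantics may differ:

import csv
from collections import defaultdict

def read_freqs_sketch(filename, cutoff=0):
    # Accumulate raw counts per word from word,count rows, then normalize.
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for word, strcount in csv.reader(infile):
            count = float(strcount)
            raw_counts[word] += count
            total += count
    # Keep only words whose normalized frequency clears the cutoff.
    return {word: count / total
            for word, count in raw_counts.items()
            if count / total >= cutoff}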
@@ -39,6 +46,10 @@ def read_freqs(filename, cutoff=0):
def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
"""
Convert a dictionary of word frequencies to a file in the idiosyncratic
'dBpack' format.
"""
freq_cutoff = 10 ** (cutoff / 10.)
freqs = read_freqs(in_filename, freq_cutoff)
dBpack = []
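The cutoff here is in decibels: a frequency f corresponds to 10 * log10(f) dB, so the default cutoff of -60 corresponds to a frequency of 10 ** -6, exactly as freq_cutoff computes. The on-disk 'dBpack' layout isn't visible in this hunk, so only the conversion itself is sketched:

import math

def freq_to_dB(freq):
    # e.g. freq_to_dB(1e-6) is -60 (to within float rounding)
    return 10 * math.log10(freq)

def dB_to_freq(dB):
    # Inverse of the above; matches freq_cutoff = 10 ** (cutoff / 10.)
    return 10 ** (dB / 10.)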
@@ -58,6 +69,10 @@ def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
def merge_freqs(freq_dicts):
"""
Merge multiple dictionaries of frequencies, representing each word with
the word's average frequency over all sources.
"""
vocab = set()
for freq_dict in freq_dicts:
vocab |= set(freq_dict)
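The averaging step falls past the end of the hunk; a minimal sketch of what the docstring describes, assuming a word absent from one source contributes 0 to that source's share of the average:

def merge_freqs_sketch(freq_dicts):
    # Build the union vocabulary, then average each word over every source,
    # treating absence from a source as a frequency of 0.
    vocab = set()
    for freq_dict in freq_dicts:
        vocab |= set(freq_dict)
    n = len(freq_dicts)
    return {word: sum(fd.get(word, 0.) for fd in freq_dicts) / n
            for word in vocab}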