Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)

move some functions to the wordfreq package

commit ed4f79b90e
parent 414c9ac1f0
@@ -148,7 +148,7 @@ def twitter_deps(prefix_in, languages):
                 extra='wordfreq_builder/tokenizers.py')
 
         count_file = wordlist_filename('twitter', language, 'counts.txt')
-        add_dep(lines, 'count', token_file, count_file)
+        add_dep(lines, 'count', token_file, count_file, extra='wordfreq_builder/tokenizers.py')
 
     return lines
@@ -1,5 +1,6 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
 from html.entities import name2codepoint
+from wordfreq import tokenize, TOKEN_RE
 import re
 
@@ -16,10 +17,6 @@ ROSETTE_LANG_MAP = {
 }
 
 
-NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff０-９Ａ-Ｚａ-ｚ\uff66-\U0002ffff]'
-NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
-TOKEN_RE = re.compile("{0}('{0})+".format(NON_PUNCT_RANGE))
-
 EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
 ROSETTE_RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
 
@@ -80,10 +77,6 @@ def fix_entities(text):
     return ENTITY_RE.sub(replace_entity, text)
 
 
-def tokenize(text):
-    return TOKEN_RE.findall(text)
-
-
 def retokenize_rosette(text):
     text = fix_entities(text)
     tokens = ROSETTE_RETOKENIZE_RE.findall(text)
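The local tokenize() and TOKEN_RE are gone because the builder now reuses them from the wordfreq package (see the new import above). A minimal sketch of the intended usage, assuming the moved function keeps the one-argument signature of the removed code:

    from wordfreq import tokenize, TOKEN_RE

    # TOKEN_RE is the non-punctuation regex that previously lived in this module.
    tokens = tokenize('Hello, world!')
    # With a tokenizer of that form, tokens would be something like ['Hello', 'world'].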
@@ -133,9 +126,7 @@ def monolingual_tokenize_file(in_filename, out_filename, language,
             text = line_reader(line)
             tokens, line_language = tokenizer(text)
             if line_language == language:
-                filtered = [token_filter(t) for t in tokens]
-                filtered = [t for t in filtered if t is not None]
-                for token in filtered:
+                for token in tokens:
                     print(token, file=out_file)
 
 
@@ -151,63 +142,3 @@ def rosette_surface_tokenizer(text):
         surface_text = text[span[0]:span[1]]
         tokens.append(surface_text)
     return tokens, language
-
-
-def treebank_surface_tokenizer(text, language='en'):
-    """
-    This is a simplified version of the Treebank tokenizer in NLTK.
-
-    NLTK's version depends on the text first having been sentence-tokenized
-    using Punkt, which is a statistical model that we'd rather not implement
-    here. The main reason to use Punkt first is to disambiguate periods that
-    are sentence-ending from those that are part of abbreviations.
-
-    NLTK's tokenizer thus assumes that any periods that appear in the middle
-    of the text are meant to be there, and leaves them attached to words. We
-    can skip the complication of Punkt at the cost of altering abbreviations
-    such as "U.S.".
-
-    NLTK also splits contractions that lack apostrophes, giving pseudo-words
-    as a result -- for example, it splits "wanna" into "wan" and "na", which
-    are supposed to be considered unusual surface forms of "want" and "to".
-    We just leave it as the word "wanna".
-
-    The language will just be returned, as this function isn't doing any
-    language detection. It defaults to 'en', as English is the language that
-    Treebank tokenization is designed for.
-    """
-    #starting quotes
-    text = re.sub(r'^\"', r'``', text)
-    text = re.sub(r'(``)', r' \1 ', text)
-    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
-
-    #punctuation
-    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
-    text = re.sub(r'\.\.\.', r' ... ', text)
-    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
-
-    # The following rule was modified from NLTK, which only separated periods
-    # at the end of the text. We simply made whitespace an alternative to the
-    # text-ending symbol $.
-    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
-    text = re.sub(r'[?!]', r' \g<0> ', text)
-
-    text = re.sub(r"([^'])' ", r"\1 ' ", text)
-
-    #parens, brackets, etc.
-    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
-    text = re.sub(r'--', r' -- ', text)
-
-    #add extra space to make things easier
-    text = " " + text + " "
-
-    #ending quotes
-    text = re.sub(r'"', " '' ", text)
-    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
-
-    #contractions
-    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
-    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
-                  text)
-
-    return text.split(), language
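treebank_surface_tokenizer is deleted outright. To make the removed rules concrete, here is a hypothetical call and the output those substitutions would produce (illustrative only; the function no longer exists after this commit):

    tokens, lang = treebank_surface_tokenizer('I said, "wanna go?"')
    # tokens == ['I', 'said', ',', '``', 'wanna', 'go', '?', "''"]
    # lang == 'en'  (the default; the function does no language detection)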
@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import tokenize
+from wordfreq import tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -9,6 +9,10 @@ import gzip
 
 
 def count_tokens(filename):
+    """
+    Count tokens that appear in a file, running each line through our
+    simple tokenizer.
+    """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8') as infile:
         for line in infile:
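The new docstring describes the behaviour; a minimal sketch of that counting pattern, assuming the one-argument tokenize() used elsewhere in this commit (the real loop body is truncated by the hunk):

    from collections import defaultdict
    from wordfreq import tokenize

    counts = defaultdict(int)
    with open('corpus.txt', encoding='utf-8') as infile:  # 'corpus.txt' is a made-up path
        for line in infile:
            for token in tokenize(line.strip()):
                counts[token] += 1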
@@ -18,6 +22,9 @@ def count_tokens(filename):
 
 
 def read_freqs(filename, cutoff=0):
+    """
+    Read words and their frequencies from a CSV file.
+    """
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
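For context, a sketch of the accumulation the docstring describes, using the standard csv module; the two-column word,count layout is an assumption, and the real function also applies the cutoff and normalizes by the running total:

    import csv
    from collections import defaultdict

    raw_counts = defaultdict(float)
    total = 0.
    with open('counts.csv', encoding='utf-8', newline='') as infile:  # hypothetical path
        for word, count in csv.reader(infile):  # assumed two-column layout
            raw_counts[word] += float(count)
            total += float(count)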
@@ -39,6 +46,10 @@ def read_freqs(filename, cutoff=0):
 
 
 def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
+    """
+    Convert a dictionary of word frequencies to a file in the idiosyncratic
+    'dBpack' format.
+    """
     freq_cutoff = 10 ** (cutoff / 10.)
     freqs = read_freqs(in_filename, freq_cutoff)
     dBpack = []
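The cutoff is expressed in decibels of proportional frequency, as the 10 ** (cutoff / 10.) line shows. A small worked example of that scale:

    import math

    # cutoff = -60 dB corresponds to a proportional frequency of 10 ** (-60 / 10) = 1e-6,
    # so words rarer than roughly one in a million are dropped.
    freq_cutoff = 10 ** (-60 / 10.)   # ~1e-06

    # Going the other way, a word with frequency 0.001 sits at about -30 dB.
    dB = 10 * math.log10(0.001)       # -30.0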
@@ -58,6 +69,10 @@ def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
 
 
 def merge_freqs(freq_dicts):
+    """
+    Merge multiple dictionaries of frequencies, representing each word with
+    the word's average frequency over all sources.
+    """
     vocab = set()
     for freq_dict in freq_dicts:
         vocab |= set(freq_dict)
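A minimal sketch of the averaging the new docstring describes, under one plausible reading in which a word missing from a source counts as frequency 0 there (illustrative; the hunk truncates the actual body):

    freq_dicts = [{'the': 0.06, 'cat': 0.001}, {'the': 0.04}]

    vocab = set()
    for freq_dict in freq_dicts:
        vocab |= set(freq_dict)

    merged = {
        word: sum(d.get(word, 0.0) for d in freq_dicts) / len(freq_dicts)
        for word in vocab
    }
    # merged is roughly {'the': 0.05, 'cat': 0.0005}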