Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00

Merge pull request #19 from LuminosoInsight/code-review-fixes-2015-07-17: Code review fixes 2015-07-17

Commit 32102ba3c2
@@ -23,8 +23,8 @@ install them on Ubuntu:

## Unicode data

The tokenizers used to split non-Japanese phrases use regexes built using the
`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
The tokenizers that split non-Japanese phrases utilize regexes built using the
`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
update these regexes, run `scripts/gen_regex.py`.

## License
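A rough sketch of what such a generator can look like (an illustration only, not the actual `scripts/gen_regex.py`; the category set chosen here is an assumption): scan every codepoint, keep the ones whose `unicodedata.category` is in the chosen set, and collapse consecutive codepoints into a compact character class.

```python
# Illustrative sketch only -- the real generator is scripts/gen_regex.py.
# It builds a regex character class from the Unicode categories known to
# the running Python's unicodedata module.
import re
import sys
import unicodedata

WORDISH_CATEGORIES = frozenset({'Ll', 'Lu', 'Lt', 'Lm', 'Lo', 'Nd', 'Mn', 'Mc'})

def char_class(categories=WORDISH_CATEGORIES):
    # Collect the codepoints whose general category we want to match.
    codepoints = [cp for cp in range(sys.maxunicode + 1)
                  if unicodedata.category(chr(cp)) in categories]
    # Collapse consecutive codepoints into ranges to keep the class short.
    ranges = []
    for cp in codepoints:
        if ranges and cp == ranges[-1][1] + 1:
            ranges[-1][1] = cp
        else:
            ranges.append([cp, cp])
    return '[' + ''.join(
        re.escape(chr(a)) if a == b
        else '{}-{}'.format(re.escape(chr(a)), re.escape(chr(b)))
        for a, b in ranges
    ) + ']'

# The class reflects whatever Unicode version this interpreter ships with.
WORDISH_RE = re.compile(char_class())
```

Because the class is derived from the running interpreter's `unicodedata` tables, regenerating it under a newer Python picks up a newer Unicode version, which is why the README pins the version to Python 3.4's.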
@@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
streaming Twitter API, in accordance with Twitter's Developer Agreement &
Policy. This software only gives statistics about words that are very commonly
used on Twitter; it does not display or republish any Twitter content.
@@ -1,3 +1,5 @@
""" This file generates a graph of the dependencies for the ninja build."""

import sys

@@ -26,4 +28,3 @@ def ninja_to_dot():

if __name__ == '__main__':
    ninja_to_dot()
@@ -94,7 +94,7 @@ def test_failed_cB_conversion():


def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data, while the fake word "plan't" can't be found.
    # data
    eq_(tokenize("can't", 'en'), ["can't"])

    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

@@ -135,12 +135,20 @@ def test_not_enough_ascii():
        random_ascii_words(lang='zh')


def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
        ['متعب']
    )

    # Remove combining marks
    eq_(
        tokenize('حَرَكَات', 'ar'),
        ['حركات']
    )

    eq_(
        tokenize('إﻻ', 'ar'),
        ['إلا']
    )
@@ -8,6 +8,8 @@ import itertools
import pathlib
import random
import logging
import unicodedata

logger = logging.getLogger(__name__)


@@ -66,11 +68,21 @@ def tokenize(text, lang):
        return mecab_tokenize(text)

    if lang == 'ar':
        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
        text = standardize_arabic(text)

    return simple_tokenize(text)


def standardize_arabic(text):
    """
    Standardizes arabic text by removing combining marks and tatweels.
    """
    return unicodedata.normalize(
        'NFKC',
        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
    )

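As a quick illustration of what this normalization does, using the same strings as the new tests earlier in this diff (`COMBINING_MARK_RE` is defined elsewhere in the module; it is assumed here to match nonspacing marks):

```python
# Illustration only: the same clean-up steps, checked against the strings
# used in the new Arabic tests above.
import unicodedata

# Tatweel (U+0640) stretching characters are simply dropped.
assert 'متــــــــعب'.replace('ـ', '') == 'متعب'

# Combining marks such as the Arabic short vowels (category 'Mn') are removed.
assert ''.join(c for c in 'حَرَكَات'
               if unicodedata.category(c) != 'Mn') == 'حركات'

# NFKC normalization folds presentation forms: the lam-alef ligature ﻻ
# becomes the two letters لا, so 'إﻻ' becomes 'إلا'.
assert unicodedata.normalize('NFKC', 'إﻻ') == 'إلا'
```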

def read_cBpack(filename):
    """
    Read a file from an idiosyncratic format that we use for storing
@@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
    If a word decomposes into multiple tokens, we'll return a smoothed estimate
    of the word frequency that is no greater than the frequency of any of its
    individual tokens.

    It should be noted that the current tokenizer does not support
    multi-word Chinese phrases.
    """
    args = (word, lang, wordlist, minimum)
    try:
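A hedged usage sketch of the documented behavior; the phrase and tokens are just an example, and the exact numbers depend on the packaged wordlists, but the inequality follows from the docstring above:

```python
# Usage sketch: a multi-token query can never be reported as more frequent
# than its rarest individual token.
from wordfreq import word_frequency

tokens = ['new', 'york']
phrase = word_frequency('new york', 'en')   # tokenized into ['new', 'york']
assert all(phrase <= word_frequency(t, 'en') for t in tokens)
```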
Binary files not shown.
@@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:

    ninja -v

You can copy the results into wordfreq with this command (supposing that
$WORDFREQ points to your wordfreq repo):
You can copy the results into wordfreq with this command:

    cp data/dist/*.msgpack.gz ../wordfreq/data/

@@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.

## Data sources

### Wikipedia

Wikipedia is a "free-access, free-content Internet encyclopedia".

These files can be downloaded from [wikimedia dump][wikipedia]

The original files are in `data/raw-input/wikipedia`, and they're processed
by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
[wiki2text][] package.

[wikipedia]: https://dumps.wikimedia.org/backup-index.html
[wiki2text]: https://github.com/rspeer/wiki2text

### Leeds Internet Corpus

Also known as the "Web as Corpus" project, this is a University of Leeds

@@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`.
The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
collected by the `ftfy.streamtester` package in 2014.

It's not possible to distribute the text of tweets. However, this process could
We are not allowed to distribute the text of tweets. However, this process could
be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
a couple of weeks.

@@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their
frequency comes from "don't" and "won't". Words that turned into similarly
common words, however, were left alone: this list doesn't represent "can't"
because the word was left as "can".
@@ -29,12 +29,12 @@ rule split

# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
    command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
    command = bunzip2 -c $in | wiki2text > $out

# To tokenize Japanese, we run it through Mecab and take the first column.
# We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese
    command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
    command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.

@@ -49,12 +49,12 @@ rule tokenize_twitter

# Grep out the term "EOS", an indication that Leeds used MeCab and didn't
# strip out the EOS lines.
rule convert_leeds
    command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
    command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out

# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
    command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
    command = tr ' ' ',' < $in > $out

# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep

@@ -65,16 +65,16 @@ rule convert_opensubtitles

# source data was already filtered to only show words in roles with at least
# two-digit counts of occurences.)
rule convert_google_syntactic_ngrams
    command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
    command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out

rule count
    command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
    command = python -m wordfreq_builder.cli.count_tokens $in $out

rule merge
    command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
    command = python -m wordfreq_builder.cli.combine_lists -o $out $in

rule freqs2cB
    command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
    command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out

rule cat
    command = cat $in > $out
@@ -9,12 +9,5 @@ setup(
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
    install_requires=['msgpack-python', 'pycld2'],
    entry_points={
        'console_scripts': [
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }
    install_requires=['msgpack-python', 'pycld2']
)
@@ -13,4 +13,3 @@ if __name__ == '__main__':
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)
@@ -4,8 +4,8 @@ import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('language', help='language of the input file')
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    freqs_to_cBpack(args.filename_in, args.filename_out)
    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)
@@ -1,18 +1,13 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
import argparse


def tokenize_twitter(in_filename, out_prefix):
    tokenize_file(in_filename, out_prefix,
                  tokenizer=cld2_surface_tokenizer)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    tokenize_twitter(args.filename, args.outprefix)
    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)


if __name__ == '__main__':
@@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it.
TMPDIR = data_filename('tmp')


# Set this to True to rebuild the Twitter tokenization (which takes days)
TOKENIZE_TWITTER = True


def add_dep(lines, rule, input, output, extra=None, params=None):
    if isinstance(output, list):
        output = ' '.join(output)

@@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
    # The first dependency is to make sure the build file is up to date.
    add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
            extra='wordfreq_builder/ninja.py')

    if TOKENIZE_TWITTER:
        lines.extend(
            twitter_deps(
                data_filename('raw-input/twitter/all-2014.txt'),
                slice_prefix=data_filename('slices/twitter/tweets-2014'),
                combined_prefix=data_filename('generated/twitter/tweets-2014'),
                slices=40,
                languages=CONFIG['sources']['twitter']
            )
    lines.extend(
        twitter_deps(
            data_filename('raw-input/twitter/all-2014.txt'),
            slice_prefix=data_filename('slices/twitter/tweets-2014'),
            combined_prefix=data_filename('generated/twitter/tweets-2014'),
            slices=40,
            languages=CONFIG['sources']['twitter']
        )
    )
    lines.extend(
        wikipedia_deps(
            data_filename('raw-input/wikipedia'),

@@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages):
    path_in = pathlib.Path(dirname_in)
    for language in languages:
        # Find the most recent file for this language
        # Skip over files that do not exist
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
        plain_text_file = wordlist_filename('wikipedia', language, 'txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, plain_text_file)
        if language == 'ja':
            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
            mecab_token_file = wordlist_filename(
                'wikipedia', language, 'mecab-tokens.txt'
            )
            add_dep(
                lines, 'tokenize_japanese', plain_text_file, mecab_token_file
            )
            add_dep(lines, 'count', mecab_token_file, count_file)
        else:
            add_dep(lines, 'count', plain_text_file, count_file)

@@ -126,17 +121,18 @@ def google_books_deps(dirname_in):
    return lines


def twitter_deps(input_filename, slice_prefix,
                 combined_prefix, slices, languages):
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
                 languages):

    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
                                                    num=num)
                   for num in range(slices)]
    # split the input into slices
    add_dep(lines,
            'split', input_filename, slice_files,
    add_dep(lines, 'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})
            'slices': slices})

    for slicenum in range(slices):
        slice_file = slice_files[slicenum]

@@ -151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix,
        combined_output = wordlist_filename('twitter', language, 'tokens.txt')

        language_inputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
            '{prefix}.{lang}.txt'.format(
                prefix=slice_files[slicenum], lang=language
            )
            for slicenum in range(slices)
        ]

@@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix,
        count_file = wordlist_filename('twitter', language, 'counts.txt')

        if language == 'ja':
            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
            add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
        else:
            add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
            mecab_token_file = wordlist_filename(
                'twitter', language, 'mecab-tokens.txt')
            add_dep(
                lines, 'tokenize_japanese', combined_output, mecab_token_file)
            combined_output = mecab_token_file

        add_dep(lines, 'count', combined_output, count_file,
                extra='wordfreq_builder/tokenizers.py')

    return lines

@@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages):
        input_file = '{prefix}/{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
        reformatted_file = wordlist_filename(
            'opensubtitles', language, 'counts.txt')
        add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)

    return lines

@@ -205,18 +207,22 @@ def combine_lists(languages):
        add_dep(lines, 'merge', input_files, output_file,
                extra='wordfreq_builder/word_counts.py')

        output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
        output_cBpack = wordlist_filename(
            'combined-dist', language, 'msgpack.gz')
        add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                extra='wordfreq_builder/word_counts.py')
                extra='wordfreq_builder/word_counts.py',
                params={'lang': language})

        lines.append('default {}'.format(output_cBpack))

        # Write standalone lists for Twitter frequency
        if language in CONFIG['sources']['twitter']:
            input_file = wordlist_filename('twitter', language, 'counts.txt')
            output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
            output_cBpack = wordlist_filename(
                'twitter-dist', language, 'msgpack.gz')
            add_dep(lines, 'freqs2cB', input_file, output_cBpack,
                    extra='wordfreq_builder/word_counts.py')
                    extra='wordfreq_builder/word_counts.py',
                    params={'lang': language})

            lines.append('default {}'.format(output_cBpack))
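For context on the new `$lang` variable in the `freqs2cB` rule: `params={'lang': language}` presumably ends up as a per-build variable binding in the generated `build.ninja`. The sketch below is hypothetical; `add_dep`'s exact output format is not shown in this diff, and the file paths are made up, but it illustrates the general mechanism.

```python
# Hypothetical sketch of the ninja text that a call like
#   add_dep(lines, 'freqs2cB', output_file, output_cBpack,
#           extra='wordfreq_builder/word_counts.py', params={'lang': language})
# might append; the real formatting lives in add_dep, which this diff elides.
lines = []
language = 'en'
output_file = 'data/generated/combined_en.csv'          # hypothetical paths
output_cBpack = 'data/dist/combined_en.msgpack.gz'

lines.append('build {}: freqs2cB {} | wordfreq_builder/word_counts.py'
             .format(output_cBpack, output_file))
# A per-build variable binding is what lets rules.ninja refer to $lang in
#   command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
lines.append('  lang = {}'.format(language))
```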
@@ -1,63 +1,56 @@
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
from ftfy.fixes import unescape_html
import re
import pycld2

CLD2_BAD_CHAR_RANGE = "".join([
    '[',
    '\x00-\x08',
    '\x0b',
    '\x0e-\x1f',
    '\x7f-\x9f',
    '\ud800-\udfff',
    '\ufdd0-\ufdef'] +
    [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
    [']'])
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
    [
        '\x00-\x08',
        '\x0b',
        '\x0e-\x1f',
        '\x7f-\x9f',
        '\ud800-\udfff',
        '\ufdd0-\ufdef'
    ] +
    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
)
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)

TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')


def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
    """
    text = remove_handles_and_urls(text)
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
    tokens = tokenize(text, lang)
    return lang, tokens


def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language
    """
    # Format of pycld2.detect:
    #   (Confident in result: bool,
    #    Number of bytes of text: Int,
    #    Triples of detected languages in order of certainty:
    #      (Language name: str,
    #       Language code: str
    #       Percent of text in this language: float
    #       Confidence score: float))

    text = CLD2_BAD_CHARS_RE.sub('', text)
    return pycld2.detect(text)[2][0][1]


def remove_handles_and_urls(text):
    text = fix_entities(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    return text


def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
    """
    return line.split('\t')[-1].strip()


def lowercase_text_filter(token):
    """
    If this looks like a token that we want to count, return it, lowercased.
    If not, filter it out by returning None.
    """
    if TOKEN_RE.search(token):
        return token.lower()
    else:
        return None


def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def tokenize_twitter(in_filename, out_prefix, tokenizer):
    """
    Process a file by running it through the given tokenizer, sorting the
    results by the language of each line, and inserting newlines

@@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
    out_files = {}
    with open(in_filename, encoding='utf-8') as in_file:
        for line in in_file:
            text = line_reader(line)
            text = line.split('\t')[-1].strip()
            language, tokens = tokenizer(text)
            if language != 'un':
                tokenized = '\n'.join(tokens)

@@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
            print(tokenized, file=out_file)
    for out_file in out_files.values():
        out_file.close()


ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')


def fix_entities(text):
    """
    Fix the few HTML entities that Twitter uses -- even if they've
    already been tokenized.
    """
    def replace_entity(match):
        return chr(name2codepoint[match.group(1)])
    return ENTITY_RE.sub(replace_entity, text)


def monolingual_tokenize_file(in_filename, out_filename, language,
                              tokenizer, line_reader=last_tab,
                              sample_proportion=1):
    """
    Process a file by running it through the given tokenizer, only keeping
    lines of the language we're asking for, and inserting newlines
    to mark the token boundaries.

    `line_reader` is applied to each line before it given to the tokenizer

    Only the first line out of every `sample_proportion` lines are run through
    then tokenizer.
    """
    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for i, line in enumerate(in_file):
                if i % sample_proportion == 0:
                    text = line_reader(line)
                    tokens, line_language = tokenizer(text)
                    if line_language == language:
                        for token in tokens:
                            print(token, file=out_file)
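The indexing `pycld2.detect(text)[2][0][1]` is easier to read next to a concrete return value; a short sketch (the scores and percentages shown in the comment are only illustrative):

```python
# Illustration of why cld2_detect_language indexes [2][0][1]: pycld2.detect
# returns (is_reliable, bytes_found, details), where details is ordered from
# most to least likely language.
import pycld2

is_reliable, bytes_found, details = pycld2.detect("Bonjour tout le monde")
# details looks like (('FRENCH', 'fr', 99, 1024.0), ('Unknown', 'un', 0, 0.0), ...)
top_language_code = details[0][1]   # same as pycld2.detect(text)[2][0][1]
```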
@@ -1,4 +1,4 @@
from wordfreq import simple_tokenize
from wordfreq import simple_tokenize, tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text

@@ -18,41 +18,49 @@ def count_tokens(filename):
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            for token in simple_tokenize(line.strip()):
            for token in simple_tokenize(line):
                counts[token] += 1

    return counts


def read_freqs(filename, cutoff=0):
def read_freqs(filename, cutoff=0, lang=None):
    """
    Read words and their frequencies from a CSV file.

    Only words with a frequency greater than `cutoff` are returned.
    Only words with a frequency greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by frequency
    in descending order.

    If lang is given, read_freqs will apply language specific preprocessing
    operations.
    """
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        for key, strval in reader:
            val = float(strval)
            if val < cutoff:
                break
            for token in simple_tokenize(key):

            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                token = fix_text(token)
                total += val
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[token] += val

    freqs = {key: raw_count / total
             for (key, raw_count) in raw_counts.items()}
    return freqs
    for word in raw_counts:
        raw_counts[word] /= total

    return raw_counts


def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
    """
    Convert a csv file of words and their frequencies to a file in the
    idiosyncratic 'cBpack' format.

@@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
    written to the new file.
    """
    freq_cutoff = 10 ** (cutoff / 100.)
    freqs = read_freqs(in_filename, freq_cutoff)
    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
    cBpack = []
    for token, freq in freqs.items():
        cB = round(math.log10(freq) * 100)
        if cB >= cutoff:
            neg_cB = -cB
            while neg_cB >= len(cBpack):
                cBpack.append([])
            cBpack[neg_cB].append(token)
        neg_cB = -cB
        while neg_cB >= len(cBpack):
            cBpack.append([])
        cBpack[neg_cB].append(token)

    for sublist in cBpack:
        sublist.sort()

@@ -88,7 +95,7 @@ def merge_freqs(freq_dicts):
    """
    vocab = set()
    for freq_dict in freq_dicts:
        vocab |= set(freq_dict)
        vocab.update(freq_dict)

    merged = defaultdict(float)
    N = len(freq_dicts)
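A small worked example of the centibel (cB) bucketing above, derived from the code shown; the msgpack/gzip framing that the rest of `freqs_to_cBpack` writes is not part of this sketch, and the frequencies are made up.

```python
# Worked example of the cB bucketing in freqs_to_cBpack: index i of the list
# holds the words whose frequency rounds to -i centibels.
import math

freqs = {'the': 0.05, 'word': 0.001, 'rarity': 0.00001}
cBpack = []
for token, freq in freqs.items():
    cB = round(math.log10(freq) * 100)   # e.g. 0.001 -> -300 cB
    neg_cB = -cB
    while neg_cB >= len(cBpack):
        cBpack.append([])
    cBpack[neg_cB].append(token)

assert cBpack[130] == ['the']
assert cBpack[300] == ['word'] and cBpack[500] == ['rarity']
```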