Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-25 10:15:23 +00:00

Merge branch 'ninja-build'

Conflicts:
	wordfreq_builder/cmd_count_twitter.py
	wordfreq_builder/cmd_count_wikipedia.py

commit b541fe68e1
wordfreq_builder/.gitignore (vendored) | 4
@@ -6,3 +6,7 @@ dist
 *.egg-info
 build
 _build
+build.ninja
+data
+.ninja_deps
+.ninja_log

wordfreq_builder/Makefile (new file) | 12
@@ -0,0 +1,12 @@
PYTHON = python

all: build.ninja

# make sure this package is in 'develop' mode and up to date
wordfreq_builder.egg-info/PKG-INFO: setup.py
	$(PYTHON) setup.py develop

# build the Ninja file that will take over the build process
build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
	$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja

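The Makefile above only bootstraps the build: it regenerates build.ninja and then leaves the real work to Ninja. A minimal sketch of those same two steps as a Python helper, assuming ninja is installed and on the PATH (this helper is not part of the commit):

# Hypothetical helper mirroring the Makefile: regenerate build.ninja from
# rules.ninja, then hand the build over to Ninja.
import subprocess
import sys

def rebuild():
    # Equivalent of the 'build.ninja' target above
    with open('build.ninja', 'w') as out:
        subprocess.run(
            [sys.executable, '-m', 'wordfreq_builder.cli.build_deps', 'rules.ninja'],
            stdout=out, check=True
        )
    # Run the generated dependency graph
    subprocess.run(['ninja'], check=True)

if __name__ == '__main__':
    rebuild()
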
wordfreq_builder/rules.ninja (new file) | 63
@@ -0,0 +1,63 @@
# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.

# Variables
DATA = ./data

# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix

# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out

rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out

rule tokenize_japanese
  command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" > $out

rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix

rule format_twitter
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out

# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
# with /p to output only lines where the match was successful.
rule convert_leeds
  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out

# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
  command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out

rule count
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out

rule merge
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in

rule freqs2dB
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out

rule cat
  command = cat $in > $out

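For illustration, a rough Python equivalent of the convert_leeds rule above, with a made-up input line; the rank is discarded and the term and frequency are swapped into CSV order:

# Rough Python equivalent of the convert_leeds sed rule, for illustration only.
# Input lines look like "<rank> <frequency> <term>"; output is "<term>,<frequency>".
import re

LEEDS_LINE_RE = re.compile(r'([0-9]+) ([0-9.]+) (.*)')

def convert_leeds_line(line):
    match = LEEDS_LINE_RE.search(line)
    if match is None:
        return None  # sed -n .../p would simply not print a non-matching line
    rank, freq, term = match.groups()
    return '{},{}'.format(term, freq)

assert convert_leeds_line('42 3581.5 the') == 'the,3581.5'
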
@@ -9,4 +9,13 @@ setup(
     platforms=["any"],
     description="Turns raw data into word frequency lists",
     packages=['wordfreq_builder'],
+    install_requires=['msgpack-python'],
+    entry_points={
+        'console_scripts': [
+            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
+            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
+            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
+            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
+        ]
+    }
 )

wordfreq_builder/wordfreq_builder/cli/__init__.py (new file) | 0

wordfreq_builder/wordfreq_builder/cli/build_deps.py (new file) | 15
@@ -0,0 +1,15 @@
from wordfreq_builder.ninja import make_ninja_deps
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of rules file')
    args = parser.parse_args()

    # Make the complete ninja file and write it to standard out
    make_ninja_deps(args.in_filename)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/combine_lists.py (new file) | 19
@@ -0,0 +1,19 @@
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
import argparse


def merge_lists(input_names, output_name):
    freq_dicts = []
    for input_name in input_names:
        freq_dicts.append(read_freqs(input_name, cutoff=2))
    merged = merge_freqs(freq_dicts)
    write_wordlist(merged, output_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output)

wordfreq_builder/wordfreq_builder/cli/count_tokens.py (new file) | 16
@@ -0,0 +1,16 @@
from wordfreq_builder.word_counts import count_tokens, write_wordlist
import argparse


def handle_counts(filename_in, filename_out):
    counts = count_tokens(filename_in)
    write_wordlist(counts, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)

wordfreq_builder/wordfreq_builder/cli/format_twitter.py (new file) | 14
@@ -0,0 +1,14 @@
from wordfreq_builder.tokenizers import retokenize_file
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file containing one tweet per line')
    parser.add_argument('out_filename', help='filename of output file')
    args = parser.parse_args()
    retokenize_file(args.in_filename, args.out_filename)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py (new file) | 11
@@ -0,0 +1,11 @@
from wordfreq_builder.word_counts import freqs_to_dBpack
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    freqs_to_dBpack(args.filename_in, args.filename_out)

wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py (new file) | 19
@@ -0,0 +1,19 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse


def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    pretokenize_twitter(args.filename, args.outprefix)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py (new file) | 30
@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()

(deleted file) | 21
@@ -1,21 +0,0 @@
-from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts
-from pathlib import Path
-import argparse
-
-
-def merge_lists(input_names, output_name, balance=False):
-    count_dicts = []
-    for input_name in input_names:
-        count_dicts.append(read_counts(Path(input_name)))
-    merged = merge_counts(count_dicts, balance=balance)
-    write_counts(merged, Path(output_name))
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
-    args = parser.parse_args()
-    merge_lists(args.inputs, args.output, balance=args.balance)

(deleted file) | 27
@@ -1,27 +0,0 @@
-from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
-from pathlib import Path
-import argparse
-
-
-def count_twitter(pathname, offset=0, nsplit=1, surface=True):
-    path = Path(pathname)
-    if surface == True:
-        tokenizer = rosette_surface_tokenizer
-    else:
-        tokenizer = rosette_tokenizer
-    builder = WordCountBuilder(tokenizer=tokenizer)
-    save_filename = 'twitter-counts-%d.csv' % offset
-    save_pathname = path.parent / save_filename
-    builder.count_twitter(path, offset, nsplit)
-    builder.save_wordlist(save_pathname)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('filename', help='filename of input file containing one tweet per line')
-    parser.add_argument('offset', type=int)
-    parser.add_argument('nsplit', type=int)
-    args = parser.parse_args()
-    count_twitter(args.filename, args.offset, args.nsplit, surface=True)

(deleted file) | 23
@@ -1,23 +0,0 @@
-from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
-from pathlib import Path
-import argparse
-
-
-def count_wikipedia(filename, surface=True):
-    path = Path(filename)
-    if surface == True:
-        tokenizer = rosette_surface_tokenizer
-    else:
-        tokenizer = rosette_tokenizer
-    builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False)
-    builder.count_wikipedia(path)
-    builder.save_wordlist(path.parent / 'counts.csv')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('filename', help='flat text file containing extracted Wikipedia text')
-    args = parser.parse_args()
-    count_wikipedia(args.filename, surface=True)

wordfreq_builder/wordfreq_builder/config.py (new file) | 69
@@ -0,0 +1,69 @@
import os

CONFIG = {
    'version': '0.9.0',
    # data_dir is a relative or absolute path to where the wordlist data
    # is stored
    'data_dir': 'data',
    'sources': {
        # A list of language codes (possibly un-standardized) that we'll
        # look up in filenames for these various data sources.
        'twitter': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru',
            # can be added later: 'th', 'tr'
        ],
        'wikipedia': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru'
        ],
        'opensubtitles': [
            # All languages where the most common word in OpenSubtitles
            # appears at least 5000 times
            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
            'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
            'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr',
            'sv', 'tr', 'uk', 'zh'
        ],
        'leeds': [
            'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
        ]
    },
    'wordlist_paths': {
        'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
        'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
        'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
        'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}'
    },
    'min_sources': 2
}


def data_filename(filename):
    return os.path.join(CONFIG['data_dir'], filename)


def wordlist_filename(source, language, extension='txt'):
    path = CONFIG['wordlist_paths'][source].format(
        lang=language, ext=extension
    )
    return data_filename(path)


def source_names(language):
    """
    Get the names of data sources that supply data for the given language.
    """
    return sorted([key for key in CONFIG['sources']
                   if language in CONFIG['sources'][key]])


def all_languages():
    languages = set()
    for langlist in CONFIG['sources'].values():
        languages |= set(langlist)
    return [lang for lang in sorted(languages)
            if len(source_names(lang)) >= CONFIG['min_sources']]

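As a quick illustration of how these helpers combine (the values follow directly from the CONFIG dictionary above):

# Illustrative use of the config helpers defined above.
from wordfreq_builder.config import wordlist_filename, source_names, all_languages

# Leeds wordlists live under data/generated/leeds/ with the language filled in:
print(wordlist_filename('leeds', 'en', 'counts.txt'))
# -> data/generated/leeds/leeds_internet_en.counts.txt

# Japanese appears in the leeds, twitter and wikipedia source lists above:
print(source_names('ja'))
# -> ['leeds', 'twitter', 'wikipedia']

# Only languages with at least CONFIG['min_sources'] (2) sources get built:
print(all_languages())
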
wordfreq_builder/wordfreq_builder/ninja.py (new file) | 199
@@ -0,0 +1,199 @@
from wordfreq_builder.config import (
    CONFIG, data_filename, wordlist_filename, all_languages, source_names
)
import sys
import pathlib

HEADER = """# This file is automatically generated. Do not edit it.
# You can regenerate it using the 'wordfreq-build-deps' command.
"""
TMPDIR = data_filename('tmp')


# Set this to True to rebuild the Twitter tokenization (which takes days)
PRETOKENIZE_TWITTER = False


def add_dep(lines, rule, input, output, extra=None, params=None):
    if isinstance(output, list):
        output = ' '.join(output)
    if isinstance(input, list):
        input = ' '.join(input)
    if extra:
        if isinstance(extra, list):
            extra = ' '.join(extra)
        extrastr = ' | ' + extra
    else:
        extrastr = ''
    build_rule = "build {output}: {rule} {input}{extra}".format(
        output=output, rule=rule, input=input, extra=extrastr
    )
    lines.append(build_rule)
    if params:
        for key, val in params.items():
            lines.append(" {key} = {val}".format(key=key, val=val))
    lines.append("")


def make_ninja_deps(rules_filename, out=sys.stdout):
    """
    Output a complete Ninja file describing how to build the wordfreq data.
    """
    print(HEADER, file=out)
    # Copy in the rules section
    with open(rules_filename, encoding='utf-8') as rulesfile:
        print(rulesfile.read(), file=out)

    lines = []
    if PRETOKENIZE_TWITTER:
        lines.extend(
            twitter_preprocess_deps(
                data_filename('raw-input/twitter/all-2014.txt'),
                slice_prefix=data_filename('slices/twitter/tweets-2014'),
                combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
                slices=40,
                languages=CONFIG['sources']['twitter']
            )
        )
    lines.extend(
        twitter_deps(
            data_filename('intermediate/twitter/tweets-2014'),
            languages=CONFIG['sources']['twitter']
        )
    )
    lines.extend(
        wikipedia_deps(
            data_filename('raw-input/wikipedia'),
            CONFIG['sources']['wikipedia']
        )
    )
    lines.extend(
        leeds_deps(
            data_filename('source-lists/leeds'),
            CONFIG['sources']['leeds']
        )
    )
    lines.extend(
        opensubtitles_deps(
            data_filename('source-lists/opensubtitles'),
            CONFIG['sources']['opensubtitles']
        )
    )
    lines.extend(combine_lists(all_languages()))

    print('\n'.join(lines), file=out)


def wikipedia_deps(dirname_in, languages):
    lines = []
    path_in = pathlib.Path(dirname_in)
    for language in languages:
        # Find the most recent file for this language
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        raw_file = wordlist_filename('wikipedia', language, 'txt')
        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, raw_file)
        add_dep(lines, 'wiki2tokens', input_file, token_file)
        add_dep(lines, 'count', token_file, count_file)
    return lines


def twitter_preprocess_deps(input_filename, slice_prefix,
                            combined_prefix, slices, languages):
    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
                   for num in range(slices)]
    # split the input into slices
    add_dep(lines,
            'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})

    for slicenum in range(slices):
        slice_file = slice_files[slicenum]
        language_outputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in languages
        ]
        add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
                params={'prefix': slice_file})

    for language in languages:
        combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language)

        language_inputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
            for slicenum in range(slices)
        ]
        add_dep(lines, 'cat', language_inputs, combined_output)
    return lines


def twitter_deps(prefix_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language)
        token_file = wordlist_filename('twitter', language, 'tokens.txt')
        add_dep(lines,
                'format_twitter', input_file, token_file,
                extra='wordfreq_builder/tokenizers.py')

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        add_dep(lines, 'count', token_file, count_file)

    return lines


def leeds_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/internet-{lang}-forms.num'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
        add_dep(lines, 'convert_leeds', input_file, reformatted_file)

    return lines


def opensubtitles_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
        add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)

    return lines


def combine_lists(languages):
    lines = []
    for language in languages:
        sources = source_names(language)
        input_files = [
            wordlist_filename(source, language, 'counts.txt')
            for source in sources
        ]
        output_file = wordlist_filename('combined', language)
        add_dep(lines, 'merge', input_files, output_file,
                extra='wordfreq_builder/word_counts.py')

        output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
        add_dep(lines, 'freqs2dB', output_file, output_dBpack,
                extra='wordfreq_builder/word_counts.py')
    return lines


def main():
    make_ninja_deps('rules.ninja')


if __name__ == '__main__':
    main()

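To make the generated output concrete, here is roughly what add_dep contributes to build.ninja, using made-up filenames:

# Toy demonstration of add_dep with made-up filenames: it appends one Ninja
# 'build' statement (plus optional per-build variables) to a list of lines.
from wordfreq_builder.ninja import add_dep

lines = []
add_dep(lines, 'count', 'data/example.tokens.txt', 'data/example.counts.txt')
add_dep(lines, 'split', 'data/all-2014.txt',
        ['data/tweets.part00', 'data/tweets.part01'],
        params={'prefix': 'data/tweets.part', 'slices': 2})

print('\n'.join(lines))
# build data/example.counts.txt: count data/example.tokens.txt
#
# build data/tweets.part00 data/tweets.part01: split data/all-2014.txt
#  prefix = data/tweets.part
#  slices = 2
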
@@ -1,33 +1,153 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
+from html.entities import name2codepoint
 import re
 
 
 ROSETTE = RosetteReader()
 
 
-def rosette_tokenizer(text):
-    analysis, lang = ROSETTE.rosette.analyze(text)
-    # I'm aware this doesn't do the right things with multi-word stems.
-    # Wordfreq doesn't either. And wordfreq isn't designed to look up
-    # multiple words anyway.
-    tokens = []
-    for (stem, pos, span) in analysis:
-        for subtoken in stem.split(' '):
-            tokens.append(subtoken + '|' + lang)
-    return tokens
+# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should
+# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette
+# really means is "Simplified Chinese", whose code is 'zh-Hans'.
+ROSETTE_LANG_MAP = {
+    'zh_sc': 'zh-Hans',
+    'zh_tc': 'zh-Hant',
+    'en_uc': 'en',
+}
+
+
+NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]')
+
+EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
+RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
+
+
+def last_tab(line):
+    """
+    Read lines by keeping only the last tab-separated value.
+    """
+    return line.split('\t')[-1].strip()
+
+
+def lowercase_text_filter(token):
+    if NON_PUNCT_RE.search(token):
+        return token.lower()
+    else:
+        return None
+
+
+def is_url(token):
+    return token.startswith('http:') or token.startswith('https:')
+
+
+def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
+    """
+    Process a file by running it through the given tokenizer, sorting the
+    results by the language of each line, and inserting spaces into lines
+    to mark the token boundaries. This computes the 'hard part' of
+    tokenization and allows the results to be saved, so that we can change
+    the finer details of the output without re-running everything.
+    """
+    out_files = {}
+    for line in open(in_filename, encoding='utf-8'):
+        text = line_reader(line)
+        tokens, language = tokenizer(text)
+        tokenized = ' '.join(tokens)
+        if language is not None:
+            out_filename = '%s.%s.txt' % (out_prefix, language)
+            if out_filename in out_files:
+                out_file = out_files[out_filename]
+            else:
+                out_file = open(out_filename, 'w', encoding='utf-8')
+                out_files[out_filename] = out_file
+            print(tokenized, file=out_file)
+    for out_file in out_files.values():
+        out_file.close()
+
+
+ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
+
+
+def fix_entities(text):
+    """
+    Fix the few HTML entities that Twitter uses -- even if they've
+    already been tokenized.
+    """
+    def replace_entity(match):
+        return chr(name2codepoint[match.group(1)])
+    return ENTITY_RE.sub(replace_entity, text)
+
+
+def retokenize(text):
+    text = fix_entities(text)
+    tokens = RETOKENIZE_RE.findall(text)
+    skip_next = False
+    for token in tokens:
+        if token == '/' or token == '@':
+            # Avoid idiosyncratic tokens such as URLs and
+            # usernames
+            skip_next = True
+        elif skip_next:
+            skip_next = False
+        else:
+            if not is_url(token):
+                filtered = lowercase_text_filter(token)
+                if filtered:
+                    yield filtered
+
+
+def retokenize_file(in_filename, out_filename):
+    """
+    Process a file that has been tokenized (by inserting spaces) in a
+    language-specific way by Rosette.
+    """
+    with open(in_filename, encoding='utf-8') as in_file:
+        with open(out_filename, 'w', encoding='utf-8') as out_file:
+            for line in in_file:
+                skip_next = False
+                for token in retokenize(line.strip()):
+                    if skip_next:
+                        skip_next = False
+                    elif token == '/' or token == '@':
+                        # Avoid idiosyncratic tokens such as URLs and
+                        # usernames
+                        skip_next = True
+                    elif lowercase_text_filter(token):
+                        print(token, file=out_file)
+
+
+def monolingual_tokenize_file(in_filename, out_filename, language,
+                              tokenizer, line_reader=last_tab,
+                              token_filter=lowercase_text_filter,
+                              sample_proportion=100):
+    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
+        with open(out_filename, 'w', encoding='utf-8') as out_file:
+            for i, line in enumerate(in_file):
+                if i % sample_proportion == 0:
+                    text = line_reader(line)
+                    tokens, line_language = tokenizer(text)
+                    if line_language == language:
+                        filtered = [token_filter(t) for t in tokens]
+                        filtered = [t for t in filtered if t is not None]
+                        for token in filtered:
+                            print(token, file=out_file)
 
 
 def rosette_surface_tokenizer(text):
-    analysis, lang = ROSETTE.rosette.analyze(text)
+    try:
+        analysis, lang = ROSETTE.rosette.analyze(text)
+    except (RuntimeError, UnicodeError):
+        # Our Rosette interface throws errors given arbitrary data. :(
+        return text, None
+    language = ROSETTE_LANG_MAP.get(lang, lang)
     tokens = []
     for (stem, pos, span) in analysis:
         surface_text = text[span[0]:span[1]]
-        for subtoken in surface_text.split(' '):
-            tokens.append(subtoken + '|' + lang)
-    return tokens
+        tokens.append(surface_text)
+    return tokens, language
 
 
-def treebank_surface_tokenizer(text):
+def treebank_surface_tokenizer(text, language='en'):
     """
     This is a simplified version of the Treebank tokenizer in NLTK.
 
@@ -45,6 +165,10 @@ def treebank_surface_tokenizer(text):
     as a result -- for example, it splits "wanna" into "wan" and "na", which
     are supposed to be considered unusual surface forms of "want" and "to".
     We just leave it as the word "wanna".
+
+    The language will just be returned, as this function isn't doing any
+    language detection. It defaults to 'en', as English is the language that
+    Treebank tokenization is designed for.
     """
     #starting quotes
     text = re.sub(r'^\"', r'``', text)
@@ -80,4 +204,4 @@ def treebank_surface_tokenizer(text):
     text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                   text)
 
-    return text.split()
+    return text.split(), language

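A small example of what the new retokenize() yields for an invented tweet-like line: the HTML entity is folded back to '&', punctuation-only tokens (including a bare '#' or '@') are filtered out, and the token following '@' or '/' is skipped so usernames and URL parts stay out of the counts. Note that importing the module still needs the internal lumi_science Rosette reader to be installed, since ROSETTE is created at import time.

# Illustration only; the input string is made up.
from wordfreq_builder.tokenizers import retokenize

print(list(retokenize('Fish &amp; chips with @somebody #dinner')))
# -> ['fish', 'chips', 'with', 'dinner']
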
@@ -1,116 +1,85 @@
-from wordfreq_builder.tokenizers import treebank_surface_tokenizer
+from wordfreq_builder.tokenizers import retokenize
 from collections import defaultdict
 from operator import itemgetter
-from pathlib import Path
-from unicodedata import normalize
+from ftfy import fix_text
+import math
 import csv
-import sys
+import msgpack
+import gzip
 
 
-def read_counts(path):
+def count_tokens(filename):
     counts = defaultdict(int)
-    with path.open(encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
-            val = float(strval)
-            # Use += so that, if we give the reader concatenated files with
-            # duplicates, it does the right thing
-            counts[key] += val
+    with open(filename, encoding='utf-8') as infile:
+        for line in infile:
+            for token in retokenize(line.strip()):
+                counts[token] += 1
     return counts
 
 
-def count_languages(counts):
-    langcounts = defaultdict(int)
-    for key, strval in counts.items():
-        val = int(strval)
-        text, lang = key.rsplit('|', 1)
-        langcounts[lang] += val
-    return langcounts
+def read_freqs(filename, cutoff=0):
+    raw_counts = defaultdict(float)
+    total = 0.
+    with open(filename, encoding='utf-8', newline='') as infile:
+        reader = csv.reader(infile)
+        for key, strval in reader:
+            val = float(strval)
+            if val < cutoff:
+                break
+            for token in retokenize(key):
+                token = fix_text(token)
+                total += val
+                # Use += so that, if we give the reader concatenated files with
+                # duplicates, it does the right thing
+                raw_counts[token] += val
+
+    freqs = {key: raw_count / total
+             for (key, raw_count) in raw_counts.items()}
+    return freqs
+
+
+def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
+    freq_cutoff = 10 ** (cutoff / 10.)
+    freqs = read_freqs(in_filename, freq_cutoff)
+    dBpack = []
+    for token, freq in freqs.items():
+        dB = round(math.log10(freq) * 10)
+        if dB >= cutoff:
+            neg_dB = -dB
+            while neg_dB >= len(dBpack):
+                dBpack.append([])
+            dBpack[neg_dB].append(token)
+
+    with gzip.open(out_filename, 'wb') as outfile:
+        msgpack.dump(dBpack, outfile)
 
 
-def merge_counts(count_dicts, balance=False):
+def merge_freqs(freq_dicts):
+    vocab = set()
+    for freq_dict in freq_dicts:
+        vocab |= set(freq_dict)
+
     merged = defaultdict(float)
-    maxweight = None
-    for counts in count_dicts:
-        if balance:
-            if maxweight is None:
-                maxweight = max(counts.values())
-            weight = maxweight / max(counts.values()) / len(count_dicts)
-        else:
-            weight = 1.
-        for key, val in counts.items():
-            merged[key] += val * weight
+    N = len(freq_dicts)
+    for term in vocab:
+        term_total = 0.
+        for freq_dict in freq_dicts:
+            term_total += freq_dict.get(term, 0.)
+        merged[term] = term_total / N
+
     return merged
 
 
-def write_counts(counts, path, cutoff=2):
-    print("Writing to %s" % path)
-    with path.open('w', encoding='utf-8', newline='') as outfile:
+def write_wordlist(freqs, filename, cutoff=1e-8):
+    """
+    Write a dictionary of either raw counts or frequencies to a file of
+    comma-separated values.
+    """
+    with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
         writer = csv.writer(outfile)
-        items = sorted(counts.items(), key=itemgetter(1), reverse=True)
-        for word, count in items:
-            if count < cutoff:
-                # Don't write all the terms that appeared too infrequently
-                break
+        items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
+        for word, freq in items:
+            if freq < cutoff:
+                break
             if not ('"' in word or ',' in word):
-                writer.writerow([word, str(int(count))])
+                writer.writerow([word, str(freq)])
-
-
-class WordCountBuilder:
-    def __init__(self, unique_docs=True, tokenizer=None):
-        self.counts = defaultdict(int)
-        self.unique_docs = unique_docs
-        if tokenizer is None:
-            self.tokenizer = treebank_surface_tokenizer
-        else:
-            self.tokenizer = tokenizer
-
-    def add_text(self, text):
-        text = normalize('NFKC', text).lower()
-        try:
-            tokens = self.tokenizer(text)
-            # print(' '.join(tokens))
-        except Exception as e:
-            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
-            return
-        if self.unique_docs:
-            tokens = set(tokens)
-        for tok in tokens:
-            self.counts[tok] += 1
-
-    def count_wikipedia(self, path):
-        """
-        Read a directory of extracted Wikipedia articles. The articles can be
-        grouped together into files, in which case they should be separated by
-        lines beginning with ##.
-        """
-        with path.open(encoding='utf-8') as file:
-            article_lines = []
-            for line in file:
-                line = line.strip()
-                if line.startswith('= ') and line.endswith(' ='):
-                    # Fake level-1 headings indicate boundaries between articles
-                    print(line)
-                    self.try_wiki_article(' '.join(article_lines))
-                    article_lines.clear()
-                else:
-                    # Skip other headings, so that "external" doesn't look
-                    # ridiculously common, for example
-                    if not (line.startswith('==') and line.endswith('==')):
-                        article_lines.append(line)
-            self.try_wiki_article(' '.join(article_lines))
-
-    def try_wiki_article(self, text):
-        if len(text) > 1000:
-            self.add_text(text)
-
-    def count_twitter(self, path, offset, nsplit):
-        with path.open(encoding='utf-8') as file:
-            for i, line in enumerate(file):
-                if i % nsplit == offset:
-                    line = line.strip()
-                    text = line.split('\t')[-1]
-                    self.add_text(text)
-
-    def save_wordlist(self, path):
-        write_counts(self.counts, path)

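The 'dBpack' written by freqs_to_dBpack groups words by how many decibels their frequency lies below 1. A sketch of just that arithmetic, leaving out the msgpack/gzip serialization above:

# Sketch of the dB bucketing used by freqs_to_dBpack, without the file I/O.
# A frequency f goes into bucket -round(10 * log10(f)); reading it back only
# recovers the frequency to the nearest decibel.
import math

def dB_bucket(freq):
    return -round(math.log10(freq) * 10)

def approx_freq(bucket):
    return 10 ** (-bucket / 10)

assert dB_bucket(0.001) == 30      # 1e-3 is 30 dB below a frequency of 1
assert dB_bucket(1e-6) == 60       # the default cutoff of -60 dB
assert abs(approx_freq(30) - 0.001) < 1e-9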