diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore deleted file mode 100644 index a1da2e9..0000000 --- a/wordfreq_builder/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -*.pyc -__pycache__ -.coverage -.idea -dist -*.egg-info -build -_build -build.ninja -data -.ninja_deps -.ninja_log diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile deleted file mode 100644 index 626cf46..0000000 --- a/wordfreq_builder/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -PYTHON = python - -all: build.ninja - -# build the Ninja file that will take over the build process -build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO - $(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja - diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md deleted file mode 100644 index af47613..0000000 --- a/wordfreq_builder/README.md +++ /dev/null @@ -1,194 +0,0 @@ -# wordfreq\_builder - -This package builds the data files for [wordfreq](https://github.com/LuminosoInsight/wordfreq). - -It requires a fair amount of external input data (42 GB of it, as of this -writing), which unfortunately we don't have a plan for how to distribute -outside of Luminoso yet. - -The data can be publicly obtained in various ways, so here we'll at least -document where it comes from. We hope to come up with a process that's more -reproducible eventually. - -The good news is that you don't need to be able to run this process to use -wordfreq. The built results are already in the `wordfreq/data` directory. - -## How to build it - -Set up your external hard disk, your networked file system, or whatever thing -you have that's got a couple hundred GB of space free. Let's suppose the -directory of it that you want to use is called `/ext/data`. - -Get the input data. At Luminoso, this is available in the directory -`/nfs/broadway/data/wordfreq_builder`. The sections below explain where the -data comes from. - -Copy the input data: - - cp -rv /nfs/broadway/data/wordfreq_builder /ext/data/ - -Make a symbolic link so that `data/` in this directory points to -your copy of the input data: - - ln -s /ext/data/wordfreq_builder data - -Install the Ninja build system: - - sudo apt-get install ninja-build - -We need to build a Ninja build file using the Python code in -`wordfreq_builder/ninja.py`. We could do this with Ninja, but... you see the -chicken-and-egg problem, don't you. So this is the one thing the Makefile -knows how to do. - - make - -Start the build, and find something else to do for a few hours: - - ninja -v - -You can copy the results into wordfreq with this command: - - cp data/dist/*.msgpack.gz ../wordfreq/data/ - - -## The Ninja build process - -Ninja is a lot like Make, except with one big {drawback|advantage}: instead of -writing bizarre expressions in an idiosyncratic language to let Make calculate -which files depend on which other files... - -...you just tell Ninja which files depend on which other files. - -The Ninja documentation suggests using your favorite scripting language to -create the dependency list, so that's what we've done in `ninja.py`. - -Dependencies in Ninja refer to build rules. These do need to be written by hand -in Ninja's own format, but the task is simpler. In this project, the build -rules are defined in `rules.ninja`. They'll be concatenated with the -Python-generated dependency definitions to form the complete build file, -`build.ninja`, which is the default file that Ninja looks at when you run -`ninja`. 
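To make that division of labor concrete, here is a simplified sketch of the generated half, modeled on the `add_dep` helper in `wordfreq_builder/ninja.py`: each dependency becomes one `build` line, optionally followed by indented per-target variables. (The wordlist paths below are only illustrative.)

    def add_dep(lines, rule, input, output, params=None):
        # Simplified from add_dep() in wordfreq_builder/ninja.py: emit one
        # "build <output>: <rule> <input>" line, plus any per-target variables.
        lines.append('build {output}: {rule} {input}'.format(
            output=output, rule=rule, input=input))
        if params:
            for key, val in params.items():
                lines.append('  {key} = {val}'.format(key=key, val=val))
        lines.append('')

    lines = []
    add_dep(lines, 'count',
            'data/generated/wikipedia/wikipedia_en.txt',
            'data/generated/wikipedia/wikipedia_en.counts.txt')
    print('\n'.join(lines))
    # build data/generated/wikipedia/wikipedia_en.counts.txt: count data/generated/wikipedia/wikipedia_en.txt

The real helper also accepts lists of inputs and outputs and a list of extra implicit dependencies, but the idea is the same: the Python code decides what depends on what, and Ninja just runs the commands.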
- -So a lot of the interesting work in this package is done in `rules.ninja`. -This file defines shorthand names for long commands. As a simple example, -the rule named `count` applies the command - - python -m wordfreq_builder.cli.count_tokens $in $out - -to the dependency file `$in` and the output file `$out`. - -The specific rules are described by the comments in `rules.ninja`. - -## Data sources - -### Leeds Internet Corpus - -Also known as the "Web as Corpus" project, this is a University of Leeds -project that collected wordlists in assorted languages by crawling the Web. -The results are messy, but they're something. We've been using them for quite -a while. - -These files can be downloaded from the [Leeds corpus page][leeds]. - -The original files are in `data/source-lists/leeds`, and they're processed -by the `convert_leeds` rule in `rules.ninja`. - -[leeds]: http://corpus.leeds.ac.uk/list.html - -### Twitter - -The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets -collected by the `ftfy.streamtester` package in 2014. - -We are not allowed to distribute the text of tweets. However, this process could -be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for -a couple of weeks. - -[ftfy]: https://github.com/LuminosoInsight/python-ftfy - -### Google Books - -We use English word frequencies from [Google Books Syntactic Ngrams][gbsn]. -We pretty much ignore the syntactic information, and only use this version -because it's cleaner. The data comes in the form of 99 gzipped text files in -`data/raw-input/google-books`. - -[gbsn]: http://commondatastorage.googleapis.com/books/syntactic-ngrams/index.html - -### Wikipedia - -Another source we use is the full text of Wikipedia in various languages. This -text can be difficult to extract efficiently, and for this purpose we use a -custom tool written in Nim 0.11, called [wiki2text][]. To build the Wikipedia -data, you need to separately install Nim and wiki2text. - -The input data files are the XML dumps that can be found on the [Wikimedia -backup index][wikidumps]. For example, to get the latest French data, go to -https://dumps.wikimedia.org/frwiki/latest and look for the filename of the form -`*.pages-articles.xml.bz2`. If this file isn't there, look for an older dump -where it is. You'll need to download such a file for each language that's -configured for Wikipedia in `wordfreq_builder/config.py`. - -[wiki2text]: https://github.com/rspeer/wiki2text -[wikidumps]: https://dumps.wikimedia.org/backup-index.html - -### OpenSubtitles - -[Hermit Dave](https://invokeit.wordpress.com/frequency-word-lists/) made word -frequency lists out of the subtitle text on OpenSubtitles. This data was -used to make Wiktionary word frequency lists at one point, but it's been -updated significantly since the version Wiktionary got. - -The wordlists are in `data/source-lists/opensubtitles`. - -In order to fit into the wordfreq pipeline, we renamed lists with different variants -of the same language code, to distinguish them fully according to BCP 47.
Then we -concatenated the different variants into a single list, as follows: - -* `zh_tw.txt` was renamed to `zh-Hant.txt` -* `zh_cn.txt` was renamed to `zh-Hans.txt` -* `zh.txt` was renamed to `zh-Hani.txt` -* `zh-Hant.txt`, `zh-Hans.txt`, and `zh-Hani.txt` were concatenated into `zh.txt` -* `pt.txt` was renamed to `pt-PT.txt` -* `pt_br.txt` was renamed to `pt-BR.txt` -* `pt-BR.txt` and `pt-PT.txt` were concatenated into `pt.txt` - -We also edited the English data to re-add "'t" to words that had obviously lost -it, such as "didn" in the place of "didn't". We applied this to words that -became much less common words in the process, which means this wordlist no -longer represents the words 'don' and 'won', as we assume most of their -frequency comes from "don't" and "won't". Words that turned into similarly -common words, however, were left alone: this list doesn't represent "can't" -because the word was left as "can". - -### SUBTLEX - -Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in -wordfreq and derived works without the "academic use" restriction, under the -following reasonable conditions: - -- Wordfreq and code derived from it must credit the SUBTLEX authors. - (See the citations in the top-level `README.md` file.) -- It must remain clear that SUBTLEX is freely available data. - -`data/source-lists/subtlex` contains the following files: - -- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de], - and exported from Excel format to tab-separated UTF-8 using LibreOffice -- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us], - extracted, and converted from ISO-8859-1 to UTF-8 -- `subtlex.en-GB.txt`, which was downloaded as - [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to - tab-separated UTF-8 using LibreOffice -- `subtlex.nl.txt`, which was downloaded as - [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted -- `subtlex.zh.txt`, which was downloaded as - [subtlexch131210.zip][subtlex-ch] and extracted - -[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx -[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip -[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx -[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip -[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip - diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png deleted file mode 100644 index 64e5945..0000000 Binary files a/wordfreq_builder/build.png and /dev/null differ diff --git a/wordfreq_builder/lib/jq-linux64 b/wordfreq_builder/lib/jq-linux64 deleted file mode 100755 index 939227e..0000000 Binary files a/wordfreq_builder/lib/jq-linux64 and /dev/null differ diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja deleted file mode 100644 index 29d90a4..0000000 --- a/wordfreq_builder/rules.ninja +++ /dev/null @@ -1,117 +0,0 @@ -# This defines the rules on how to build parts of the wordfreq lists, using the -# Ninja build system: -# -# http://martine.github.io/ninja/manual.html -# -# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with -# better parallelism and the ability for build steps to produce multiple -# outputs. The tradeoff is that its rule syntax isn't full of magic for -# expanding wildcards and finding dependencies, so in general you have to -# write the dependencies using a script. 
-# -# This file will become the header of the larger build.ninja file, which also -# contains the programmatically-defined dependency graph. - -# Variables -JQ = lib/jq-linux64 - -# How to build the build.ninja file itself. (Use the Makefile to get it the -# first time.) -rule build_deps - command = python -m wordfreq_builder.cli.build_deps $in > $out - -# Splits the single file $in into $slices parts, whose names will be -# $prefix plus a two-digit numeric suffix. -rule split - command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix - -# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from -# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at -# https://github.com/rspeer/wiki2text. -rule wiki2text - command = bunzip2 -c $in | wiki2text > $out - -# To tokenize Japanese, we run it through MeCab and take the first column. -rule tokenize_japanese - command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out - -# Process Chinese by converting all Traditional Chinese characters to -# Simplified equivalents -- not because that's a good way to get readable -# text, but because that's how we're going to look them up. -rule simplify_chinese - command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out - -# Tokenizing text from Twitter requires us to language-detect and tokenize -# in the same step. -rule tokenize_twitter - command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix - -rule tokenize_reddit - command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix - -# To convert the Leeds corpus, look for space-separated lines that start with -# an integer and a decimal. The integer is the rank, which we discard. The -# decimal is the frequency, and the remaining text is the term. Use sed -n -# with /p to output only lines where the match was successful. -# -# Grep out the term "EOS", an indication that Leeds used MeCab and didn't -# strip out the EOS lines. -rule convert_leeds - command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out - -# To convert the OpenSubtitles frequency data, simply replace spaces with -# commas. -rule convert_opensubtitles - command = tr ' ' ',' < $in > $out - -# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, -# run it through ftfy, convert tabs to commas and spurious CSV formatting to -# spaces, and remove lines with unfixable half-mojibake. -rule convert_subtlex - command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out - -rule convert_jieba - command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out - -rule counts_to_jieba - command = python -m wordfreq_builder.cli.counts_to_jieba $in $out - - -# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all -# the input files, keep only the single words and their counts, and only keep -# lines with counts of 100 or more. -# -# (These will still be repeated as the word appears in different grammatical -# roles, information that the source data provides that we're discarding. The -# source data was already filtered to only show words in roles with at least -# two-digit counts of occurrences.)
-rule convert_google_syntactic_ngrams - command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out - -rule count - command = python -m wordfreq_builder.cli.count_tokens $in $out - -rule count_langtagged - command = python -m wordfreq_builder.cli.count_tokens_langtagged $in $out -l $language - -rule merge - command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in - -rule merge_counts - command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in - -rule freqs2cB - command = python -m wordfreq_builder.cli.freqs_to_cB $in $out -b $buckets - -rule cat - command = cat $in > $out - -# A pipeline that extracts text from Reddit comments: -# - Unzip the input files -# - Select the body of comments, but only those whose Reddit score is positive -# (skipping the downvoted ones) -# - Skip deleted comments -# - Replace HTML escapes -rule extract_reddit - command = bunzip2 -c $in | $JQ -r 'select(.score > 0) | .body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' > $out - diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py deleted file mode 100755 index 5b0f739..0000000 --- a/wordfreq_builder/setup.py +++ /dev/null @@ -1,13 +0,0 @@ -from setuptools import setup - -setup( - name="wordfreq_builder", - version='0.2', - maintainer='Luminoso Technologies, Inc.', - maintainer_email='info@luminoso.com', - url='http://github.com/LuminosoInsight/wordfreq_builder', - platforms=["any"], - description="Turns raw data into word frequency lists", - packages=['wordfreq_builder'], - install_requires=['msgpack-python', 'pycld2', 'langcodes'] -) diff --git a/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py deleted file mode 100644 index 2fbc477..0000000 --- a/wordfreq_builder/tests/test_tokenizer.py +++ /dev/null @@ -1,51 +0,0 @@ -from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language -from nose.tools import eq_ - - -def test_tokenizer_1(): - text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."' - tokens = [ - 'this', 'is', 'a', 'test', 'she', 'said', - 'and', "i'll", 'bet', "y", "all", '3.50', 'that', - 'it', "won't", 'fail', - ] - result = cld2_surface_tokenizer(text) - eq_(result[1], tokens) - eq_(result[0], 'en') - -def test_tokenizer_2(): - text = "i use punctuation informally...see?like this." - tokens = [ - 'i', 'use', 'punctuation', 'informally', 'see', - 'like', 'this' - ] - result = cld2_surface_tokenizer(text) - eq_(result[1], tokens) - eq_(result[0], 'en') - -def test_tokenizer_3(): - text = "@ExampleHandle This parser removes twitter handles!"
- tokens = ['this', 'parser', 'removes', 'twitter', 'handles'] - result = cld2_surface_tokenizer(text) - eq_(result[1], tokens) - eq_(result[0], 'en') - -def test_tokenizer_4(): - text = "This is a really boring example tco http://t.co/n15ASlkase" - tokens = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco'] - result = cld2_surface_tokenizer(text) - eq_(result[1], tokens) - eq_(result[0], 'en') - - -def test_language_recognizer_1(): - text = "Il est le meilleur livre que je ai jamais lu" - result = cld2_detect_language(text) - eq_(result, 'fr') - -def test_language_recognizer_2(): - text = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort, - é uma nuvem esférica de planetesimais voláteis que se acredita - localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol.""" - result = cld2_detect_language(text) - eq_(result, 'pt') diff --git a/wordfreq_builder/tests/test_urls.py b/wordfreq_builder/tests/test_urls.py deleted file mode 100644 index 688a0b8..0000000 --- a/wordfreq_builder/tests/test_urls.py +++ /dev/null @@ -1,20 +0,0 @@ -from wordfreq_builder.word_counts import URL_RE -from nose.tools import eq_ - - -def check_url(url): - match = URL_RE.match(url) - assert match - eq_(match.span(), (0, len(url))) - - -def test_url_re(): - # URLs like this are all over the Arabic Wikipedia. Here's one with the - # student ID blanked out. - yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx' - - yield check_url, 'https://example.com/űnicode.html' - yield check_url, 'http://☃.net' - - assert not URL_RE.match('ftp://127.0.0.1') - diff --git a/wordfreq_builder/wordfreq_builder/__init__.py b/wordfreq_builder/wordfreq_builder/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/wordfreq_builder/wordfreq_builder/cli/__init__.py b/wordfreq_builder/wordfreq_builder/cli/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/wordfreq_builder/wordfreq_builder/cli/build_deps.py b/wordfreq_builder/wordfreq_builder/cli/build_deps.py deleted file mode 100644 index 3fd74ad..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/build_deps.py +++ /dev/null @@ -1,15 +0,0 @@ -from wordfreq_builder.ninja import make_ninja_deps -import argparse - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('in_filename', help='filename of rules file') - args = parser.parse_args() - - # Make the complete ninja file and write it to standard out - make_ninja_deps(args.in_filename) - - -if __name__ == '__main__': - main() diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py deleted file mode 100644 index 56b93cb..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py +++ /dev/null @@ -1,15 +0,0 @@ -from wordfreq_builder.word_counts import count_tokens, write_wordlist -import argparse - - -def handle_counts(filename_in, filename_out): - counts = count_tokens(filename_in) - write_wordlist(counts, filename_out) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename_in', help='name of input file containing tokens') - parser.add_argument('filename_out', help='name of output file') - args = parser.parse_args() - handle_counts(args.filename_in, args.filename_out) diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens_langtagged.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens_langtagged.py deleted file mode 100644 index 8b601ac..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_tokens_langtagged.py +++ 
/dev/null @@ -1,21 +0,0 @@ -""" -Count tokens of text in a particular language, taking input from a -tab-separated file whose first column is a language code. Lines in all -languages except the specified one will be skipped. -""" -from wordfreq_builder.word_counts import count_tokens_langtagged, write_wordlist -import argparse - - -def handle_counts(filename_in, filename_out, lang): - counts = count_tokens_langtagged(filename_in, lang) - write_wordlist(counts, filename_out) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename_in', help='name of input file containing tokens') - parser.add_argument('filename_out', help='name of output file') - parser.add_argument('-l', '--language', help='language tag to filter lines for') - args = parser.parse_args() - handle_counts(args.filename_in, args.filename_out, args.language) diff --git a/wordfreq_builder/wordfreq_builder/cli/counts_to_jieba.py b/wordfreq_builder/wordfreq_builder/cli/counts_to_jieba.py deleted file mode 100644 index b5273c7..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/counts_to_jieba.py +++ /dev/null @@ -1,15 +0,0 @@ -from wordfreq_builder.word_counts import read_values, write_jieba -import argparse - - -def handle_counts(filename_in, filename_out): - freqs, total = read_values(filename_in, cutoff=1e-6) - write_jieba(freqs, filename_out) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename_in', help='name of input wordlist') - parser.add_argument('filename_out', help='name of output Jieba-compatible wordlist') - args = parser.parse_args() - handle_counts(args.filename_in, args.filename_out) diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py deleted file mode 100644 index 5dc6966..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py +++ /dev/null @@ -1,14 +0,0 @@ -from wordfreq_builder.word_counts import freqs_to_cBpack -import argparse - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename_in', help='name of input file containing tokens') - parser.add_argument('filename_out', help='name of output file') - parser.add_argument('-b', '--buckets', type=int, default=600, - help='Number of centibel buckets to include (default 600). 
' - 'Increasing this number creates a longer wordlist with ' - 'rarer words.') - args = parser.parse_args() - freqs_to_cBpack(args.filename_in, args.filename_out, cutoff=-(args.buckets)) diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py deleted file mode 100644 index 6413024..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py +++ /dev/null @@ -1,25 +0,0 @@ -from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist -import argparse - - -def merge_lists(input_names, output_name, cutoff=0, max_words=1000000): - count_dicts = [] - for input_name in input_names: - values, total = read_values(input_name, cutoff=cutoff, max_words=max_words) - count_dicts.append(values) - merged = merge_counts(count_dicts) - write_wordlist(merged, output_name) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-o', '--output', default='combined-counts.csv', - help='filename to write the output to') - parser.add_argument('-c', '--cutoff', type=int, default=0, - help='minimum count to read from an input file') - parser.add_argument('-m', '--max-words', type=int, default=1000000, - help='maximum number of words to read from each list') - parser.add_argument('inputs', nargs='+', - help='names of input files to merge') - args = parser.parse_args() - merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_words=args.max_words) diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py deleted file mode 100644 index e16660b..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py +++ /dev/null @@ -1,31 +0,0 @@ -from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist -import argparse - - -def merge_lists(input_names, output_name, cutoff, lang): - freq_dicts = [] - - # Don't use Chinese tokenization while building wordlists, as that would - # create a circular dependency. 
- if lang == 'zh': - lang = None - - for input_name in input_names: - freq_dicts.append(read_freqs(input_name, cutoff=cutoff, lang=lang)) - merged = merge_freqs(freq_dicts) - write_wordlist(merged, output_name) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-o', '--output', default='combined-freqs.csv', - help='filename to write the output to') - parser.add_argument('-c', '--cutoff', type=int, default=2, - help='stop after seeing a count below this') - parser.add_argument('-l', '--language', default=None, - help='language code for which language the words are in') - parser.add_argument('inputs', nargs='+', - help='names of input files to merge') - args = parser.parse_args() - merge_lists(args.inputs, args.output, args.cutoff, args.language) - diff --git a/wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py b/wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py deleted file mode 100644 index 13237b6..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py +++ /dev/null @@ -1,11 +0,0 @@ -from wordfreq.chinese import simplify_chinese -import sys - - -def main(): - for line in sys.stdin: - sys.stdout.write(simplify_chinese(line)) - - -if __name__ == '__main__': - main() diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py deleted file mode 100644 index 829853c..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py +++ /dev/null @@ -1,18 +0,0 @@ -from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language -import argparse - - -def reddit_tokenizer(text): - return cld2_surface_tokenizer(text, mode='reddit') - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='filename of input file containing one comment per line') - parser.add_argument('outprefix', help='prefix of output filenames') - args = parser.parse_args() - tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer) - - -if __name__ == '__main__': - main() diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py deleted file mode 100644 index d144866..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py +++ /dev/null @@ -1,14 +0,0 @@ -from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language -import argparse - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='filename of input file containing one tweet per line') - parser.add_argument('outprefix', help='prefix of output filenames') - args = parser.parse_args() - tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer) - - -if __name__ == '__main__': - main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py deleted file mode 100644 index a623cac..0000000 --- a/wordfreq_builder/wordfreq_builder/config.py +++ /dev/null @@ -1,131 +0,0 @@ -import os - -CONFIG = { - # data_dir is a relative or absolute path to where the wordlist data - # is stored - 'data_dir': 'data', - 'sources': { - # A list of language codes that we'll look up in filenames for these - # various data sources. 
- # - # Consider adding: - # 'th' when we get tokenization for it - # 'tl' with one more data source - # 'el' if we can filter out kaomoji - 'twitter': [ - 'ar', 'ca', 'de', 'en', 'es', 'fr', 'he', 'hi', 'id', 'it', - 'ja', 'ko', 'ms', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr' - ], - # Languages with large Wikipedias. (Languages whose Wikipedia dump is - # at least 200 MB of .xml.bz2 are included. Some widely-spoken - # languages with 100 MB are also included, specifically Malay and - # Hindi.) - 'wikipedia': [ - 'ar', 'ca', 'de', 'el', 'en', 'es', 'fr', 'he', 'hi', 'id', 'it', - 'ja', 'ko', 'ms', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr', 'zh', - 'bg', 'da', 'fi', 'hu', 'ro', 'uk' - ], - 'opensubtitles': [ - # This list includes languages where the most common word in - # OpenSubtitles appears at least 5000 times. However, we exclude - # languages where SUBTLEX has apparently done a better job, - # specifically German and Chinese. - 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'el', 'en', 'es', 'et', - 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv', - 'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', - 'sr', 'sv', 'tr', 'uk' - ], - 'leeds': [ - 'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh' - ], - 'google-books': [ - 'en', - # Using the 2012 data, we could get French, German, Italian, - # Russian, Spanish, and (Simplified) Chinese. - ], - 'subtlex-en': ['en'], - 'subtlex-other': ['de', 'nl', 'zh'], - 'jieba': ['zh'], - - # About 99.2% of Reddit is in English. There are pockets of - # conversation in other languages, some of which may not be - # representative enough for learning general word frequencies. - # - # However, there seem to be Spanish subreddits that are general enough - # (including /r/es and /r/mexico). - 'reddit': ['en', 'es'], - - # Well-represented languages in the Common Crawl - # It's possible we could add 'uk' to the list, needs more checking - 'commoncrawl': [ - 'ar', 'bg', 'cs', 'da', 'de', 'el', 'es', 'fa', 'fi', 'fr', - 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ko', 'ms', 'nb', 'nl', - 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'ta', 'tr', 'vi', 'zh' - ], - }, - # Subtlex languages that need to be pre-processed - 'wordlist_paths': { - 'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}', - 'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}', - 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', - 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', - 'google-books': 'generated/google-books/google_books_{lang}.{ext}', - 'commoncrawl': 'generated/commoncrawl/commoncrawl_{lang}.{ext}', - 'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}', - 'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}', - 'jieba': 'generated/jieba/jieba_{lang}.{ext}', - 'reddit': 'generated/reddit/reddit_{lang}.{ext}', - 'combined': 'generated/combined/combined_{lang}.{ext}', - 'combined-dist': 'dist/combined_{lang}.{ext}', - 'combined-dist-large': 'dist/large_{lang}.{ext}', - 'twitter-dist': 'dist/twitter_{lang}.{ext}', - 'jieba-dist': 'dist/jieba_{lang}.{ext}' - }, - 'min_sources': 3, - 'big-lists': ['en', 'fr', 'es', 'pt', 'de', 'ar', 'it', 'nl', 'ru'], - # When dealing with language tags that come straight from cld2, we need - # to un-standardize a few of them - 'cld2-language-aliases': { - 'nb': 'no', - 'he': 'iw', - 'jw': 'jv' - } -} - - -def data_filename(filename): - """ - Convert a relative filename to a path inside the configured data_dir. 
- """ - return os.path.join(CONFIG['data_dir'], filename) - - -def wordlist_filename(source, language, extension='txt'): - """ - Get the path where a particular built wordlist should go, parameterized by - its language and its file extension. - """ - path = CONFIG['wordlist_paths'][source].format( - lang=language, ext=extension - ) - return data_filename(path) - - -def source_names(language): - """ - Get the names of data sources that supply data for the given language. - """ - return sorted(key for key in CONFIG['sources'] - if language in CONFIG['sources'][key]) - - -def all_languages(): - """ - Get all languages that should have their data built, which is those that - are supported by at least `min_sources` sources. - """ - languages = set() - for langlist in CONFIG['sources'].values(): - languages |= set(langlist) - return [lang for lang in sorted(languages) - if len(source_names(lang)) >= CONFIG['min_sources']] diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py deleted file mode 100644 index c522c7b..0000000 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ /dev/null @@ -1,421 +0,0 @@ -from wordfreq_builder.config import ( - CONFIG, data_filename, wordlist_filename, all_languages, source_names -) -import sys -import pathlib -import itertools -from collections import defaultdict - - -HEADER = """# This file is automatically generated. Do not edit it. -# You can change its behavior by editing wordfreq_builder/ninja.py, -# and regenerate it by running 'make'. -""" -TMPDIR = data_filename('tmp') - - -def add_dep(lines, rule, input, output, extra=None, params=None): - if isinstance(output, list): - output = ' '.join(output) - if isinstance(input, list): - input = ' '.join(input) - if extra: - if isinstance(extra, list): - extra = ' '.join(extra) - extrastr = ' | ' + extra - else: - extrastr = '' - build_rule = "build {output}: {rule} {input}{extra}".format( - output=output, rule=rule, input=input, extra=extrastr - ) - lines.append(build_rule) - if params: - for key, val in params.items(): - lines.append(" {key} = {val}".format(key=key, val=val)) - lines.append("") - - -def make_ninja_deps(rules_filename, out=sys.stdout): - """ - Output a complete Ninja file describing how to build the wordfreq data. - """ - print(HEADER, file=out) - # Copy in the rules section - with open(rules_filename, encoding='utf-8') as rulesfile: - print(rulesfile.read(), file=out) - - lines = [] - # The first dependency is to make sure the build file is up to date. 
- add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja', - extra='wordfreq_builder/ninja.py') - lines.extend(itertools.chain( - twitter_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), - slices=40, - languages=CONFIG['sources']['twitter'] - ), - wikipedia_deps( - data_filename('raw-input/wikipedia'), - CONFIG['sources']['wikipedia'] - ), - google_books_deps( - data_filename('raw-input/google-books') - ), - leeds_deps( - data_filename('source-lists/leeds'), - CONFIG['sources']['leeds'] - ), - opensubtitles_deps( - data_filename('source-lists/opensubtitles'), - CONFIG['sources']['opensubtitles'] - ), - subtlex_en_deps( - data_filename('source-lists/subtlex'), - CONFIG['sources']['subtlex-en'] - ), - subtlex_other_deps( - data_filename('source-lists/subtlex'), - CONFIG['sources']['subtlex-other'] - ), - reddit_deps( - data_filename('raw-input/reddit'), - CONFIG['sources']['reddit'] - ), - jieba_deps( - data_filename('source-lists/jieba'), - CONFIG['sources']['jieba'] - ), - commoncrawl_deps( - data_filename('raw-input/commoncrawl'), - CONFIG['sources']['commoncrawl'] - ), - combine_lists(all_languages()) - )) - - print('\n'.join(lines), file=out) - - -def wikipedia_deps(dirname_in, languages): - lines = [] - path_in = pathlib.Path(dirname_in) - for language in languages: - # Find the most recent file for this language - input_file = max(path_in.glob('{}wiki*.bz2'.format(language))) - plain_text_file = wordlist_filename('wikipedia', language, 'txt') - count_file = wordlist_filename('wikipedia', language, 'counts.txt') - - add_dep(lines, 'wiki2text', input_file, plain_text_file) - if language == 'ja': - mecab_token_file = wordlist_filename( - 'wikipedia', language, 'mecab-tokens.txt' - ) - add_dep( - lines, 'tokenize_japanese', plain_text_file, mecab_token_file - ) - add_dep(lines, 'count', mecab_token_file, count_file) - else: - add_dep(lines, 'count', plain_text_file, count_file) - - return lines - - -def commoncrawl_deps(dirname_in, languages): - lines = [] - for language in languages: - if language in CONFIG['cld2-language-aliases']: - language_alias = CONFIG['cld2-language-aliases'][language] - else: - language_alias = language - input_file = dirname_in + '/{}.txt.gz'.format(language_alias) - count_file = wordlist_filename('commoncrawl', language, 'counts.txt') - add_dep(lines, 'count_langtagged', input_file, count_file, params={'language': language_alias}) - return lines - - -def google_books_deps(dirname_in): - # Get English data from the split-up files of the Google Syntactic N-grams - # 2013 corpus. - lines = [] - - # Yes, the files are numbered 00 through 98 of 99. This is not an - # off-by-one error. Not on my part, anyway. 
- input_files = [ - '{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i) - for i in range(99) - ] - output_file = wordlist_filename('google-books', 'en', 'counts.txt') - add_dep(lines, 'convert_google_syntactic_ngrams', input_files, output_file) - return lines - - -def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, - languages): - lines = [] - - slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, - num=num) - for num in range(slices)] - # split the input into slices - add_dep(lines, 'split', input_filename, slice_files, - params={'prefix': '{}.part'.format(slice_prefix), - 'slices': slices}) - - for slicenum in range(slices): - slice_file = slice_files[slicenum] - language_outputs = [ - '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) - for language in languages - ] - add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, - params={'prefix': slice_file}, - extra='wordfreq_builder/tokenizers.py') - - for language in languages: - combined_output = wordlist_filename('twitter', language, 'tokens.txt') - language_inputs = [ - '{prefix}.{lang}.txt'.format( - prefix=slice_files[slicenum], lang=language - ) - for slicenum in range(slices) - ] - add_dep(lines, 'cat', language_inputs, combined_output) - - count_file = wordlist_filename('twitter', language, 'counts.txt') - - if language == 'ja': - mecab_token_file = wordlist_filename( - 'twitter', language, 'mecab-tokens.txt') - add_dep( - lines, 'tokenize_japanese', combined_output, mecab_token_file) - combined_output = mecab_token_file - - add_dep(lines, 'count', combined_output, count_file, - extra='wordfreq_builder/tokenizers.py') - - return lines - - -def leeds_deps(dirname_in, languages): - lines = [] - for language in languages: - input_file = '{prefix}/internet-{lang}-forms.num'.format( - prefix=dirname_in, lang=language - ) - if language == 'zh': - step2_file = wordlist_filename('leeds', 'zh-Hans', 'converted.txt') - add_dep(lines, 'simplify_chinese', input_file, step2_file) - else: - step2_file = input_file - - reformatted_file = wordlist_filename('leeds', language, 'counts.txt') - add_dep(lines, 'convert_leeds', step2_file, reformatted_file) - - return lines - - -def opensubtitles_deps(dirname_in, languages): - lines = [] - for language in languages: - input_file = '{prefix}/{lang}.txt'.format( - prefix=dirname_in, lang=language - ) - if language == 'zh': - step2_file = wordlist_filename('opensubtitles', 'zh-Hans', 'converted.txt') - add_dep(lines, 'simplify_chinese', input_file, step2_file) - else: - step2_file = input_file - reformatted_file = wordlist_filename( - 'opensubtitles', language, 'counts.txt' - ) - add_dep(lines, 'convert_opensubtitles', step2_file, reformatted_file) - - return lines - - -def jieba_deps(dirname_in, languages): - lines = [] - # Because there's Chinese-specific handling here, the valid options for - # 'languages' are [] and ['zh']. Make sure it's one of those. 
- if not languages: - return lines - assert languages == ['zh'] - input_file = '{prefix}/dict.txt.big'.format(prefix=dirname_in) - transformed_file = wordlist_filename( - 'jieba', 'zh-Hans', 'converted.txt' - ) - reformatted_file = wordlist_filename( - 'jieba', 'zh', 'counts.txt' - ) - add_dep(lines, 'simplify_chinese', input_file, transformed_file) - add_dep(lines, 'convert_jieba', transformed_file, reformatted_file) - return lines - - -def reddit_deps(dirname_in, languages): - lines = [] - path_in = pathlib.Path(dirname_in) - slices = {} - counts_by_language = defaultdict(list) - - # Extract text from the Reddit comment dumps, and write them to - # .txt.gz files - for filepath in path_in.glob('*/*.bz2'): - base = filepath.stem - transformed_file = wordlist_filename('reddit', base + '.all', 'txt') - slices[base] = transformed_file - add_dep(lines, 'extract_reddit', str(filepath), transformed_file) - - for base in sorted(slices): - transformed_file = slices[base] - language_outputs = [] - for language in languages: - filename = wordlist_filename('reddit', base + '.' + language, 'txt') - language_outputs.append(filename) - - count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt') - add_dep(lines, 'count', filename, count_filename) - counts_by_language[language].append(count_filename) - - # find the prefix by constructing a filename, then stripping off - # '.xx.txt' from the end - prefix = wordlist_filename('reddit', base + '.xx', 'txt')[:-7] - add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs, - params={'prefix': prefix}, - extra='wordfreq_builder/tokenizers.py') - - for language in languages: - output_file = wordlist_filename('reddit', language, 'counts.txt') - add_dep( - lines, 'merge_counts', counts_by_language[language], output_file, - params={'cutoff': 3} - ) - return lines - - -# Which columns of the SUBTLEX data files do the word and its frequency appear -# in? 
-SUBTLEX_COLUMN_MAP = { - 'de': (1, 3), - 'el': (2, 3), - 'en': (1, 2), - 'nl': (1, 2), - 'zh': (1, 5) -} - - -def subtlex_en_deps(dirname_in, languages): - lines = [] - # Either subtlex_en is turned off, or it's just in English - if not languages: - return lines - assert languages == ['en'] - regions = ['en-US', 'en-GB'] - processed_files = [] - for region in regions: - input_file = '{prefix}/subtlex.{region}.txt'.format( - prefix=dirname_in, region=region - ) - textcol, freqcol = SUBTLEX_COLUMN_MAP['en'] - processed_file = wordlist_filename('subtlex-en', region, 'processed.txt') - processed_files.append(processed_file) - add_dep( - lines, 'convert_subtlex', input_file, processed_file, - params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} - ) - - output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt') - add_dep( - lines, 'merge_counts', processed_files, output_file, - params={'cutoff': 0} - ) - - return lines - - -def subtlex_other_deps(dirname_in, languages): - lines = [] - for language in languages: - input_file = '{prefix}/subtlex.{lang}.txt'.format( - prefix=dirname_in, lang=language - ) - processed_file = wordlist_filename('subtlex-other', language, 'processed.txt') - output_file = wordlist_filename('subtlex-other', language, 'counts.txt') - textcol, freqcol = SUBTLEX_COLUMN_MAP[language] - - if language == 'zh': - step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt') - add_dep(lines, 'simplify_chinese', input_file, step2_file) - else: - step2_file = input_file - - # Skip one header line by setting 'startrow' to 2 (because tail is 1-based). - # I hope we don't need to configure this by language anymore. - add_dep( - lines, 'convert_subtlex', step2_file, processed_file, - params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2} - ) - add_dep( - lines, 'merge_counts', processed_file, output_file, - params={'cutoff': 0} - ) - return lines - - -def combine_lists(languages): - lines = [] - for language in languages: - sources = source_names(language) - input_files = [ - wordlist_filename(source, language, 'counts.txt') - for source in sources - ] - output_file = wordlist_filename('combined', language) - add_dep(lines, 'merge', input_files, output_file, - extra='wordfreq_builder/word_counts.py', - params={'cutoff': 2, 'lang': language}) - - output_cBpack = wordlist_filename( - 'combined-dist', language, 'msgpack.gz' - ) - output_cBpack_big = wordlist_filename( - 'combined-dist-large', language, 'msgpack.gz' - ) - add_dep(lines, 'freqs2cB', output_file, output_cBpack, - extra='wordfreq_builder/word_counts.py', - params={'lang': language, 'buckets': 600}) - add_dep(lines, 'freqs2cB', output_file, output_cBpack_big, - extra='wordfreq_builder/word_counts.py', - params={'lang': language, 'buckets': 800}) - - lines.append('default {}'.format(output_cBpack)) - if language in CONFIG['big-lists']: - lines.append('default {}'.format(output_cBpack_big)) - - # Write standalone lists for Twitter frequency - if language in CONFIG['sources']['twitter']: - input_file = wordlist_filename('twitter', language, 'counts.txt') - output_cBpack = wordlist_filename( - 'twitter-dist', language, 'msgpack.gz') - add_dep(lines, 'freqs2cB', input_file, output_cBpack, - extra='wordfreq_builder/word_counts.py', - params={'lang': language, 'buckets': 600}) - - lines.append('default {}'.format(output_cBpack)) - - # Write a Jieba-compatible frequency file for Chinese tokenization - chinese_combined = wordlist_filename('combined', 'zh') - jieba_output = 
wordlist_filename('jieba-dist', 'zh') - add_dep(lines, 'counts_to_jieba', chinese_combined, jieba_output, - extra=['wordfreq_builder/word_counts.py', 'wordfreq_builder/cli/counts_to_jieba.py']) - lines.append('default {}'.format(jieba_output)) - return lines - - -def main(): - make_ninja_deps('rules.ninja') - - -if __name__ == '__main__': - main() diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py deleted file mode 100644 index d8bfd12..0000000 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ /dev/null @@ -1,132 +0,0 @@ -from wordfreq import tokenize -from ftfy.fixes import unescape_html -import regex -import pycld2 -import langcodes - -CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( - [ - '\x00-\x08', - '\x0b', - '\x0e-\x1f', - '\x7f-\x9f', - '\ud800-\udfff', - '\ufdd0-\ufdef', - '\N{HANGUL FILLER}', - '\N{HANGUL CHOSEONG FILLER}', - '\N{HANGUL JUNGSEONG FILLER}', - '<>' - ] + - [chr(65534+65536*x+y) for x in range(17) for y in range(2)] -) -CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE) - -TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+') -TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+') -URL_RE = regex.compile(r'http(?:s)?://[^) ]*') -MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)') - - -# Low-frequency languages tend to be detected incorrectly by cld2. The -# following list of languages are languages that appear in our data with any -# reasonable frequency, and seem to usually be detected *correctly*. These are -# the languages we'll keep in the Reddit and Twitter results. -# -# This list is larger than the list that wordfreq ultimately generates, so we -# can look here as a source of future data. - -KEEP_THESE_LANGUAGES = { - 'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', - 'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv', - 'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl', - 'tr', 'uk', 'vi' -} - -# Semi-frequent languages that are excluded by the above: -# -# - Chinese, not because it's detected incorrectly, but because we can't -# handle it until we already have word frequencies -# - Thai (seems to be detected whenever someone uses Thai characters in -# an emoticon) -# - Welsh (which is detected for "ohmygodohmygodohmygod") -# - Turkmen (detected for ASCII art) -# - Irish Gaelic (detected for Cthulhu-related text) -# - Kannada (looks of disapproval) -# - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes) -# - Breton (the word "memes" itself) - - -def cld2_surface_tokenizer(text, mode='twitter'): - """ - Uses CLD2 to detect the language and wordfreq tokenizer to create tokens. - - The `mode` can be 'twitter' or 'reddit', which slightly changes the - pre-processing of the text. - """ - text = unescape_html(text) - if mode == 'twitter': - text = TWITTER_HANDLE_RE.sub('', text) - text = TCO_RE.sub('', text) - elif mode == 'reddit': - text = URL_RE.sub('', text) - text = MARKDOWN_URL_RESIDUE_RE.sub(']', text) - - lang = cld2_detect_language(text) - - # If the detected language isn't in our pretty generous list of languages, - # return no tokens. - if lang not in KEEP_THESE_LANGUAGES: - return 'xx', [] - - # cld2's accuracy seems to improve dramatically with at least 50 - # bytes of input, so throw away non-English below this length. 
- if len(text.encode('utf-8')) < 50 and lang != 'en': - return 'xx', [] - - tokens = tokenize(text, lang) - return lang, tokens - - -def cld2_detect_language(text): - """ - Uses CLD2 to detect the language. - """ - # Format of pycld2.detect: - # (Confident in result: bool, - # Number of bytes of text: Int, - # Triples of detected languages in order of certainty: - # (Language name: str, - # Language code: str - # Percent of text in this language: float - # Confidence score: float)) - - text = CLD2_BAD_CHARS_RE.sub('', text) - lang = pycld2.detect(text)[2][0][1] - - # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant' - # becomes 'zh' - code = langcodes.get(lang).language - return code - - -def tokenize_by_language(in_filename, out_prefix, tokenizer): - """ - Process a file by running it through a given tokenizer. - - Produces output files that are separated by language, with spaces - between the tokens. - """ - out_files = { - language: open('%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8') - for language in KEEP_THESE_LANGUAGES - } - with open(in_filename, encoding='utf-8') as in_file: - for line in in_file: - text = line.split('\t')[-1].strip() - language, tokens = tokenizer(text) - if language in KEEP_THESE_LANGUAGES: - out_file = out_files[language] - tokenized = ' '.join(tokens) - print(tokenized, file=out_file) - for out_file in out_files.values(): - out_file.close() diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py deleted file mode 100644 index 76f07cc..0000000 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ /dev/null @@ -1,289 +0,0 @@ -from wordfreq import simple_tokenize, tokenize -from collections import defaultdict -from operator import itemgetter -from ftfy import fix_text -import statistics -import math -import csv -import msgpack -import gzip -import unicodedata -import regex - - -# Match common cases of URLs: the schema http:// or https:// followed by -# non-whitespace characters. -URL_RE = regex.compile(r'https?://(?:\S)+') -HAN_RE = regex.compile(r'[\p{Script=Han}]+') - - -def count_tokens(filename): - """ - Count tokens that appear in a file, running each line through our - simple tokenizer. - - URLs will be skipped, and Unicode errors will become separate tokens - containing '�'. - """ - counts = defaultdict(int) - if filename.endswith('gz'): - infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace') - else: - infile = open(filename, encoding='utf-8', errors='replace') - for line in infile: - line = URL_RE.sub('', line.strip()) - for token in simple_tokenize(line): - counts[token] += 1 - infile.close() - return counts - - -def count_tokens_langtagged(filename, lang): - """ - Count tokens that appear in an already language-tagged file, in which each - line begins with a language code followed by a tab. - """ - counts = defaultdict(int) - if filename.endswith('gz'): - infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace') - else: - infile = open(filename, encoding='utf-8', errors='replace') - for line in infile: - if '\t' not in line: - continue - line_lang, text = line.split('\t', 1) - if line_lang == lang: - tokens = tokenize(text.strip(), lang) - for token in tokens: - counts[token] += 1 - infile.close() - return counts - - -def read_values(filename, cutoff=0, max_words=1e8, lang=None): - """ - Read words and their frequency or count values from a CSV file. Returns - a dictionary of values and the total of all values. 
- - Only words with a value greater than or equal to `cutoff` are returned. - In addition, only up to `max_words` words are read. - - If `cutoff` is greater than 0 or `max_words` is smaller than the list, - the csv file must be sorted by value in descending order, so that the - most frequent words are kept. - - If `lang` is given, it will apply language-specific tokenization to the - words that it reads. - """ - values = defaultdict(float) - total = 0. - with open(filename, encoding='utf-8', newline='') as infile: - for key, strval in csv.reader(infile): - val = float(strval) - key = fix_text(key) - if val < cutoff or len(values) >= max_words: - break - tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) - for token in tokens: - # Use += so that, if we give the reader concatenated files with - # duplicates, it does the right thing - values[token] += val - total += val - return values, total - - -def read_freqs(filename, cutoff=0, lang=None): - """ - Read words and their frequencies from a CSV file, normalizing the - frequencies to add up to 1. - - Only words with a frequency greater than or equal to `cutoff` are returned. - - If `cutoff` is greater than 0, the csv file must be sorted by frequency - in descending order. - - If lang is given, read_freqs will apply language specific preprocessing - operations. - """ - values, total = read_values(filename, cutoff, lang=lang) - for word in values: - values[word] /= total - - if lang == 'en': - values = correct_apostrophe_trimming(values) - - return values - - -def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): - """ - Convert a csv file of words and their frequencies to a file in the - idiosyncratic 'cBpack' format. - - Only words with a frequency greater than `cutoff` centibels will be - written to the new file. - - This cutoff should not be stacked with a cutoff in `read_freqs`; doing - so would skew the resulting frequencies. - """ - freqs = read_freqs(in_filename, cutoff=0, lang=None) - cBpack = [] - for token, freq in freqs.items(): - cB = round(math.log10(freq) * 100) - if cB <= cutoff: - continue - neg_cB = -cB - while neg_cB >= len(cBpack): - cBpack.append([]) - cBpack[neg_cB].append(token) - - for sublist in cBpack: - sublist.sort() - - # Write a "header" consisting of a dictionary at the start of the file - cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack - - with gzip.open(out_filename, 'wb') as outfile: - msgpack.dump(cBpack_data, outfile) - - -def merge_counts(count_dicts): - """ - Merge multiple dictionaries of counts by adding their entries. - """ - merged = defaultdict(int) - for count_dict in count_dicts: - for term, count in count_dict.items(): - merged[term] += count - return merged - - -def merge_freqs(freq_dicts): - """ - Merge multiple dictionaries of frequencies, representing each word with - the median of the word's frequency over all sources. - """ - vocab = set() - for freq_dict in freq_dicts: - vocab.update(freq_dict) - - merged = defaultdict(float) - N = len(freq_dicts) - for term in vocab: - freqs = [] - missing_values = 0 - for freq_dict in freq_dicts: - freq = freq_dict.get(term, 0.) - if freq < 1e-8: - # Usually we trust the median of the wordlists, but when at - # least 2 wordlists say a word exists and the rest say it - # doesn't, we kind of want to listen to the two that have - # information about the word. The word might be a word that's - # inconsistently accounted for, such as an emoji or a word - # containing an apostrophe. 
- # - # So, once we see at least 2 values that are very low or - # missing, we ignore further low values in the median. A word - # that appears in 2 sources gets a reasonable frequency, while - # a word that appears in 1 source still gets dropped. - - missing_values += 1 - if missing_values > 2: - continue - freqs.append(0.) - else: - freqs.append(freq) - - if freqs: - median = statistics.median(freqs) - if median > 0.: - merged[term] = median - - total = sum(merged.values()) - - # Normalize the merged values so that they add up to 0.99 (based on - # a rough estimate that 1% of tokens will be out-of-vocabulary in a - # wordlist of this size). - for term in merged: - merged[term] = merged[term] / total * 0.99 - return merged - - -def write_wordlist(freqs, filename, cutoff=1e-8): - """ - Write a dictionary of either raw counts or frequencies to a file of - comma-separated values. - - Keep the CSV format simple by explicitly skipping words containing - commas or quotation marks. We don't believe we want those in our tokens - anyway. - """ - with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: - writer = csv.writer(outfile) - items = sorted(freqs.items(), key=itemgetter(1), reverse=True) - for word, freq in items: - if freq < cutoff: - break - if not ('"' in word or ',' in word): - writer.writerow([word, str(freq)]) - - -def write_jieba(freqs, filename): - """ - Write a dictionary of frequencies in a format that can be used for Jieba - tokenization of Chinese. - """ - with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: - items = sorted(freqs.items(), key=lambda item: (-item[1], item[0])) - for word, freq in items: - if HAN_RE.search(word): - # Only store this word as a token if it contains at least one - # Han character. - fake_count = round(freq * 1e9) - print('%s %d' % (word, fake_count), file=outfile) - - -# APOSTROPHE_TRIMMED_PROB represents the probability that this word has had -# "'t" removed from it, based on counts from Twitter, which we know -# accurate token counts for based on our own tokenizer. - -APOSTROPHE_TRIMMED_PROB = { - 'don': 0.99, - 'didn': 1., - 'can': 0.35, - 'won': 0.74, - 'isn': 1., - 'wasn': 1., - 'wouldn': 1., - 'doesn': 1., - 'couldn': 1., - 'ain': 0.99, - 'aren': 1., - 'shouldn': 1., - 'haven': 0.96, - 'weren': 1., - 'hadn': 1., - 'hasn': 1., - 'mustn': 1., - 'needn': 1., -} - - -def correct_apostrophe_trimming(freqs): - """ - If what we got was an English wordlist that has been tokenized with - apostrophes as token boundaries, as indicated by the frequencies of the - words "wouldn" and "couldn", then correct the spurious tokens we get by - adding "'t" in about the proportion we expect to see in the wordlist. - - We could also adjust the frequency of "t", but then we would be favoring - the token "s" over it, as "'s" leaves behind no indication when it's been - removed. - """ - if (freqs.get('wouldn', 0) > 1e-6 and freqs.get('couldn', 0) > 1e-6): - for trim_word, trim_prob in APOSTROPHE_TRIMMED_PROB.items(): - if trim_word in freqs: - freq = freqs[trim_word] - freqs[trim_word] = freq * (1 - trim_prob) - freqs[trim_word + "'t"] = freq * trim_prob - return freqs
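As a closing illustration of the data format these scripts produce, here is a minimal sketch of the "cBpack" bucketing done by `freqs_to_cBpack` above, with the msgpack/gzip I/O and the `{'format': 'cB', 'version': 1}` header left out. The function name and the sample frequencies are made up for illustration:

    import math

    def freqs_to_cBpack_lists(freqs, cutoff=-600):
        # Mirror of the bucketing loop in freqs_to_cBpack(): each word goes
        # into the list indexed by its negated centibel frequency.
        cBpack = []
        for token, freq in freqs.items():
            cB = round(math.log10(freq) * 100)   # centibels; always <= 0 here
            if cB <= cutoff:
                continue                          # too rare for this list
            neg_cB = -cB
            while neg_cB >= len(cBpack):
                cBpack.append([])
            cBpack[neg_cB].append(token)
        for bucket in cBpack:
            bucket.sort()
        return cBpack

    buckets = freqs_to_cBpack_lists({'the': 0.05, 'cat': 0.001, 'zyzzyva': 1e-9})
    for index, words in enumerate(buckets):
        if words:
            print(index, words, 10 ** (-index / 100))
    # prints bucket 130 for 'the' (~0.05) and bucket 300 for 'cat' (0.001);
    # 'zyzzyva' at 1e-9 (-900 cB) falls below the default cutoff and is dropped.

A reader of the format recovers an approximate frequency from a bucket index as `10 ** (-index / 100)`, which is why only each word's bucket position, not its exact value, needs to be stored.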