From 37f9e12b93d598c2ee77e0055a1001526f195456 Mon Sep 17 00:00:00 2001
From: Sara Jewett
Date: Wed, 23 Dec 2015 15:49:13 -0500
Subject: [PATCH 1/6] Specify encoding when dealing with files

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4621d12..2926f3f 100755
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,8 @@ classifiers = [
 ]
 
 current_dir = os.path.dirname(__file__)
-README_contents = open(os.path.join(current_dir, 'README.md')).read()
+README_contents = open(os.path.join(current_dir, 'README.md'),
+                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):

From 511fcb6f91fb9a1c3c3a22ca07dd7bd96e21cb82 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 13 Jan 2016 12:05:07 -0500
Subject: [PATCH 2/6] reformat some argparse argument definitions

---
 .../wordfreq_builder/cli/merge_counts.py              |  9 ++++++---
 wordfreq_builder/wordfreq_builder/cli/merge_freqs.py  | 12 ++++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index c44f0cf..4efe1d9 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -13,9 +13,12 @@ def merge_lists(input_names, output_name, cutoff=0):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0
+                        help='minimum count to read from an input file')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
 
     merge_lists(args.inputs, args.output, cutoff=args.cutoff)

diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index ddc308c..e16660b 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
 
     merge_lists(args.inputs, args.output, args.cutoff, args.language)
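Note: the reformatted `-c` option in merge_counts.py above is missing the
trailing comma after `default=0`; patch 6/6 restores it. Also, the hunks in
patch 2/6 show only the `__main__` blocks; `merge_lists` itself is outside the
diff context. As a minimal sketch of what a function with the signature
`merge_lists(input_names, output_name, cutoff=0)` might do — assuming a
two-column "word,count" file format, which is an illustration and not the
repository's actual implementation:

    from collections import Counter

    def merge_lists(input_names, output_name, cutoff=0):
        # Sum the counts for each word across all the input files.
        totals = Counter()
        for name in input_names:
            with open(name, encoding='utf-8') as infile:
                for line in infile:
                    word, count = line.rstrip('\n').rsplit(',', 1)
                    if int(count) >= cutoff:
                        totals[word] += int(count)
        # Write the merged counts, most frequent words first.
        with open(output_name, 'w', encoding='utf-8') as outfile:
            for word, count in totals.most_common():
                print('{},{}'.format(word, count), file=outfile)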
From 8ddc19a5ca598fc278a2f0a80c825ccdde5194fa Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 13 Jan 2016 15:18:12 -0500
Subject: [PATCH 3/6] fix documentation in wordfreq_builder.tokenizers

---
 wordfreq_builder/wordfreq_builder/tokenizers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index ae17546..b47e94a 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 
 
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'
 
     tokens = tokenize(text, lang, include_punctuation=True)
 
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
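Note: the docstring and comment added in patch 3/6 describe the tokenizer's
shape without showing its collaborators. The control flow is: strip URLs and
Markdown residue, detect the language, and fall back to English when the
detection is implausible for Reddit. A schematic sketch of that pattern, where
`detect_language` and `simple_tokenize` are placeholders standing in for the
CLD2 wrapper and wordfreq's tokenizer, and the language set is abbreviated:

    import re

    URL_RE = re.compile(r'https?://\S+')
    KEEP_THESE_LANGUAGES = {'de', 'en', 'es', 'fr', 'it'}  # abbreviated

    def reddit_tokenize(text, detect_language, simple_tokenize):
        # Strip URLs before detecting the language, as the real code does.
        text = URL_RE.sub('', text)
        lang = detect_language(text)
        if lang not in KEEP_THESE_LANGUAGES:
            # Reddit is overwhelmingly English, so a rare detected
            # language is more likely misdetected English.
            lang = 'en'
        return lang, simple_tokenize(text, lang)

    # Example with stand-in callables:
    # reddit_tokenize('vive la révolution',
    #                 lambda t: 'fr', lambda t, l: t.split())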
From 07f16e6f03cc42436a467eaab935996f22d37d46 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Mon, 22 Feb 2016 14:26:50 -0500
Subject: [PATCH 4/6] Leave Thai segments alone in the default regex

Our regex already has a special case to leave Chinese and Japanese alone
when an appropriate tokenizer for the language isn't being used, as
Unicode's default segmentation would make every character into its own
token. The same thing happens in Thai, and we don't even *have* an
appropriate tokenizer for Thai, so I've added a similar fallback.
---
 tests/test.py      |  8 +++++++-
 wordfreq/tokens.py | 21 +++++++++++----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 0013dcb..177ebf4 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -100,7 +100,7 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-
+
     eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
         ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
@@ -116,6 +116,12 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])
 
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f4d1339..cc275f0 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,23 +3,24 @@ import unicodedata
 
 
 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------

From 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Thu, 10 Mar 2016 11:56:04 -0500
Subject: [PATCH 5/6] move Thai test to where it makes more sense

---
 tests/test.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 177ebf4..07f8bef 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -116,12 +116,6 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])
 
-    # Test that we leave Thai letters stuck together. If we had better Thai support,
-    # we would actually split this into a three-word phrase.
-    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
-    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
-        ['การเล่นดนตรี', 'means', 'playing', 'music'])
-
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
@@ -186,3 +180,10 @@ def test_ideographic_fallback():
         tokenize(ja_text, 'en'),
         ['ひらがな', 'カタカナ', 'romaji']
     )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
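Note: the effect of the widened character class in patch 4/6 can be
demonstrated with Case 1 of TOKEN_RE alone, using `\w+` as a simplified
stand-in for Case 2 — an illustration of the fallback, not the full pattern:

    import regex

    FALLBACK_RE = regex.compile(
        r'[\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+|\w+')

    # Thai letters stay glued together; spaced English splits normally.
    print(FALLBACK_RE.findall('การเล่นดนตรี means playing music'))
    # ['การเล่นดนตรี', 'means', 'playing', 'music']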
From 38016cf62bb2c86472bbb7fce849dd9a1571f2ec Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Thu, 24 Mar 2016 13:57:18 -0400
Subject: [PATCH 6/6] Restore a missing comma.

---
 wordfreq_builder/wordfreq_builder/cli/merge_counts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 4efe1d9..170b9c2 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -15,7 +15,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', default='combined-counts.csv',
                         help='filename to write the output to')
-    parser.add_argument('-c', '--cutoff', type=int, default=0
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
                         help='minimum count to read from an input file')
     parser.add_argument('inputs', nargs='+',
                         help='names of input files to merge')
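Note: the restored comma matters because Python rejects adjacent keyword
arguments at compile time, so the version from patch 2/6 would have raised a
SyntaxError before merge_counts.py could run at all. A quick way to see this:

    # A stand-in call with the same missing-comma shape as the bug.
    snippet = "f(default=0 help='minimum count to read from an input file')"
    try:
        compile(snippet, '<patch-2-bug>', 'exec')
    except SyntaxError as err:
        print('SyntaxError:', err.msg)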