diff --git a/setup.py b/setup.py
index ffc8098..87a0936 100755
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,8 @@ classifiers = [
 ]
 
 current_dir = os.path.dirname(__file__)
-README_contents = open(os.path.join(current_dir, 'README.md')).read()
+README_contents = open(os.path.join(current_dir, 'README.md'),
+                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
diff --git a/tests/test.py b/tests/test.py
index 0013dcb..07f8bef 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -100,7 +100,7 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-    
+
     eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
         ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
@@ -180,3 +180,10 @@ def test_ideographic_fallback():
         tokenize(ja_text, 'en'),
         ['ひらがな', 'カタカナ', 'romaji']
     )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f4d1339..cc275f0 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,23 +3,24 @@ import unicodedata
 
 
 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index d3be5ce..2e740cf 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -13,10 +13,14 @@ def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
-    parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('-m', '--max-words', type=int, default=1000000,
+                        help='maximum number of words to read from each list')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index ddc308c..e16660b 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, args.cutoff, args.language)
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index ae17546..b47e94a 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 
 
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+ """ text = URL_RE.sub('', text) text = MARKDOWN_URL_RESIDUE_RE.sub(']', text) lang = cld2_detect_language(text) if lang not in KEEP_THESE_LANGUAGES: + # Reddit is 99.9% English, so if we detected a rare language, it's + # much more likely that it's actually English. lang = 'en' tokens = tokenize(text, lang, include_punctuation=True) @@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer): """ Process a file by running it through a given tokenizer. - Produces output files that are separated by language, with newlines + Produces output files that are separated by language, with spaces between the tokens. """ out_files = {}