Merge remote-tracking branch 'origin/master' into big-list

Conflicts: wordfreq_builder/wordfreq_builder/cli/merge_counts.py
2024-12-23 17:31:41 +00:00 · 2016-03-24 14:11:44 -04:00 · 2016-03-24 14:11:44 -04:00 · 164a5b1a05
commit 164a5b1a05
parent 178a8b1494 7b539f9057
6 changed files with 44 additions and 21 deletions
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,8 @@ classifiers = [
 ]

 current_dir = os.path.dirname(__file__)
-README_contents = open(os.path.join(current_dir, 'README.md')).read()
+README_contents = open(os.path.join(current_dir, 'README.md'),
+                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
--- a/tests/test.py
+++ b/tests/test.py
@@ -100,7 +100,7 @@ def test_tokenization():
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-    
+
    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

@@ -180,3 +180,10 @@ def test_ideographic_fallback():
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,23 +3,24 @@ import unicodedata


 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
    # -----------------------------------------------

-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
    #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
    # tokenizer is used.
    #
    # This rule is listed first so that it takes precedence.

-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |

    # Case 2: standard Unicode segmentation
    # -------------------------------------
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -13,10 +13,14 @@ def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
-    parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('-m', '--max-words', type=int, default=1000000,
+                        help='maximum number of words to read from each list')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)

--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output, args.cutoff, args.language)

--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {


 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
    text = URL_RE.sub('', text)
    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)
    if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
        lang = 'en'

    tokens = tokenize(text, lang, include_punctuation=True)
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
    """
    Process a file by running it through a given tokenizer.

-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
    between the tokens.
    """
    out_files = {}