diff --git a/tests/test.py b/tests/test.py
index 0013dcb..07f8bef 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -100,7 +100,7 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-    
+
     eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
         ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
@@ -180,3 +180,10 @@ def test_ideographic_fallback():
         tokenize(ja_text, 'en'),
         ['ひらがな', 'カタカナ', 'romaji']
     )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f4d1339..cc275f0 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,23 +3,24 @@ import unicodedata
 
 
 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.
 
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------
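
Note (not part of the patch): a minimal, runnable sketch of what Case 1 buys us. Case 2 below is only a rough
approximation of wordfreq's standard Unicode segmentation rule, which lies past the end of this excerpt, and
SKETCH_TOKEN_RE is an illustrative name rather than anything defined in the codebase.

# Sketch only: Case 1 is copied from the patch; Case 2 approximates the
# standard-segmentation rule that the excerpt cuts off.
import regex  # the third-party 'regex' module, which understands \p{...} properties

SKETCH_TOKEN_RE = regex.compile(r"""
    # Case 1: glue runs of Han ideographs, hiragana, or Thai into one token.
    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |

    # Case 2 (approximation): a word-like character, then keep going as long
    # as the next character is not a space and not a Unicode word break.
    [\w\p{So}](?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)

print(SKETCH_TOKEN_RE.findall('"การเล่นดนตรี" means "playing music"'))
# expected: ['การเล่นดนตรี', 'means', 'playing', 'music']

The real tokenize() does more than this (lowercasing, per-language handling, as the tests above show); the sketch
only exercises the regex fallback that the patch extends to Thai.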