diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 958398b..f0f0bf0 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -97,6 +97,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
 
 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
+DIGIT_RE = regex.compile('\d')
+MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+
 
 def simple_tokenize(text, include_punctuation=False):
     """
@@ -129,9 +132,15 @@ def simple_tokenize(text, include_punctuation=False):
     """
     text = unicodedata.normalize('NFC', text)
     if include_punctuation:
-        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
+        return [
+            smash_numbers(token.casefold())
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
     else:
-        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+        return [
+            smash_numbers(token.strip("'").casefold())
+            for token in TOKEN_RE.findall(text)
+        ]
 
 
 def turkish_tokenize(text, include_punctuation=False):
@@ -142,7 +151,7 @@ def turkish_tokenize(text, include_punctuation=False):
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        commas_to_cedillas(token.strip("'").casefold())
+        smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
         for token in token_expr.findall(text)
     ]
 
@@ -154,7 +163,7 @@ def romanian_tokenize(text, include_punctuation=False):
     """
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
     return [
-        cedillas_to_commas(token.strip("'").casefold())
+        smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
        for token in token_expr.findall(text)
     ]
 
@@ -170,7 +179,8 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
     from wordfreq.mecab import mecab_tokenize
     tokens = mecab_tokenize(text, lang)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]
 
 
 def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
@@ -182,7 +192,8 @@ def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
     from wordfreq.chinese import jieba_tokenize
     tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
     token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
+    return [smash_numbers(token.casefold()) for token in tokens
+            if token_expr.match(token)]
 
 
 def remove_marks(text):
@@ -230,6 +241,21 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )
 
+def sub_zeroes(match):
+    """
+    Given a regex match, return what it matched with digits replaced by
+    zeroes.
+    """
+    return DIGIT_RE.sub('0', match.group(0))
+
+
+def smash_numbers(text):
+    """
+    Replace sequences of multiple digits with zeroes, so we don't need to
+    distinguish the frequencies of thousands of numbers.
+    """
+    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+
 
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
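
Note (not part of the patch): a minimal standalone sketch of the number-smashing helpers added above, reproduced outside the diff so their effect can be seen in isolation. The pattern literals are written as raw strings here, and the example inputs are illustrative assumptions, not test cases from the repository; `regex` is the third-party module the file already imports, and `re` would behave the same for these calls.

    import regex

    # Same patterns the patch adds: a single digit, and a digit followed by
    # at least one more digit or digit-group punctuation ("1,500", "3.14", ...).
    DIGIT_RE = regex.compile(r'\d')
    MULTI_DIGIT_RE = regex.compile(r'\d[\d.,]+')

    def sub_zeroes(match):
        # Replace every digit inside the matched span with '0'.
        return DIGIT_RE.sub('0', match.group(0))

    def smash_numbers(text):
        # Only multi-digit sequences are smashed; single digits pass through.
        return MULTI_DIGIT_RE.sub(sub_zeroes, text)

    print(smash_numbers('2016'))   # -> '0000'
    print(smash_numbers('1,500'))  # -> '0,000'
    print(smash_numbers('3.14'))   # -> '0.00'
    print(smash_numbers('7'))      # -> '7'  (single digits keep their value)

With this change, every multi-digit number in a tokenized text collapses to a zero-filled token of the same shape (so all four-digit years share the single entry '0000'), while single digits are still counted individually.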