From d376f4e2e260f177f2fbef6120b303e190d7b1b3 Mon Sep 17 00:00:00 2001
From: Rob Speer <rob@luminoso.com>
Date: Tue, 13 Dec 2016 14:43:29 -0500
Subject: [PATCH] fixes to tokenization

---
 wordfreq/tokens.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f0f0bf0..ac0665b 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -149,11 +149,16 @@ def turkish_tokenize(text, include_punctuation=False):
     in Turkish, and modifies 'comma-below' characters to use cedillas.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [
-        smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
-        for token in token_expr.findall(text)
-    ]
+    if include_punctuation:
+        return [
+            smash_numbers(commas_to_cedillas(token.casefold()))
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
+    else:
+        return [
+            smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
+            for token in TOKEN_RE.findall(text)
+        ]
 
 
 def romanian_tokenize(text, include_punctuation=False):
@@ -161,11 +166,16 @@ def romanian_tokenize(text, include_punctuation=False):
     Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
     to use commas-below instead.
     """
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [
-        smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
-        for token in token_expr.findall(text)
-    ]
+    if include_punctuation:
+        return [
+            smash_numbers(cedillas_to_commas(token.casefold()))
+            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
+        ]
+    else:
+        return [
+            smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
+            for token in TOKEN_RE.findall(text)
+        ]
 
 
 def tokenize_mecab_language(text, lang, include_punctuation=False):
@@ -353,6 +363,9 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
+    # Keep only the primary language subtag, so that codes carrying a region
+    # or script (such as 'en-US' or 'zh-Hant') are handled as 'en' or 'zh'.
+    lang = lang.split('-')[0]
     if lang == 'ja' or lang == 'ko':
         return tokenize_mecab_language(text, lang, include_punctuation)
     elif lang == 'zh':