From d94428d454c90bdb90bc5b90e530b5b120131982 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Fri, 4 Sep 2015 00:57:04 -0400
Subject: [PATCH] support Turkish and more Greek; document more

---
 .gitignore                                  |  2 ++
 README.md                                   | 31 ++++++++++++++++++++-
 wordfreq/tokens.py                          | 14 ++++++++++
 wordfreq_builder/README.md                  | 24 ++++++++++++++++
 wordfreq_builder/wordfreq_builder/config.py |  3 +-
 5 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 975f163..a68e8ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pip-log.txt
 .coverage
 *~
 wordfreq-data.tar.gz
+.idea
+build.dot
diff --git a/README.md b/README.md
index d95bae3..9a584f9 100644
--- a/README.md
+++ b/README.md
@@ -223,7 +223,11 @@ sources:
 
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 and SUBTLEX-CH, created by Marc Brysbaert et al. and available at
-http://crr.ugent.be/programs-data/subtitle-frequencies. I (Rob Speer) have
+http://crr.ugent.be/programs-data/subtitle-frequencies. SUBTLEX was first
+published in the paper by Brysbaert & New (2009), which appears in full in
+the citations below.
+
+I (Rob Speer) have
 obtained permission by e-mail from Marc Brysbaert to distribute these
 wordlists in wordfreq, to be used for any purpose, not just for academic
 use, under these conditions:
@@ -237,3 +241,28 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software gives statistics about words that are commonly used on
 Twitter; it does not display or republish any Twitter content.
+
+## Citations to work that wordfreq is built on
+
+- Brysbaert, M. & New, B. (2009). Moving beyond Kucera and Francis: A Critical
+  Evaluation of Current Word Frequency Norms and the Introduction of a New and
+  Improved Word Frequency Measure for American English. Behavior Research
+  Methods, 41(4), 977-990.
+  http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf
+
+- Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character
+  frequencies based on film subtitles. PLoS One, 5(6), e10729.
+  http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729
+
+- Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
+  http://unicode.org/reports/tr29/
+
+- Kudo, T. (2005). MeCab: Yet another part-of-speech and morphological
+  analyzer.
+  http://mecab.sourceforge.net/
+
+- van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
+  SUBTLEX-UK: A new and improved word frequency database for British English.
+  The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
+  http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index eb2c631..e33ca1d 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -65,6 +65,15 @@ def simple_tokenize(text):
     return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
+def turkish_tokenize(text):
+    """
+    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
+    in Turkish.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+
+
 def remove_arabic_marks(text):
     """
     Remove decorations from Arabic words:
@@ -90,6 +99,8 @@ def tokenize(text, lang):
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
+    - Turkish will use a different case-folding procedure, so that capital
+      I and İ map to ı and i respectively.
     - All other languages will be tokenized using a regex that mostly
       implements the Word Segmentation section of Unicode Annex #29.
       See `simple_tokenize` for details.
@@ -107,6 +118,9 @@ def tokenize(text, lang):
         from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
 
+    if lang == 'tr':
+        return turkish_tokenize(text)
+
     if lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
 
diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 2aedf27..021bc0f 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -161,3 +161,27 @@ longer represents the words 'don' and 'won', as we assume most of their
 frequency comes from "don't" and "won't". Words that turned into similarly
 common words, however, were left alone: this list doesn't represent "can't"
 because the word was left as "can".
+
+### SUBTLEX
+
+Marc Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in
+wordfreq and derived works without the "academic use" restriction, under the
+following reasonable conditions:
+
+- Wordfreq and code derived from it must credit the SUBTLEX authors.
+  (See the citations in the top-level `README.md` file.)
+- It must remain clear that SUBTLEX is freely available data.
+
+`data/source-lists/subtlex` contains the following files:
+
+- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+  extracted, and converted from ISO-8859-1 to UTF-8
+- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
+  from [this Excel file][subtlex-uk]
+- `subtlex.zh.txt`, which was downloaded and extracted from
+  [here][subtlex-ch]
+
+[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
+[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
+
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 8ccb317..142c7ab 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -14,8 +14,7 @@ CONFIG = {
     ],
     'wikipedia': [
         'ar', 'de', 'en', 'el', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
-        'pt', 'ru'
-        # consider adding 'tr'
+        'pt', 'ru', 'tr'
     ],
     'opensubtitles': [
         # All languages where the most common word in OpenSubtitles
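To make the Turkish change concrete, here is a minimal sketch of the behavior
the patch is aiming for, assuming the patched `wordfreq` is importable. The
outputs in the comments are expectations, not captured test output.

```python
from wordfreq.tokens import tokenize

# Turkish has a dotted and a dotless "i", each with its own capital letter.
# Plain str.casefold() folds 'İ' to 'i' + U+0307 (a combining dot above) and
# 'I' to 'i', both wrong for Turkish. turkish_tokenize() remaps the capitals
# before case-folding, preserving the Turkish pairs İ -> i and I -> ı.

print(tokenize('İstanbul', 'tr'))   # expected: ['istanbul']
print(tokenize('ILIK', 'tr'))       # expected: ['ılık'] ("lukewarm"),
                                    # not 'ilik' ("marrow"), a different word

# For comparison, a language without the special case keeps the default
# case-folding, which produces the combining-dot form:
print(tokenize('İstanbul', 'en'))   # expected: ['i̇stanbul'] (i + U+0307)
```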
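The SUBTLEX preparation steps documented in `wordfreq_builder/README.md` are
manual; the en-US re-encoding step could look like the sketch below, where
`SUBTLEXus.txt` is a hypothetical name for the file extracted from the zip:

```python
# Re-encode the SUBTLEX-US word list from ISO-8859-1 (Latin-1) to UTF-8,
# producing the file checked in as data/source-lists/subtlex/subtlex.en-US.txt.
with open('SUBTLEXus.txt', encoding='iso-8859-1') as src, \
     open('subtlex.en-US.txt', 'w', encoding='utf-8') as dst:
    dst.write(src.read())
```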