From a6ef3224a6b8677657de6353a7e2d54e56166a7a Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Fri, 4 Sep 2015 00:57:04 -0400
Subject: [PATCH] support Turkish and more Greek; document more

Former-commit-id: d94428d454c90bdb90bc5b90e530b5b120131982
---
 .gitignore                                  |  2 ++
 README.md                                   | 31 ++++++++++++++++++++-
 wordfreq/tokens.py                          | 14 ++++++++++
 wordfreq_builder/README.md                  | 24 ++++++++++++++++
 wordfreq_builder/wordfreq_builder/config.py |  3 +-
 5 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 975f163..a68e8ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ pip-log.txt
 .coverage
 *~
 wordfreq-data.tar.gz
+.idea
+build.dot
diff --git a/README.md b/README.md
index d95bae3..9a584f9 100644
--- a/README.md
+++ b/README.md
@@ -223,7 +223,11 @@ sources:
 
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 and SUBTLEX-CH, created by Marc Brysbaert et al. and available at
-http://crr.ugent.be/programs-data/subtitle-frequencies. I (Rob Speer) have
+http://crr.ugent.be/programs-data/subtitle-frequencies. SUBTLEX was first
+published in Brysbaert & New (2009); the full citation appears in the
+citations section at the end of this file.
+
+I (Rob Speer) have
 obtained permission by e-mail from Marc Brysbaert to distribute these
 wordlists in wordfreq, to be used for any purpose, not just for academic
 use, under these conditions:
@@ -237,3 +241,28 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software gives statistics about words that are commonly used on
 Twitter; it does not display or republish any Twitter content.
+
+## Citations to work that wordfreq is built on
+
+- Brysbaert, M. & New, B. (2009). Moving beyond Kučera and Francis: A critical
+  evaluation of current word frequency norms and the introduction of a new and
+  improved word frequency measure for American English. Behavior Research
+  Methods, 41(4), 977-990.
+  http://sites.google.com/site/borisnew/pub/BrysbaertNew2009.pdf
+
+- Cai, Q., & Brysbaert, M. (2010). SUBTLEX-CH: Chinese word and character
+  frequencies based on film subtitles. PLoS One, 5(6), e10729.
+  http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729
+
+- Davis, M. (2012). Unicode text segmentation. Unicode Standard Annex, 29.
+  http://unicode.org/reports/tr29/
+
+- Kudo, T. (2005). MeCab: Yet another part-of-speech and morphological
+  analyzer.
+  http://mecab.sourceforge.net/
+
+- van Heuven, W. J., Mandera, P., Keuleers, E., & Brysbaert, M. (2014).
+  SUBTLEX-UK: A new and improved word frequency database for British English.
+  The Quarterly Journal of Experimental Psychology, 67(6), 1176-1190.
+  http://www.tandfonline.com/doi/pdf/10.1080/17470218.2013.850521
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index eb2c631..e33ca1d 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -65,6 +65,15 @@ def simple_tokenize(text):
     return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
+def turkish_tokenize(text):
+    """
+    Like `simple_tokenize`, but handles the dotted and dotless "i" so that
+    they case-fold correctly in Turkish.
+ """ + text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı') + return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)] + + def remove_arabic_marks(text): """ Remove decorations from Arabic words: @@ -90,6 +99,8 @@ def tokenize(text, lang): - Chinese or Japanese texts that aren't identified as the appropriate language will only split on punctuation and script boundaries, giving you untokenized globs of characters that probably represent many words. + - Turkish will use a different case-folding procedure, so that capital + I and İ map to ı and i respectively. - All other languages will be tokenized using a regex that mostly implements the Word Segmentation section of Unicode Annex #29. See `simple_tokenize` for details. @@ -107,6 +118,9 @@ def tokenize(text, lang): from wordfreq.mecab import mecab_tokenize return mecab_tokenize(text) + if lang == 'tr': + return turkish_tokenize(text) + if lang == 'ar': text = remove_arabic_marks(unicodedata.normalize('NFKC', text)) diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md index 2aedf27..021bc0f 100644 --- a/wordfreq_builder/README.md +++ b/wordfreq_builder/README.md @@ -161,3 +161,27 @@ longer represents the words 'don' and 'won', as we assume most of their frequency comes from "don't" and "won't". Words that turned into similarly common words, however, were left alone: this list doesn't represent "can't" because the word was left as "can". + +### SUBTLEX + +Mark Brysbaert gave us permission by e-mail to use the SUBTLEX word lists in +wordfreq and derived works without the "academic use" restriction, under the +following reasonable conditions: + +- Wordfreq and code derived from it must credit the SUBTLEX authors. + (See the citations in the top-level `README.md` file.) +- It must remain clear that SUBTLEX is freely available data. + +`data/source-lists/subtlex` contains the following files: + +- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us], + extracted, and converted from ISO-8859-1 to UTF-8 +- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8 + from [this Excel file][subtlex-uk] +- `subtlex.zh.txt`, which was downloaded and extracted from + [here][subtlex-ch] + +[subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip +[subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx +[subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip + diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 8ccb317..142c7ab 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -14,8 +14,7 @@ CONFIG = { ], 'wikipedia': [ 'ar', 'de', 'en', 'el', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', - 'pt', 'ru' - # consider adding 'tr' + 'pt', 'ru', 'tr' ], 'opensubtitles': [ # All languages where the most common word in OpenSubtitles