diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py index a895fbc..800fcee 100644 --- a/wordfreq/__init__.py +++ b/wordfreq/__init__.py @@ -247,13 +247,14 @@ def word_frequency(word, lang, wordlist='combined', default=0.): """ Get the frequency of `word` in the language with code `lang`, from the specified `wordlist`. The default wordlist is 'combined', built from - whichever of these four sources have sufficient data for the language: + whichever of these five sources have sufficient data for the language: - Full text of Wikipedia - A sample of 72 million tweets collected from Twitter in 2014, divided roughly into languages using automatic language detection - Frequencies extracted from OpenSubtitles - The Leeds Internet Corpus + - Google Books Ngrams and Google Books Syntactic Ngrams Another available wordlist is 'twitter', which uses only the data from Twitter.