Mirror of https://github.com/rspeer/wordfreq.git
Synced 2024-12-23 09:21:37 +00:00

Commit: Separate preprocessing from tokenization
Parent: 72646f16a1
Commit: 5ab5d2ea55
@@ -1,6 +1,6 @@
from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
    top_n_list, random_words, random_ascii_words, tokenize
    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises

@@ -164,13 +164,13 @@ def test_casefolding():

def test_number_smashing():
    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
        ['715', 'crσσks', 'by', 'bon', 'iver'])
    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
        ['000', 'crσσks', 'by', 'bon', 'iver'])
    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
    eq_(lossy_tokenize('1', 'en'), ['1'])
    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
    eq_(lossy_tokenize('24601', 'en'), ['00000'])
    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
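These test changes encode the API migration: the `combine_numbers=True` flag on `tokenize` is gone, and callers that want digit smashing switch to `lossy_tokenize`. A minimal sketch of the change at a hypothetical call site (expected values copied from the assertions above):

    # Before this commit:
    tokenize('3.14', 'en', combine_numbers=True)   # ['0.00']

    # After this commit:
    lossy_tokenize('3.14', 'en')                   # ['0.00']
    tokenize('3.14', 'en')                         # ['3.14'] -- plain tokenize no longer smashes digits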
@@ -231,6 +231,7 @@ def test_ideographic_fallback():
        ['ひらがな', 'カタカナ', 'romaji']
    )


def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.

@@ -83,5 +83,3 @@ def test_alternate_codes():
    # Separate codes for Mandarin and Cantonese
    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
    eq_(tokenize('谢谢谢谢', 'yue'), tokens)

@@ -1,4 +1,3 @@
from wordfreq.tokens import tokenize, simple_tokenize
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes

@@ -10,6 +9,9 @@ import random
import logging
import math

from .tokens import tokenize, simple_tokenize, lossy_tokenize
from .language_info import get_language_info

logger = logging.getLogger(__name__)


@@ -30,8 +32,9 @@ INFERRED_SPACE_LANGUAGES = {'zh'}
# frequency.)
INFERRED_SPACE_FACTOR = 10.0

# simple_tokenize is imported so that other things can import it from here.
# Suppress the pyflakes warning.
# tokenize and simple_tokenize are imported so that other things can import
# them from here. Suppress the pyflakes warning.
tokenize = tokenize
simple_tokenize = simple_tokenize


@@ -215,8 +218,9 @@ def iter_wordlist(lang, wordlist='combined'):
# it takes to look up frequencies from scratch, so something faster is needed.
_wf_cache = {}


def _word_frequency(word, lang, wordlist, minimum):
    tokens = tokenize(word, lang, combine_numbers=True)
    tokens = lossy_tokenize(word, lang)
    if not tokens:
        return minimum

@@ -234,7 +238,10 @@ def _word_frequency(word, lang, wordlist, minimum):

    freq = 1.0 / one_over_result

    if lang in INFERRED_SPACE_LANGUAGES:
    if get_language_info(lang)['tokenizer'] == 'jieba':
        # If we used the Jieba tokenizer, we could tokenize anything to match
        # our wordlist, even nonsense. To counteract this, we multiply by a
        # probability for each word break that was inferred.
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)

    return max(freq, minimum)
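The Jieba branch above penalizes inferred word breaks. A rough numeric illustration (the starting frequency is made up; only the factor 10.0, INFERRED_SPACE_FACTOR, comes from this file):

    # If a Chinese lookup tokenizes into 3 pieces, two word breaks were inferred,
    # so the raw frequency is divided by 10.0 ** 2.
    freq = 1e-5
    tokens = ['谢', '谢谢', '谢']           # 3 tokens -> 2 inferred breaks
    freq /= 10.0 ** (len(tokens) - 1)       # 1e-5 / 100 == 1e-7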
wordfreq/language_info.py (new file, 148 lines)
@@ -0,0 +1,148 @@
from langcodes import Language, best_match


# Text in scripts written without spaces has to be handled specially in our
# tokenization regex (see TOKEN_RE in tokens.py). Also, when one of these is
# the script of the language we're analyzing, then we need to either have
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
    # Han ideographs are spaceless, but they don't need to appear in this list
    # because they have their own cases in get_language_info and TOKEN_RE.
    'Hiragana',
    # We omit katakana because Unicode regular expressions can already
    # tokenize sequences of katakana, and omitting it here means we can also
    # recognize a switch between hiragana and katakana as a token boundary.
    'Thai',  # Thai script
    'Khmr',  # Khmer script
    'Laoo',  # Lao script
    'Mymr',  # Burmese script
    'Tale',  # Tai Le script
    'Talu',  # Tai Lü script
    'Lana',  # Lanna script
]


def _language_in_list(language, targets, min_score=80):
    """
    A helper function to determine whether this language matches one of the
    target languages, with a match score above a certain threshold.

    The languages can be given as strings (language tags) or as Language
    objects. `targets` can be any iterable of such languages.
    """
    matched = best_match(language, targets)
    return matched[1] > 0


def get_language_info(language):
    """
    Looks up the things we need to know about how to handle text in a given
    language. This will return a dictionary with the following fields:

    'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...

        Indicates the script that tokens in this language should be in,
        _after_ our preprocessing. The script for 'zh' is 'Hans', for example,
        because even if the input is in Traditional Chinese ('Hant'), we
        convert it to Simplified.

    'tokenizer': 'regex', 'jieba', 'mecab', or None

        Indicates the best way we know to separate tokens in the language.

        'regex' is what will be used for most languages, meaning that we can
        segment the text with a Unicode-aware regular expression. If a language
        generally uses spaces to separate words, the regex will work well.

        'jieba' and 'mecab' are tokenizers for specific languages written
        without spaces.

        A tokenizer of None means we don't have a good way to segment the
        language. We'll use the regex anyway, but the results will be pretty
        bad.

    'normal_form': 'NFC' or 'NFKC'

        How "should" Unicode be normalized when comparing text in this
        language? This is not a standard, it's just based on experience.
        Many languages need NFKC normalization for text comparisons to work
        properly, but in many European languages, NFKC normalization is
        excessive and loses information.

    'remove_marks': True or False

        Determines whether marks and decorations, such as vowel points and
        tatweels, should be removed. True for languages in abjad scripts.

    'dotless_i': True or False

        Is "ı" the lowercase of "I" in this language, as in Turkish?

    'diacritics_under': 'cedillas', 'commas', or None

        Should we convert any diacritics that are under the letters "s" and
        "t" in this language? 'cedillas' means we should convert commas to
        cedillas, and 'commas' means we should convert cedillas to commas.

    'transliteration': 'sr-Latn', 'az-Latn', or None

        Indicates a type of transliteration that we should use for normalizing
        a multi-script language. 'sr-Latn' means to use Serbian romanization,
        and 'az-Latn' means to use Azerbaijani romanization.

    'lookup_transliteration': 'zh-Hans' or None

        Indicates a lossy transliteration that should not be used for output,
        but should be applied when looking up words in a list. 'zh-Hans' means
        that we should convert Traditional Chinese characters to Simplified.
    """
    # The input is probably a string, so parse it into a Language. If it's
    # already a Language, it will pass through.
    language = Language.get(language)

    # Assume additional things about the language, such as what script it's in,
    # using the "likely subtags" table
    language_full = language.maximize()

    # Start the `info` dictionary with default values, including the 'script'
    # value that we now know from `language_full`.
    info = {
        'script': language_full.script,
        'tokenizer': 'regex',
        'normal_form': 'NFKC',
        'remove_marks': False,
        'dotless_i': False,
        'diacritics_under': None,
        'transliteration': None,
        'lookup_transliteration': None
    }

    if _language_in_list(language, ['ja', 'ko']):
        info['tokenizer'] = 'mecab'
    elif _language_in_list(language, ['zh', 'yue']):
        info['tokenizer'] = 'jieba'
    elif info['script'] in SPACELESS_SCRIPTS:
        info['tokenizer'] = None

    # Cased alphabetic scripts get NFC normal form
    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
        info['normal_form'] = 'NFC'

    if info['script'] in ['Arab', 'Hebr']:
        info['remove_marks'] = True

    if _language_in_list(language, ['tr', 'az', 'kk']):
        info['dotless_i'] = True
        info['diacritics_under'] = 'cedillas'
    elif _language_in_list(language, ['ro']):
        info['diacritics_under'] = 'commas'

    if _language_in_list(language, ['sr']):
        info['transliteration'] = 'sr-Latn'
    elif _language_in_list(language, ['az']):
        info['transliteration'] = 'az-Latn'

    if language.language == 'zh' and language.script != 'Hant':
        info['lookup_transliteration'] = 'zh-Hans'

    return info
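A quick sketch of calling the new function for Turkish; the expected values follow directly from the rules above (Latin script gives NFC, and 'tr' triggers dotless-i handling and cedilla preference):

    from wordfreq.language_info import get_language_info

    info = get_language_info('tr')
    # Per the rules above, this should yield:
    #   info['tokenizer'] == 'regex'
    #   info['normal_form'] == 'NFC'
    #   info['dotless_i'] is True
    #   info['diacritics_under'] == 'cedillas'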
wordfreq/preprocess.py (new file, 265 lines)
@@ -0,0 +1,265 @@
import regex
import unicodedata

from .language_info import get_language_info
from .transliterate import transliterate

MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

DIGIT_RE = regex.compile('\d')
MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')


def preprocess_text(text, language):
    """
    This function applies pre-processing steps that convert forms of words
    considered equivalent into one standardized form.

    As one straightforward step, it case-folds the text. For the purposes of
    wordfreq and related tools, a capitalized word shouldn't have a different
    frequency from its lowercase version.

    The steps that are applied in order, only some of which apply to each
    language, are:

    - NFC or NFKC normalization, as needed for the language
    - Transliteration of multi-script languages
    - Abjad mark removal
    - Case folding
    - Fixing of diacritics

    We'll describe these steps out of order, to start with the more obvious
    steps.


    Case folding
    ------------

    The most common effect of this function is that it case-folds alphabetic
    text to lowercase:

    >>> preprocess_text('Word', 'en')
    'word'

    This is proper Unicode-aware case-folding, so it eliminates distinctions
    in lowercase letters that would not appear in uppercase. This accounts for
    the German ß and the Greek final sigma:

    >>> preprocess_text('groß', 'de')
    'gross'
    >>> preprocess_text('λέξις', 'el')
    'λέξισ'

    In Turkish (and Azerbaijani), case-folding is different, because the
    uppercase and lowercase I come in two variants, one with a dot and one
    without. They are matched in a way that preserves the number of dots, which
    the usual pair of "I" and "i" does not.

    >>> preprocess_text('HAKKINDA İSTANBUL', 'tr')
    'hakkında istanbul'


    Fixing of diacritics
    --------------------

    While we're talking about Turkish: the Turkish alphabet contains letters
    with cedillas attached to the bottom. In the case of "ş" and "ţ", these
    letters are very similar to two Romanian letters, "ș" and "ț", which have
    separate _commas_ below them.

    (Did you know that a cedilla is not the same as a comma under a letter? I
    didn't until I started dealing with text normalization. My keyboard layout
    even inputs a letter with a cedilla when you hit Compose+comma.)

    Because these letters look so similar, and because some fonts only include
    one pair of letters and not the other, there are many cases where the
    letters are confused with each other. Our preprocessing normalizes these
    Turkish and Romanian letters to the letters each language prefers.

    >>> preprocess_text('kișinin', 'tr')   # comma to cedilla
    'kişinin'
    >>> preprocess_text('ACELAŞI', 'ro')   # cedilla to comma
    'același'


    Unicode normalization
    ---------------------

    Unicode text is NFC normalized in most languages, removing trivial
    distinctions between strings that should be considered equivalent in all
    cases:

    >>> word = preprocess_text('natu\N{COMBINING DIAERESIS}rlich', 'de')
    >>> word
    'natürlich'
    >>> '\N{LATIN SMALL LETTER U WITH DIAERESIS}' in word
    True

    NFC normalization is sufficient (and NFKC normalization is a bit too strong)
    for many languages that are written in cased, alphabetic scripts.
    Languages in other scripts tend to need stronger normalization to properly
    compare text. So we use NFC normalization when the language's script is
    Latin, Greek, or Cyrillic, and we use NFKC normalization for all other
    languages.

    Here's an example in Japanese, where preprocessing changes the width (and
    the case) of a Latin letter that's used as part of a word:

    >>> preprocess_text('Uターン', 'ja')
    'uターン'

    In Korean, NFKC normalization is important because it aligns two different
    ways of encoding text -- as individual letters that are grouped together
    into square characters, or as the entire syllables that those characters
    represent:

    >>> word = '\u1102\u1161\u11c0\u1106\u1161\u11af'
    >>> word
    '낱말'
    >>> len(word)
    6
    >>> word = preprocess_text(word, 'ko')
    >>> word
    '낱말'
    >>> len(word)
    2


    Abjad mark removal
    ------------------

    There are many abjad languages, such as Arabic, Hebrew, Persian, and Urdu,
    where words can be marked with vowel points but rarely are. In languages
    that use abjad scripts, we remove all modifiers that are classified by
    Unicode as "marks". We also remove an Arabic character called the tatweel,
    which is used to visually lengthen a word.

    >>> preprocess_text("كَلِمَة", 'ar')
    'كلمة'
    >>> preprocess_text("الحمــــــد", 'ar')
    'الحمد'


    Transliteration of multi-script languages
    -----------------------------------------

    Some languages are written in multiple scripts, and require special care.
    These languages include Chinese, Serbian, and Azerbaijani.

    In Serbian, there is a well-established mapping from Cyrillic letters to
    Latin letters. We apply this mapping so that Serbian is always represented
    in Latin letters.

    >>> preprocess_text('схваташ', 'sr')
    'shvataš'

    The transliteration is more complete than it needs to be to cover just
    Serbian, so that -- for example -- borrowings from Russian can be
    transliterated, instead of coming out in a mixed script.

    >>> preprocess_text('культуры', 'sr')
    "kul'tury"

    Azerbaijani (Azeri) has a similar transliteration step to Serbian,
    and then the Latin-alphabet text is handled similarly to Turkish.

    We don't transliterate Traditional to Simplified Chinese in this step.
    There are some steps where we unify them internally: see chinese.py
    for more information.
    """
    # NFC or NFKC normalization, as needed for the language
    info = get_language_info(language)
    text = unicodedata.normalize(info['normal_form'], text)

    # Transliteration of multi-script languages
    if info['transliteration'] is not None:
        text = transliterate(info['transliteration'], text)

    # Abjad mark removal
    if info['remove_marks']:
        text = remove_marks(text)

    # Case folding
    if info['dotless_i']:
        text = casefold_with_i_dots(text)
    else:
        text = text.casefold()

    # Fixing of diacritics
    if info['diacritics_under'] == 'commas':
        text = cedillas_to_commas(text)
    elif info['diacritics_under'] == 'cedillas':
        text = commas_to_cedillas(text)

    return text


def remove_marks(text):
    """
    Remove decorations from words in abjad scripts:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify an
      Arabic word.
    """
    return MARK_RE.sub('', text)


def casefold_with_i_dots(text):
    """
    Convert capital I's and capital dotted İ's to lowercase in the way
    that's appropriate for Turkish and related languages, then case-fold
    the rest of the letters.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
    return text.casefold()


def commas_to_cedillas(text):
    """
    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
    preferred in Turkish.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
    )


def cedillas_to_commas(text):
    """
    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
    preferred in Romanian.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
    )


def sub_zeroes(match):
    """
    Given a regex match, return what it matched with digits replaced by
    zeroes.
    """
    return DIGIT_RE.sub('0', match.group(0))


def smash_numbers(text):
    """
    Replace sequences of multiple digits with zeroes, so we don't need to
    distinguish the frequencies of thousands of numbers.
    """
    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
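A short usage sketch of the helpers defined in this new module. The first result is taken from the doctests above; the smash_numbers inputs are illustrative:

    from wordfreq.preprocess import preprocess_text, smash_numbers

    preprocess_text('HAKKINDA İSTANBUL', 'tr')   # 'hakkında istanbul'
    smash_numbers('call 555-1234')               # 'call 000-0000': runs of 2+ digits become zeroes
    smash_numbers('7 samurai')                   # '7 samurai': single digits are left alone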
@@ -1,30 +1,22 @@
import regex
import unicodedata
import logging
import langcodes
from .transliterate import serbian_cyrillic_to_latin

mecab_tokenize = None
jieba_tokenize = None
from .language_info import get_language_info, SPACELESS_SCRIPTS
from .preprocess import preprocess_text, smash_numbers

# See the documentation inside TOKEN_RE for why we have to handle these
# scripts specially.
SPACELESS_SCRIPTS = [
    'Hiragana',
    'Thai',  # Thai script
    'Khmr',  # Khmer script
    'Laoo',  # Lao script
    'Mymr',  # Burmese script
    'Tale',  # Tai Le script
    'Talu',  # Tai Lü script
    'Lana',  # Lanna script
]
# Placeholders for CJK functions that we'll import on demand
_mecab_tokenize = None
_jieba_tokenize = None
_simplify_chinese = None

logger = logging.getLogger(__name__)

ABJAD_LANGUAGES = {
    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
}

def _make_spaceless_expr():
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
    scripts = sorted(SPACELESS_SCRIPTS)
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
    return ''.join(pieces)


@@ -116,10 +108,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    \w'
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)

DIGIT_RE = regex.compile('\d')
MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
# Just identify punctuation, for cases where the tokenizer is separate
PUNCT_RE = regex.compile(r"[\p{punct}]+")
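A small illustrative check of the new PUNCT_RE, which the MeCab and Jieba paths below use to drop punctuation-only tokens. This snippet is not part of the diff; it only restates the pattern above:

    import regex

    PUNCT_RE = regex.compile(r"[\p{punct}]+")
    PUNCT_RE.match('"…!')     # matches: a punctuation-only token gets filtered out
    PUNCT_RE.match('iver')    # None: ordinary word tokens are kept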
def simple_tokenize(text, include_punctuation=False):

@@ -162,197 +153,27 @@ def simple_tokenize(text, include_punctuation=False):
        for token in TOKEN_RE.findall(text)
    ]

def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
    """
    global mecab_tokenize
    if not (lang == 'ja' or lang == 'ko'):
        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
    if mecab_tokenize is None:
        from wordfreq.mecab import mecab_tokenize
    tokens = mecab_tokenize(text, lang)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]


def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
    """
    global jieba_tokenize
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize
    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]


def remove_marks(text):
    """
    Remove decorations from words in abjad scripts:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify an
      Arabic word.
    """
    return MARK_RE.sub('', text)


def commas_to_cedillas(text):
    """
    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
    preferred in Turkish.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
    )


def cedillas_to_commas(text):
    """
    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
    preferred in Romanian.

    Only the lowercase versions are replaced, because this assumes the
    text has already been case-folded.
    """
    return text.replace(
        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
    ).replace(
        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
    )

def preprocess_turkish(text):
    """
    Modifies i's so that they case-fold correctly in Turkish, and modifies
    'comma-below' characters to use cedillas.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
    return commas_to_cedillas(text.casefold())


def preprocess_romanian(text):
    """
    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
    """
    return cedillas_to_commas(text.casefold())


def preprocess_serbian(text):
    """
    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
    (which is the unambiguous direction).
    """
    return serbian_cyrillic_to_latin(text)


def sub_zeroes(match):
    """
    Given a regex match, return what it matched with digits replaced by
    zeroes.
    """
    return DIGIT_RE.sub('0', match.group(0))


def smash_numbers(text):
    """
    Replace sequences of multiple digits with zeroes, so we don't need to
    distinguish the frequencies of thousands of numbers.
    """
    return MULTI_DIGIT_RE.sub(sub_zeroes, text)


def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
             combine_numbers=False):
def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Some of the processing steps are specific to one language, such as Chinese,
    but what broadly happens to the text depends on what general writing system
    the language uses, out of these categories:

    - Alphabetic scripts: English, Spanish, Russian, etc.
    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
    - CJK scripts: Chinese, Japanese, Korean
    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.

    The options `include_punctuation`, `external_wordlist`, and
    `combine_numbers` are passed on to the appropriate tokenizer:

    - `include_punctuation` preserves punctuation as tokens, instead of
      removing it.

    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
      instead of wordfreq's wordlist.

    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.


    Alphabetic scripts
    ------------------

    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
    European languages, which are relatively straightforward to tokenize.

    Text in these scripts will be normalized to NFC form, then passed
    through a regular expression that implements the Word Segmentation section
    of Unicode Annex #29, and then case-folded to lowercase.

    The effect is mostly to split the text on spaces and punctuation. There are
    some subtleties involving apostrophes inside words, which the regex will
    only split when they occur before a vowel. ("Hasn't" is one token, but
    "l'enfant" is two.)

    If the language is Turkish, the case-folding rules will take this into
    account, so that capital I and İ map to ı and i respectively.


    Abjad scripts
    -------------

    Languages in the Arabic or Hebrew scripts are written with optional vowel
    marks, and sometimes other decorative markings and ligatures. In these
    languages:

    - The text will be NFKC-normalized, which is a stronger and lossier form
      than NFC. Here its purpose is to reduce ligatures to simpler characters.

    - Marks will be removed, as well as the Arabic tatweel (an extension of
      a word that is used for justification or decoration).

    After these steps, the text will go through the same process as the
    alphabetic scripts above.
    The text will be run through a number of pre-processing steps that vary
    by language; see the docstring of `wordfreq.preprocess.preprocess_text`.

    If `include_punctuation` is True, punctuation will be included as separate
    tokens. Otherwise, punctuation will be omitted in the output.

    CJK scripts
    -----------

    In the CJK languages, word boundaries can't usually be identified by a
    regular expression. Instead, there needs to be some language-specific
    handling.

    - Chinese text first gets converted to a canonical representation we call
      "Oversimplified Chinese", where all characters are replaced by their
      Simplified Chinese form, no matter what, even when this misspells a word or
      a name. This representation is then tokenized using the Jieba tokenizer,
      trained on the list of Chinese words that can be looked up in wordfreq.

    - Japanese and Korean will be NFKC-normalized, then tokenized using the
      MeCab tokenizer, using dictionary files that are included in this
      package.
    handling. In Chinese, we use the Jieba tokenizer, with a custom word list
    to match the words whose frequencies we can look up. In Japanese and
    Korean, we use the MeCab tokenizer.

    The `external_wordlist` option only affects Chinese tokenization. If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.

@@ -364,39 +185,64 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.


    Brahmic scripts and other languages
    -----------------------------------

    Any kind of language not previously mentioned will just go through the same
    tokenizer that alphabetic languages use. This includes the Brahmic scripts
    used in Hindi, Tamil, and Telugu, for example.

    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
    does not support these languages yet. It will split on spaces and
    punctuation, giving tokens that are far too long.
    """
    # Reduce whatever language code was passed in to a normal form,
    # containing just the language subtag.
    lang = langcodes.get(lang).prefer_macrolanguage().language
    if lang == 'ja' or lang == 'ko':
        result = tokenize_mecab_language(text, lang, include_punctuation)
    elif lang == 'zh' or lang == 'yue':
        result = chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
    elif lang == 'sr':
        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        result = simple_tokenize(text, include_punctuation)
    else:
        result = simple_tokenize(text, include_punctuation)
    # Use globals to load CJK tokenizers on demand, so that we can still run
    # in environments that lack the CJK dependencies
    global _mecab_tokenize, _jieba_tokenize

    if combine_numbers:
        result = [smash_numbers(token) for token in result]
    return result
    language = langcodes.get(lang)
    info = get_language_info(language)
    text = preprocess_text(text, language)

    if info['tokenizer'] == 'mecab':
        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
        # Get just the language code out of the Language object, so we can
        # use it to select a MeCab dictionary
        tokens = _mecab_tokenize(text, language.language)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    elif info['tokenizer'] == 'jieba':
        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    else:
        # This is the default case where we use the regex tokenizer. First
        # let's complain a bit if we ended up here because we don't have an
        # appropriate tokenizer.
        if info['tokenizer'] != 'regex':
            logger.warning(
                "The language '{}' is in the '{}' script, which we don't "
                "have a tokenizer for. The results will be bad."
                .format(lang, info['script'])
            )
        tokens = simple_tokenize(text, include_punctuation=include_punctuation)

    return tokens


def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Get a list of tokens for this text, with largely the same results and
    options as `tokenize`, but aggressively normalize some text in a lossy way
    that's good for counting word frequencies.

    In particular:

    - If a token has 2 adjacent digits, all its digits will be replaced with
      the digit '0', so that frequencies for numbers don't have to be counted
      separately. This is similar to word2vec, which replaces them with '#'.

    - In Chinese, unless Traditional Chinese is specifically requested using
      'zh-Hant', all characters will be converted to Simplified Chinese.
    """
    global _simplify_chinese

    info = get_language_info(lang)
    tokens = tokenize(text, lang, include_punctuation, external_wordlist)

    if info['lookup_transliteration'] == 'zh-Hans':
        from wordfreq.chinese import simplify_chinese as _simplify_chinese
        tokens = [_simplify_chinese(token) for token in tokens]

    return [smash_numbers(token) for token in tokens]
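To make the division of labor concrete, a usage sketch of the two public entry points; the expected outputs are copied from the test changes earlier in this commit:

    from wordfreq.tokens import tokenize, lossy_tokenize

    tokenize('"715 - CRΣΣKS" by Bon Iver', 'en')        # ['715', 'crσσks', 'by', 'bon', 'iver']
    lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en')  # ['000', 'crσσks', 'by', 'bon', 'iver']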
@@ -1,6 +1,8 @@
# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
SR_CYRL_TO_LATN_DICT = {
# from the 'cyrtranslit' module. We originally had to reimplement it because
# 'cyrtranslit' didn't work in Python 3; now it does, but we've made the table
# more robust than the one in cyrtranslit.
SR_LATN_TABLE = {
    ord('А'): 'A', ord('а'): 'a',
    ord('Б'): 'B', ord('б'): 'b',
    ord('В'): 'V', ord('в'): 'v',

@@ -55,7 +57,7 @@ SR_CYRL_TO_LATN_DICT = {
    # Ukrainian letters
    ord('Є'): 'Je', ord('є'): 'je',
    ord('І'): 'I', ord('і'): 'i',
    ord('Ї'): 'Ji', ord('ї'): 'ji',
    ord('Ї'): 'Ï', ord('ї'): 'ï',
    ord('Ґ'): 'G', ord('ґ'): 'g',

    # Macedonian letters

@@ -64,7 +66,43 @@ SR_CYRL_TO_LATN_DICT = {
    ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
}

AZ_LATN_TABLE = SR_LATN_TABLE.copy()
AZ_LATN_TABLE.update({
    # Distinct Azerbaijani letters
    ord('Ҹ'): 'C', ord('ҹ'): 'c',
    ord('Ә'): 'Ə', ord('ә'): 'ə',
    ord('Ғ'): 'Ğ', ord('ғ'): 'ğ',
    ord('Һ'): 'H', ord('һ'): 'h',
    ord('Ө'): 'Ö', ord('ө'): 'ö',
    ord('Ҝ'): 'G', ord('ҝ'): 'g',
    ord('Ү'): 'Ü', ord('ү'): 'ü',

def serbian_cyrillic_to_latin(text):
    return text.translate(SR_CYRL_TO_LATN_DICT)
    # Azerbaijani letters with different transliterations
    ord('Ч'): 'Ç', ord('ч'): 'ç',
    ord('Х'): 'X', ord('х'): 'x',
    ord('Ы'): 'I', ord('ы'): 'ı',
    ord('И'): 'İ', ord('и'): 'ı',
    ord('Ж'): 'J', ord('ж'): 'j',
    ord('Ј'): 'Y', ord('ј'): 'y',
    ord('Г'): 'Q', ord('г'): 'q',
    ord('Ш'): 'Ş', ord('ш'): 'ş',
})


def transliterate(table, text):
    """
    Transliterate text according to one of the tables above.

    `table` chooses the table. It looks like a language code but comes from a
    very restricted set:

    - 'sr-Latn' means to convert Serbian, which may be in Cyrillic, into the
      Latin alphabet.
    - 'az-Latn' means the same for Azerbaijani: Cyrillic into the Latin alphabet.
    """
    if table == 'sr-Latn':
        return text.translate(SR_LATN_TABLE)
    elif table == 'az-Latn':
        return text.translate(AZ_LATN_TABLE)
    else:
        raise ValueError("Unknown transliteration table: {!r}".format(table))
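A minimal usage sketch of the rewritten module. The Serbian result is the one shown in the preprocess_text doctests earlier in this commit; the Azerbaijani line only indicates the direction of the mapping, and the last line hits the ValueError branch above:

    from wordfreq.transliterate import transliterate

    transliterate('sr-Latn', 'схваташ')   # 'shvataš'
    transliterate('az-Latn', 'Әли')       # Azerbaijani Cyrillic mapped to Latin letters
    transliterate('el-Latn', 'λέξις')     # raises ValueError: unknown table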