Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
Merge pull request #49 from LuminosoInsight/restore-langcodes
Use langcodes when tokenizing again
Commit: 6c118c0b6a
setup.py: 4 lines changed
--- a/setup.py
+++ b/setup.py
@@ -27,14 +27,14 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
 
 setup(
     name="wordfreq",
-    version='1.6',
+    version='1.6.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
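
The setup.py hunk does two things: it pins langcodes to >= 1.4 and bumps wordfreq's version to 1.6.1. The diff does not state why the pin is needed; presumably it guarantees that langcodes provides the prefer_macrolanguage() call the tokenizer now uses. A minimal hedged check under that assumption:

    # Hedged sketch, not part of the commit: confirm the installed langcodes
    # supports the call the new tokenize() relies on.
    import langcodes

    tag = langcodes.get('zh-CN').prefer_macrolanguage()
    print(tag.language)   # expected to print 'zh'
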
@@ -66,3 +66,22 @@ def test_combination():
         word_frequency('谢谢谢谢', 'zh'),
         xiexie_freq / 20
     )
+
+
+def test_alternate_codes():
+    # Tokenization of Chinese works when you use other language codes
+    # that are not equal to 'zh'.
+    tokens = ['谢谢', '谢谢']
+
+    # Code with a region attached
+    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+
+    # Over-long codes for Chinese
+    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
+    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+
+    # Separate codes for Mandarin and Cantonese
+    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
+    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+
+
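
This new Chinese test only passes because tokenize() (changed at the bottom of this diff) normalizes the language code before dispatching. A rough illustration, assuming langcodes applies its usual CLDR-style aliasing, of how each code used in the test reduces:

    # Sketch, not from the commit: normalize the test's language codes the
    # same way the new tokenize() does.
    import langcodes

    for code in ['zh-CN', 'chi', 'zho', 'cmn', 'yue']:
        print(code, '->', langcodes.get(code).prefer_macrolanguage().language)
    # Expected: 'zh-CN', 'chi', 'zho' and 'cmn' all reduce to 'zh', while
    # 'yue' stays 'yue' -- which is why the tokenize() branch below now
    # reads `elif lang == 'zh' or lang == 'yue':`.
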
@@ -29,3 +29,12 @@ def test_catastrophes():
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
         ["m'", 'acabo', "d'", 'instal·lar', '.'])
+
+
+def test_alternate_codes():
+    # Try over-long language codes for French and Catalan
+    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
+    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
+        ['m', 'acabo', 'd', 'instal·lar'])
+
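
The same normalization covers French and Catalan: the three-letter ISO 639-2 codes collapse to the two-letter codes that wordfreq's apostrophe handling keys on. A small hedged check:

    # Sketch, not from the commit.
    import langcodes

    for code in ['fra', 'fre', 'cat']:
        print(code, '->', langcodes.get(code).prefer_macrolanguage().language)
    # Expected: 'fra' and 'fre' both become 'fr', and 'cat' becomes 'ca'.
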
@@ -23,3 +23,11 @@ def test_actually_russian():
         ['sto', 'iz', 'sta', 'pacany'])
 
     eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+
+
+def test_alternate_codes():
+    # Try language codes for Serbo-Croatian that have been split, and now
+    # are canonically mapped to Serbian
+    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
+    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+
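
These codes are why the explicit 'sh'/'hbs' checks can be dropped from tokenize() below: langcodes maps the split Serbo-Croatian codes to Serbian. A sketch, assuming langcodes follows the CLDR replacement of deprecated codes:

    # Sketch, not from the commit.
    import langcodes

    for code in ['sh', 'hbs']:
        print(code, '->', langcodes.get(code).prefer_macrolanguage().language)
    # Expected: both print 'sr', so both reach the 'sr' branch of tokenize(),
    # which applies preprocess_serbian().
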
@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+import langcodes
 from .transliterate import serbian_cyrillic_to_latin
 
 mecab_tokenize = None
@@ -361,20 +362,18 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
-    # A really simple way to handle language codes with more than just the
-    # language
-    lang = lang.split('-')[0]
+    # Reduce whatever language code was passed in to a normal form,
+    # containing just the language subtag.
+    lang = langcodes.get(lang).prefer_macrolanguage().language
     if lang == 'ja' or lang == 'ko':
         result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh':
+    elif lang == 'zh' or lang == 'yue':
         result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
         result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
         result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
-        # These are the three language codes that could include Serbian text,
-        # which could be in Cyrillic.
+    elif lang == 'sr':
         result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
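
The two hunks above (apparently wordfreq's tokens module, where tokenize() lives) carry the core change: the old `lang.split('-')[0]` truncation is replaced by a real normalization through langcodes. A self-contained sketch of just that step; the helper name normalize_lang is illustrative and not part of the commit:

    # Hedged, self-contained sketch of the new normalization step.
    import langcodes

    def normalize_lang(lang):
        # Reduce whatever code was passed in to just the language subtag,
        # preferring the macrolanguage (e.g. 'cmn' -> 'zh').
        return langcodes.get(lang).prefer_macrolanguage().language

    for raw in ['zh-CN', 'cmn', 'yue', 'fra', 'cat', 'sh', 'hbs', 'pt-BR']:
        print(raw, '->', normalize_lang(raw))
    # The old `lang.split('-')[0]` handled 'zh-CN' and 'pt-BR' but left
    # 'cmn', 'fra', 'cat' and 'hbs' untouched; after this change they are
    # expected to normalize to 'zh', 'fr', 'ca' and 'sr' respectively, so
    # the Serbian branch no longer needs to list 'sh' and 'hbs' itself.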