Merge pull request #49 from LuminosoInsight/restore-langcodes

Use langcodes when tokenizing again
Andrew Lin 2017-05-10 16:20:06 -04:00 committed by GitHub
commit 6c118c0b6a
5 changed files with 44 additions and 9 deletions
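
In practice, this means tokenize() once again accepts alternate codes for the same language. A rough sketch of the behavior the new tests exercise, assuming tokenize is importable from the top-level wordfreq package:

# Sketch only; the expected outputs are taken from the tests added in this PR.
from wordfreq import tokenize

tokenize('谢谢谢谢', 'zh-CN')   # ['谢谢', '谢谢'] -- same result as 'zh'
tokenize('谢谢谢谢', 'cmn')     # ['谢谢', '谢谢'] -- Mandarin routed to the Chinese tokenizer
tokenize("qu'un", 'fra')        # ['qu', 'un'] -- same result as 'fr'
tokenize("культуры", 'hbs')     # ["kul'tury"] -- Serbo-Croatian treated as Serbian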


@@ -27,14 +27,14 @@ current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md'),
                        encoding='utf-8').read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes >= 1.4', 'regex >= 2015']
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
 setup(
     name="wordfreq",
-    version='1.6',
+    version='1.6.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',


@@ -66,3 +66,22 @@ def test_combination():
         word_frequency('谢谢谢谢', 'zh'),
         xiexie_freq / 20
     )
+
+
+def test_alternate_codes():
+    # Tokenization of Chinese works when you use other language codes
+    # that are not equal to 'zh'.
+    tokens = ['谢谢', '谢谢']
+
+    # Code with a region attached
+    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+
+    # Over-long codes for Chinese
+    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
+    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+
+    # Separate codes for Mandarin and Cantonese
+    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
+    eq_(tokenize('谢谢谢谢', 'yue'), tokens)


@@ -29,3 +29,12 @@ def test_catastrophes():
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
         ["m'", 'acabo', "d'", 'instal·lar', '.'])
+
+
+def test_alternate_codes():
+    # Try over-long language codes for French and Catalan
+    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
+    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
+        ['m', 'acabo', 'd', 'instal·lar'])


@@ -23,3 +23,11 @@ def test_actually_russian():
         ['sto', 'iz', 'sta', 'pacany'])
 
     eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+
+
+def test_alternate_codes():
+    # Try language codes for Serbo-Croatian that have been split, and now
+    # are canonically mapped to Serbian
+    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
+    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])


@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+import langcodes
 from .transliterate import serbian_cyrillic_to_latin
 
 mecab_tokenize = None
@@ -361,20 +362,18 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     does not support these languages yet. It will split on spaces and
     punctuation, giving tokens that are far too long.
     """
-    # A really simple way to handle language codes with more than just the
-    # language
-    lang = lang.split('-')[0]
+    # Reduce whatever language code was passed in to a normal form,
+    # containing just the language subtag.
+    lang = langcodes.get(lang).prefer_macrolanguage().language
     if lang == 'ja' or lang == 'ko':
         result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh':
+    elif lang == 'zh' or lang == 'yue':
         result = chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
         result = simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
         result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
-        # These are the three language codes that could include Serbian text,
-        # which could be in Cyrillic.
+    elif lang == 'sr':
         result = simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
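
For reference, a minimal sketch of what the new normalization line does on its own. normalized_lang is an illustrative helper, not part of wordfreq, and the expected values follow from the tests and branches added in this PR:

import langcodes

def normalized_lang(code):
    # Illustrative helper: the same expression tokenize() now uses to
    # reduce any language code to its bare language subtag.
    return langcodes.get(code).prefer_macrolanguage().language

normalized_lang('zh-CN')  # 'zh'  -- region subtag dropped
normalized_lang('cmn')    # 'zh'  -- Mandarin folds into the Chinese macrolanguage
normalized_lang('yue')    # 'yue' -- Cantonese keeps its own code, hence the new 'yue' branch
normalized_lang('hbs')    # 'sr'  -- Serbo-Croatian codes map to Serbian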