Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00
Merge pull request #11 from LuminosoInsight/split-emoji

wordfreq now splits emoji from text

Former-commit-id: 6c76942da2
This commit is contained in: commit 96b75dcf2b
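In practice, the change means that tokenize (newly exported from the wordfreq package in this commit) returns emoji as separate tokens instead of leaving them attached to neighboring words, while apostrophes within words are still preserved. A minimal sketch of the new behavior, with expected values taken from the tests added below (assumes an installed wordfreq with its data):

    from wordfreq import tokenize, word_frequency

    print(tokenize('😂test', 'en'))   # ['😂', 'test']  -- emoji split from the word
    print(tokenize("can't", 'en'))    # ["can't"]       -- apostrophes stay inside words
    print(word_frequency("can't", 'en') > 0)   # True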
@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():

 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
     assert_greater(word_frequency("can't", 'en'), 0)
     eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
+
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

     # We do split at other punctuation, causing the word-combining rule to
     # apply.
     assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])


 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
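The "word-combining rule" mentioned in the comment is how word_frequency handles a query that tokenizes into more than one token: the phrase's frequency is derived from the frequencies of its parts (the exact combining formula is internal to wordfreq and not part of this diff). What these tests assert, as a runnable snippet:

    from wordfreq import word_frequency, tokenize

    assert tokenize("can.t", 'en') == ['can', 't']    # splits at the period
    assert word_frequency("can.t", 'en') > 0          # combined from 'can' and 't'
    assert 0 < word_frequency("plan.t", 'en') < word_frequency('plan', 'en')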
@@ -1,6 +1,9 @@
 from pkg_resources import resource_filename
 from functools import lru_cache
+import unicodedata
+from ftfy import chardata
 import langcodes
+import itertools
 import msgpack
 import re
 import gzip
@@ -9,14 +12,128 @@ import random
 import logging
 logger = logging.getLogger(__name__)

-NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]'
-NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
-TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

 CACHE_SIZE = 100000

+def _emoji_char_class():
+    """
+    Build a regex for emoji substitution. First we create a regex character set
+    (like "[a-cv-z]") matching characters we consider emoji (see the docstring
+    of _replace_problem_text()). The final regex matches one such character
+    followed by any number of spaces and identical characters.
+    """
+    ranges = []
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+
+EMOJI_RANGE = _emoji_char_class()
+
+def _non_punct_class():
+    """
+    Builds a regex that matches anything that is not one of the following
+    classes:
+    - P: punctuation
+    - S: symbols
+    - Z: separators
+    - C: control characters
+    This will classify symbols, including emoji, as punctuation; callers that
+    want to treat emoji separately should filter them out first.
+    """
+    non_punct_file = DATA_PATH / 'non_punct.txt'
+    try:
+        with non_punct_file.open() as file:
+            return file.read()
+    except FileNotFoundError:
+        out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
+
+        with non_punct_file.open(mode='w') as file:
+            file.write(out)
+
+        return out
+
+def _combining_mark_class():
+    """
+    Builds a regex that matches anything that is a combining mark
+    """
+    _combining_mark_file = DATA_PATH / 'combining_mark.txt'
+    try:
+        with _combining_mark_file.open() as file:
+            return file.read()
+    except FileNotFoundError:
+        out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
+
+        with _combining_mark_file.open(mode='w') as file:
+            file.write(out)
+
+        return out
+
+def func_to_ranges(accept):
+    """
+    Converts a function that accepts a single unicode character into a list of
+    ranges. Unassigned unicode characters are automatically accepted.
+    """
+    ranges = []
+    start = None
+    for x in range(0x110000):
+        cat = unicodedata.category(chr(x))
+        if cat == 'Cn' or accept(chr(x)):
+            if start is None:
+                start = x
+        else:
+            if start is not None:
+                ranges.append((start, x-1))
+                start = None
+
+    if start is not None:
+        ranges.append((start, x))
+
+    return ranges
+
+
+unassigned_ranges = None
+
+def func_to_regex(accept):
+    """
+    Converts a function that accepts a single unicode character into a regex.
+    Unassigned unicode characters are treated like their neighbors.
+    """
+    ranges = []
+    start = None
+    for x in range(0x110000):
+        cat = unicodedata.category(chr(x))
+        if cat == 'Cn' or accept(chr(x)):
+            if start is None:
+                start = x
+        else:
+            if start is not None:
+                ranges.append((start, x-1))
+                start = None
+
+    if start is not None:
+        ranges.append((start, x))
+
+    global unassigned_ranges
+    if unassigned_ranges is None:
+        unassigned_ranges = set(func_to_ranges(lambda _: False))
+
+    ranges = [range for range in ranges if range not in unassigned_ranges]
+
+    return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end))
+                            for start, end in ranges)
+
+
+COMBINING_MARK_RE = re.compile(_combining_mark_class())
+NON_PUNCT_RANGE = _non_punct_class()
+
+TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 def simple_tokenize(text):
     """
@@ -34,21 +151,29 @@ def simple_tokenize(text):
     """
     return [token.lower() for token in TOKEN_RE.findall(text)]


+mecab_tokenize = None
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for
     the language.

     So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize.
+    everything else is handled by simple_tokenize. Additionally, Arabic commas
+    and combining marks are removed.

     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
     """
     if lang == 'ja':
-        from wordfreq.mecab import mecab_tokenize
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
         return mecab_tokenize(text)
+    elif lang == 'ar':
+        tokens = simple_tokenize(text)
+        tokens = [token.replace('ـ', '') for token in tokens]  # remove tatweel
+        tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens]
+        return [token for token in tokens if token]  # remove empty strings
     else:
         return simple_tokenize(text)
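To make the moving parts above concrete: EMOJI_RANGE and NON_PUNCT_RANGE are plain regex character classes, and the new TOKEN_RE matches either a single emoji character or a run of non-punctuation characters with optional internal apostrophes, which simple_tokenize then lowercases. A toy sketch of that pattern using simplified stand-in classes (the real classes are the generated ones above, not these):

    import re

    # Stand-ins for illustration only; wordfreq builds the real classes with
    # _emoji_char_class() and _non_punct_class().
    EMOJI_RANGE = '[\U0001F600-\U0001F64F]'     # a small slice of the emoji block
    NON_PUNCT_RANGE = "[0-9A-Za-z]"             # ASCII letters and digits only

    TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))

    print([token.lower() for token in TOKEN_RE.findall("Can't 😂test")])
    # ["can't", '😂', 'test']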

wordfreq/data/combining_mark.txt (new file)
@@ -0,0 +1 @@
[̀-ͯ҃-҉-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙-ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ--়া-্-ৢ-ৼ--ੰ-ੱੵ--઼ા-ૢ---଼ା-ୢ--ஂ---ఄా-ౢ-ಀ-಄-಼ಾ-ೝೢ-ೳ-ഄാ-്൏-ൟൢ---ෳั-ัิ-็-๎ັ-ັິ-ຼ-༘-༙༵-༵༷-༹༷-༹༾-༿-྄྆-྇ྍ-࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ-፟ᜒ-ᜟᜲ-᜴ᝒ--឴-៓៝-᠋-᠍ᢩ-ᢩᤝ--ᧀᧈ-ᨗ-ᩕ-᩿-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-ᰤ--᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⳯-⳱-⵿-〪ⷿ-〯-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ-ꢁꢴ--꣱ꤦ-꤭ꥇ--ꦃ꦳-꧀ꨩ-ꩃ-ꩃꩌ-ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-ꯣ-ꯪ꯬-ﬞ-ﬞ﷾-️-𐇽︯-𐨁-𐨏𐨴-𐨿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺-𑄂𑄧-𑅄-𑆂𑆳-𑇀𑚫-𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄-]

wordfreq/data/non_punct.txt (new file)
@@ -0,0 +1 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ---ॣ०-९ॱ-ৱ৴-৹ৼ-૯-୯ୱ-௲-౾ಀ-൸ൺ-ෳ-เ-๎๐-๙-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-࿆-࿆-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ-ᐁ-ᙬᙯ-ᙿᚁ-ᚚ-ᛪᛮ-᜴-៓ៗ-ៗៜ-᠋-᠍᠏-᥆-ᨀ-ᨠ-ᪧ-ᪧ-᭙᭫-᭳᭽-ᰀ-᱀-ᱽ-᳔᳒-ᾼι-ιῂ-ῌῐ-ῠ-Ῥ-ῼ⁰-⁹ⁿ-₉-₻-ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐--⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳽-⳽ⴀ-ⵯ-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼-゚ゝ-ゟァ-ヺー-㆒-㆕ㆠ-ㆿ-ㇿ-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一--ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵-ꡳ-꣐-ꣷꣻ-꤭ꤰ-ꥠ-꧀-ꧠ-ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️-︯-0-9A-Za-zヲ---𐅀-𐅸𐆊-𐇽-𐎠-𐏏𐏑-𐡘-𐤠---𐩾𐪀-𐭀-𑁆-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑴-𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-🃠-🄏🝴--]
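Both data files are just cached copies of the character classes that _non_punct_class() and _combining_mark_class() compute from the Unicode database; as the code above shows, deleting a file makes wordfreq regenerate it on the next call. A rough sketch of that computation, assuming func_to_regex is importable from the top-level wordfreq module (the module path is not shown in this diff):

    import re
    import unicodedata
    from wordfreq import func_to_regex   # assumed import location

    # Rebuild the combining-mark class the same way _combining_mark_class() does
    # when combining_mark.txt is missing: accept every character in category M*.
    combining_class = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
    combining_re = re.compile(combining_class)

    # Stripping combining marks from decomposed text removes the accents:
    decomposed = unicodedata.normalize('NFD', 'café')
    print(combining_re.sub('', decomposed))   # 'cafe'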