mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge pull request #22 from LuminosoInsight/standard-tokenizer
Use a more standard Unicode tokenizer
Former-commit-id: e6d9b36203
This commit is contained in:
commit
9fedede771
27
README.md
27
README.md
@ -2,6 +2,7 @@ Tools for working with word frequencies from various corpora.
|
||||
|
||||
Author: Robyn Speer
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
wordfreq requires Python 3 and depends on a few other Python modules
|
||||
@ -21,11 +22,25 @@ install them on Ubuntu:
|
||||
sudo apt-get install mecab-ipadic-utf8 libmecab-dev
|
||||
pip3 install mecab-python3
|
||||
|
||||
## Unicode data
|
||||
|
||||
The tokenizers that split non-Japanese phrases utilize regexes built using the
|
||||
`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
|
||||
update these regexes, run `scripts/gen_regex.py`.
|
||||
## Tokenization
|
||||
|
||||
wordfreq uses the Python package `regex`, which is a more advanced
|
||||
implementation of regular expressions than the standard library, to
|
||||
separate text into tokens that can be counted consistently. `regex`
|
||||
produces tokens that follow the recommendations in [Unicode
|
||||
Annex #29, Text Segmentation][uax29].
|
||||
|
||||
There are language-specific exceptions:
|
||||
|
||||
- In Arabic, it additionally normalizes ligatures and removes combining marks.
|
||||
- In Japanese, instead of using the regex library, it uses the external library
|
||||
`mecab-python3`. This is an optional dependency of wordfreq, and compiling
|
||||
it requires the `libmecab-dev` system package to be installed.
|
||||
- It does not yet attempt to tokenize Chinese ideograms.
|
||||
|
||||
[uax29]: http://unicode.org/reports/tr29/
|
||||
|
||||
|
||||
## License
|
||||
|
||||
@ -56,5 +71,5 @@ sources:
|
||||
|
||||
Some additional data was collected by a custom application that watches the
|
||||
streaming Twitter API, in accordance with Twitter's Developer Agreement &
|
||||
Policy. This software only gives statistics about words that are very commonly
|
||||
used on Twitter; it does not display or republish any Twitter content.
|
||||
Policy. This software gives statistics about words that are commonly used on
|
||||
Twitter; it does not display or republish any Twitter content.
|
||||
|
@ -1,76 +0,0 @@
|
||||
import unicodedata
|
||||
from ftfy import chardata
|
||||
import pathlib
|
||||
from pkg_resources import resource_filename
|
||||
|
||||
|
||||
CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
|
||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||
|
||||
|
||||
def func_to_regex(accept_func):
|
||||
"""
|
||||
Given a function that returns True or False for a numerical codepoint,
|
||||
return a regex character class accepting the characters resulting in True.
|
||||
Ranges separated only by unassigned characters are merged for efficiency.
|
||||
"""
|
||||
# parsing_range is True if the current codepoint might be in a range that
|
||||
# the regex will accept
|
||||
parsing_range = False
|
||||
ranges = []
|
||||
|
||||
for codepoint, category in enumerate(CATEGORIES):
|
||||
if accept_func(codepoint):
|
||||
if not parsing_range:
|
||||
ranges.append([codepoint, codepoint])
|
||||
parsing_range = True
|
||||
else:
|
||||
ranges[-1][1] = codepoint
|
||||
elif category != 'Cn':
|
||||
parsing_range = False
|
||||
|
||||
return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
|
||||
|
||||
|
||||
def cache_regex_from_func(filename, func):
|
||||
"""
|
||||
Generates a regex from a function that accepts a single unicode character,
|
||||
and caches it in the data path at filename.
|
||||
"""
|
||||
with (DATA_PATH / filename).open(mode='w') as file:
|
||||
file.write(func_to_regex(func))
|
||||
|
||||
|
||||
def _is_emoji_codepoint(i):
|
||||
"""
|
||||
Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
|
||||
character (as future-proofed by the ftfy chardata module) but excluding
|
||||
symbols like © and ™ below U+2600 and the replacement character U+FFFD.
|
||||
"""
|
||||
return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd
|
||||
|
||||
|
||||
def _is_non_punct_codepoint(i):
|
||||
"""
|
||||
Report whether a numerical codepoint is not one of the following classes:
|
||||
- P: punctuation
|
||||
- S: symbols
|
||||
- Z: separators
|
||||
- C: control characters
|
||||
This will classify symbols, including emoji, as punctuation; users that
|
||||
want to accept emoji should add them separately.
|
||||
"""
|
||||
return CATEGORIES[i][0] not in 'PSZC'
|
||||
|
||||
|
||||
def _is_combining_mark_codepoint(i):
|
||||
"""
|
||||
Report whether a numerical codepoint is a combining mark (Unicode 'M').
|
||||
"""
|
||||
return CATEGORIES[i][0] == 'M'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cache_regex_from_func('emoji.txt', _is_emoji_codepoint)
|
||||
cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint)
|
||||
cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint)
|
4
setup.py
4
setup.py
@ -26,14 +26,14 @@ classifiers = [
|
||||
current_dir = os.path.dirname(__file__)
|
||||
README_contents = open(os.path.join(current_dir, 'README.md')).read()
|
||||
doclines = README_contents.split("\n")
|
||||
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
|
||||
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
|
||||
if sys.version_info < (3, 4):
|
||||
dependencies.append('pathlib')
|
||||
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.0',
|
||||
version='1.1',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@ -95,13 +95,17 @@ def test_failed_cB_conversion():
|
||||
def test_tokenization():
|
||||
# We preserve apostrophes within words, so "can't" is a single word in the
|
||||
# data
|
||||
eq_(tokenize("can't", 'en'), ["can't"])
|
||||
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
|
||||
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
|
||||
|
||||
# Certain punctuation does not inherently split a word.
|
||||
eq_(tokenize("Anything is possible at zombo.com", 'en'),
|
||||
['anything', 'is', 'possible', 'at', 'zombo.com'])
|
||||
|
||||
# Splits occur after symbols, and at splitting punctuation such as hyphens.
|
||||
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
|
||||
|
||||
# We do split at other punctuation, causing the word-combining rule to
|
||||
# apply.
|
||||
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
||||
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
|
||||
|
||||
|
||||
def test_casefolding():
|
||||
@ -110,11 +114,11 @@ def test_casefolding():
|
||||
|
||||
|
||||
def test_phrase_freq():
|
||||
plant = word_frequency("plan.t", 'en')
|
||||
assert_greater(plant, 0)
|
||||
ff = word_frequency("flip-flop", 'en')
|
||||
assert_greater(ff, 0)
|
||||
assert_almost_equal(
|
||||
1.0 / plant,
|
||||
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
|
||||
1.0 / ff,
|
||||
1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
|
||||
)
|
||||
|
||||
|
||||
@ -134,8 +138,8 @@ def test_not_really_random():
|
||||
def test_not_enough_ascii():
|
||||
random_ascii_words(lang='zh')
|
||||
|
||||
def test_ar():
|
||||
|
||||
def test_ar():
|
||||
# Remove tatweels
|
||||
eq_(
|
||||
tokenize('متــــــــعب', 'ar'),
|
||||
@ -152,3 +156,16 @@ def test_ar():
|
||||
tokenize('\ufefb', 'ar'), # An Arabic ligature...
|
||||
['\u0644\u0627'] # ...that is affected by NFKC normalization
|
||||
)
|
||||
|
||||
|
||||
def test_ideographic_fallback():
|
||||
# Try tokenizing Chinese text -- it should remain stuck together.
|
||||
eq_(tokenize('中国文字', 'zh'), ['中国文字'])
|
||||
|
||||
# When Japanese is tagged with the wrong language, it will be split
|
||||
# at script boundaries.
|
||||
ja_text = 'ひらがなカタカナromaji'
|
||||
eq_(
|
||||
tokenize(ja_text, 'en'),
|
||||
['ひらがな', 'カタカナ', 'romaji']
|
||||
)
|
||||
|
@ -1,14 +1,13 @@
|
||||
from wordfreq.tokens import tokenize, simple_tokenize
|
||||
from pkg_resources import resource_filename
|
||||
from functools import lru_cache
|
||||
import langcodes
|
||||
import msgpack
|
||||
import re
|
||||
import gzip
|
||||
import itertools
|
||||
import pathlib
|
||||
import random
|
||||
import logging
|
||||
import unicodedata
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
|
||||
CACHE_SIZE = 100000
|
||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||
|
||||
def load_range(filename):
|
||||
"""
|
||||
Load a file from the data path.
|
||||
"""
|
||||
with (DATA_PATH / filename).open() as file:
|
||||
return file.read()
|
||||
|
||||
EMOJI_RANGE = load_range('emoji.txt')
|
||||
NON_PUNCT_RANGE = load_range('non_punct.txt')
|
||||
COMBINING_MARK_RANGE = load_range('combining_mark.txt')
|
||||
|
||||
COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
|
||||
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
|
||||
|
||||
|
||||
def simple_tokenize(text):
|
||||
"""
|
||||
A simple tokenizer that can be applied to most languages.
|
||||
|
||||
It considers a word to be made of a sequence of 'token characters', an
|
||||
overly inclusive range that includes letters, Han characters, emoji, and a
|
||||
bunch of miscellaneous whatnot, but excludes most punctuation and
|
||||
whitespace.
|
||||
|
||||
The single complication for the sake of English is that apostrophes are not
|
||||
considered part of the token if they appear on the edge of the character
|
||||
sequence, but they are if they appear internally. "cats'" is not a token,
|
||||
but "cat's" is.
|
||||
"""
|
||||
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
|
||||
mecab_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
"""
|
||||
Tokenize this text in a way that's straightforward but appropriate for
|
||||
the language.
|
||||
|
||||
So far, this means that Japanese is handled by mecab_tokenize, and
|
||||
everything else is handled by simple_tokenize. Additionally, Arabic commas
|
||||
and combining marks are removed.
|
||||
|
||||
Strings that are looked up in wordfreq will be run through this function
|
||||
first, so that they can be expected to match the data.
|
||||
"""
|
||||
if lang == 'ja':
|
||||
global mecab_tokenize
|
||||
if mecab_tokenize is None:
|
||||
from wordfreq.mecab import mecab_tokenize
|
||||
return mecab_tokenize(text)
|
||||
|
||||
if lang == 'ar':
|
||||
text = standardize_arabic(text)
|
||||
|
||||
return simple_tokenize(text)
|
||||
|
||||
|
||||
def standardize_arabic(text):
|
||||
"""
|
||||
Standardizes arabic text by removing combining marks and tatweels.
|
||||
"""
|
||||
return unicodedata.normalize(
|
||||
'NFKC',
|
||||
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
|
||||
)
|
||||
# simple_tokenize is imported so that other things can import it from here.
|
||||
# Suppress the pyflakes warning.
|
||||
simple_tokenize = simple_tokenize
|
||||
|
||||
|
||||
def read_cBpack(filename):
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +0,0 @@
|
||||
[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]
|
@ -1 +0,0 @@
|
||||
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🄍-]
|
@ -1 +0,0 @@
|
||||
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ0-9A-Za-zヲ-ᅵ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,4 +1,5 @@
|
||||
import MeCab
|
||||
import unicodedata
|
||||
|
||||
|
||||
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
|
||||
@ -14,6 +15,7 @@ def mecab_tokenize(text):
|
||||
contains the same table that the command-line version of MeCab would output.
|
||||
We find the tokens in the first column of this table.
|
||||
"""
|
||||
text = unicodedata.normalize('NFKC', text.strip())
|
||||
return [line.split('\t')[0]
|
||||
for line in MECAB_ANALYZER.parse(text.strip()).split('\n')
|
||||
for line in MECAB_ANALYZER.parse(text).split('\n')
|
||||
if line != '' and line != 'EOS']
|
||||
|
114
wordfreq/tokens.py
Normal file
114
wordfreq/tokens.py
Normal file
@ -0,0 +1,114 @@
|
||||
import regex
|
||||
import unicodedata
|
||||
|
||||
|
||||
TOKEN_RE = regex.compile(r"""
|
||||
# Case 1: a special case for Chinese and Japanese
|
||||
# -----------------------------------------------
|
||||
|
||||
# When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
|
||||
# (\p{Script=Hiragana}), we allow a sequence of those characters to be
|
||||
# glued together as a single token. Without this case, the standard rule
|
||||
# (case 2) would make each character a separate token. This would be the
|
||||
# correct behavior for word-wrapping, but a messy failure mode for NLP
|
||||
# tokenization.
|
||||
#
|
||||
# It is, of course, better to use a tokenizer that is designed for Chinese
|
||||
# or Japanese text. This is effectively a fallback for when the wrong
|
||||
# tokenizer is used.
|
||||
#
|
||||
# This rule is listed first so that it takes precedence.
|
||||
|
||||
[\p{IsIdeo}\p{Script=Hiragana}]+ |
|
||||
|
||||
# Case 2: standard Unicode segmentation
|
||||
# -------------------------------------
|
||||
|
||||
# The start of the token must be 'word-like', not punctuation or whitespace
|
||||
# or various other things. However, we allow characters of category So
|
||||
# (Symbol - Other) because many of these are emoji, which can convey
|
||||
# meaning.
|
||||
|
||||
[\w\p{So}]
|
||||
|
||||
# The rest of the token matches characters that are not any sort of space
|
||||
# (\S) and do not cause word breaks according to the Unicode word
|
||||
# segmentation heuristic (\B).
|
||||
|
||||
(?:\B\S)*
|
||||
""", regex.V1 | regex.WORD | regex.VERBOSE)
|
||||
|
||||
ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
|
||||
|
||||
|
||||
def simple_tokenize(text):
|
||||
"""
|
||||
Tokenize the given text using a straightforward, Unicode-aware token
|
||||
expression.
|
||||
|
||||
The expression mostly implements the rules of Unicode Annex #29 that
|
||||
are contained in the `regex` module's word boundary matching, including
|
||||
the refinement that splits words between apostrophes and vowels in order
|
||||
to separate tokens such as the French article «l'». Our customizations
|
||||
to the expression are:
|
||||
|
||||
- It leaves sequences of Chinese or Japanese characters (specifically, Han
|
||||
ideograms and hiragana) relatively untokenized, instead of splitting each
|
||||
character into its own token.
|
||||
|
||||
- It outputs only the tokens that start with a word-like character, or
|
||||
miscellaneous symbols such as emoji.
|
||||
|
||||
- It breaks on all spaces, even the "non-breaking" ones.
|
||||
"""
|
||||
text = unicodedata.normalize('NFC', text)
|
||||
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
|
||||
def remove_arabic_marks(text):
|
||||
"""
|
||||
Remove decorations from Arabic words:
|
||||
|
||||
- Combining marks of class Mn, which tend to represent non-essential
|
||||
vowel markings.
|
||||
- Tatweels, horizontal segments that are used to extend or justify a
|
||||
word.
|
||||
"""
|
||||
return ARABIC_MARK_RE.sub('', text)
|
||||
|
||||
|
||||
mecab_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
"""
|
||||
Tokenize this text in a way that's relatively simple but appropriate for
|
||||
the language.
|
||||
|
||||
So far, this means:
|
||||
|
||||
- Chinese is presumed to already be tokenized. (Sorry. It's hard.)
|
||||
- Japanese will be delegated to the external mecab-python3 module.
|
||||
- Chinese or Japanese texts that aren't identified as the appropriate
|
||||
language will only split on punctuation and script boundaries, giving
|
||||
you untokenized globs of characters that probably represent many words.
|
||||
- All other languages will be tokenized using a regex that mostly
|
||||
implements the Word Segmentation section of Unicode Annex #29.
|
||||
See `simple_tokenize` for details.
|
||||
|
||||
Additionally, the text will be case-folded to lowercase, and text marked
|
||||
as Arabic will be normalized more strongly and have combining marks and
|
||||
tatweels removed.
|
||||
|
||||
Strings that are looked up in wordfreq will be run through this function
|
||||
first, so that they can be expected to match the data.
|
||||
"""
|
||||
if lang == 'ja':
|
||||
global mecab_tokenize
|
||||
if mecab_tokenize is None:
|
||||
from wordfreq.mecab import mecab_tokenize
|
||||
return mecab_tokenize(text)
|
||||
|
||||
if lang == 'ar':
|
||||
text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
|
||||
|
||||
return simple_tokenize(text)
|
||||
|
@ -6,7 +6,7 @@ def test_tokenizer_1():
|
||||
text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
|
||||
tokens = [
|
||||
'this', 'is', 'a', 'test', 'she', 'said',
|
||||
'and', "i'll", 'bet', "y'all", '3', '50', 'that',
|
||||
'and', "i'll", 'bet', "y", "all", '3.50', 'that',
|
||||
'it', "won't", 'fail',
|
||||
]
|
||||
result = cld2_surface_tokenizer(text)
|
||||
|
20
wordfreq_builder/tests/test_urls.py
Normal file
20
wordfreq_builder/tests/test_urls.py
Normal file
@ -0,0 +1,20 @@
|
||||
from wordfreq_builder.word_counts import URL_RE
|
||||
from nose.tools import eq_
|
||||
|
||||
|
||||
def check_url(url):
|
||||
match = URL_RE.match(url)
|
||||
assert match
|
||||
eq_(match.span(), (0, len(url)))
|
||||
|
||||
|
||||
def test_url_re():
|
||||
# URLs like this are all over the Arabic Wikipedia. Here's one with the
|
||||
# student ID blanked out.
|
||||
yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx'
|
||||
|
||||
yield check_url, 'https://example.com/űnicode.html'
|
||||
yield check_url, 'http://☃.net'
|
||||
|
||||
assert not URL_RE.match('ftp://127.0.0.1')
|
||||
|
@ -123,7 +123,6 @@ def google_books_deps(dirname_in):
|
||||
|
||||
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
||||
languages):
|
||||
|
||||
lines = []
|
||||
|
||||
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
|
||||
|
@ -1,7 +1,6 @@
|
||||
from html.entities import name2codepoint
|
||||
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
|
||||
from wordfreq import tokenize
|
||||
from ftfy.fixes import unescape_html
|
||||
import re
|
||||
import regex
|
||||
import pycld2
|
||||
|
||||
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||
@ -11,19 +10,22 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||
'\x0e-\x1f',
|
||||
'\x7f-\x9f',
|
||||
'\ud800-\udfff',
|
||||
'\ufdd0-\ufdef'
|
||||
'\ufdd0-\ufdef',
|
||||
'\N{HANGUL FILLER}',
|
||||
'\N{HANGUL CHOSEONG FILLER}',
|
||||
'\N{HANGUL JUNGSEONG FILLER}'
|
||||
] +
|
||||
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
||||
)
|
||||
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
|
||||
CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
|
||||
|
||||
TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
|
||||
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
|
||||
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||
|
||||
|
||||
def cld2_surface_tokenizer(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||
"""
|
||||
text = unescape_html(text)
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
@ -35,7 +37,7 @@ def cld2_surface_tokenizer(text):
|
||||
|
||||
def cld2_detect_language(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language
|
||||
Uses CLD2 to detect the language.
|
||||
"""
|
||||
# Format of pycld2.detect:
|
||||
# (Confident in result: bool,
|
||||
@ -45,16 +47,19 @@ def cld2_detect_language(text):
|
||||
# Language code: str
|
||||
# Percent of text in this language: float
|
||||
# Confidence score: float))
|
||||
|
||||
|
||||
text = CLD2_BAD_CHARS_RE.sub('', text)
|
||||
return pycld2.detect(text)[2][0][1]
|
||||
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||
"""
|
||||
Process a file by running it through the given tokenizer, sorting the
|
||||
results by the language of each line, and inserting newlines
|
||||
to mark the token boundaries.
|
||||
Process a file by running it through the Twitter-specific tokenizer,
|
||||
which uses cld2 for language detection, and removes Twitter handles
|
||||
and t.co URLs.
|
||||
|
||||
Produces output files that are separated by language, with newlines
|
||||
between the tokens.
|
||||
"""
|
||||
out_files = {}
|
||||
with open(in_filename, encoding='utf-8') as in_file:
|
||||
|
@ -6,6 +6,12 @@ import math
|
||||
import csv
|
||||
import msgpack
|
||||
import gzip
|
||||
import regex
|
||||
|
||||
|
||||
# Match common cases of URLs: the scheme http:// or https:// followed by
|
||||
# non-whitespace characters.
|
||||
URL_RE = regex.compile(r'https?://(?:\S)+')
|
||||
|
||||
|
||||
def count_tokens(filename):
|
||||
@ -13,11 +19,13 @@ def count_tokens(filename):
|
||||
Count tokens that appear in a file, running each line through our
|
||||
simple tokenizer.
|
||||
|
||||
Unicode errors in the input data will become token boundaries.
|
||||
URLs will be skipped, and Unicode errors will become separate tokens
|
||||
containing '�' (U+FFFD, the Unicode replacement character).
|
||||
"""
|
||||
counts = defaultdict(int)
|
||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
||||
for line in infile:
|
||||
line = URL_RE.sub('', line.strip())
|
||||
for token in simple_tokenize(line):
|
||||
counts[token] += 1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user