Merge pull request #22 from LuminosoInsight/standard-tokenizer

Use a more standard Unicode tokenizer
This commit is contained in:
Andrew Lin 2015-08-27 11:56:19 -04:00
commit e6d9b36203
43 changed files with 218 additions and 179 deletions

View File

@ -2,6 +2,7 @@ Tools for working with word frequencies from various corpora.
Author: Rob Speer Author: Rob Speer
## Installation ## Installation
wordfreq requires Python 3 and depends on a few other Python modules wordfreq requires Python 3 and depends on a few other Python modules
@ -21,11 +22,25 @@ install them on Ubuntu:
sudo apt-get install mecab-ipadic-utf8 libmecab-dev sudo apt-get install mecab-ipadic-utf8 libmecab-dev
pip3 install mecab-python3 pip3 install mecab-python3
## Unicode data
The tokenizers that split non-Japanese phrases utilize regexes built using the ## Tokenization
`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
update these regexes, run `scripts/gen_regex.py`. wordfreq uses the Python package `regex`, which is a more advanced
implementation of regular expressions than the standard library, to
separate text into tokens that can be counted consistently. `regex`
produces tokens that follow the recommendations in [Unicode
Annex #29, Text Segmentation][uax29].
There are language-specific exceptions:
- In Arabic, it additionally normalizes ligatures and removes combining marks.
- In Japanese, instead of using the regex library, it uses the external library
`mecab-python3`. This is an optional dependency of wordfreq, and compiling
it requires the `libmecab-dev` system package to be installed.
- It does not yet attempt to tokenize Chinese ideograms.
[uax29]: http://unicode.org/reports/tr29/
## License ## License
@ -56,5 +71,5 @@ sources:
Some additional data was collected by a custom application that watches the Some additional data was collected by a custom application that watches the
streaming Twitter API, in accordance with Twitter's Developer Agreement & streaming Twitter API, in accordance with Twitter's Developer Agreement &
Policy. This software only gives statistics about words that are very commonly Policy. This software gives statistics about words that are commonly used on
used on Twitter; it does not display or republish any Twitter content. Twitter; it does not display or republish any Twitter content.

View File

@ -1,76 +0,0 @@
import unicodedata
from ftfy import chardata
import pathlib
from pkg_resources import resource_filename
# Unicode general category (a two-letter code such as 'Lu', 'Mn' or 'Cn') of
# every codepoint from U+0000 through U+10FFFF, indexed by codepoint number.
CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
# Location of the 'data' resource inside the 'wordfreq' package; the cached
# regex files are written here.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def func_to_regex(accept_func):
    """
    Given a function that returns True or False for a numerical codepoint,
    return a regex character class accepting the characters resulting in True.

    The result is a string of the form '[a-bc-d...]', with one 'start-end'
    pair per range (a single accepted codepoint is written as 'x-x').

    Ranges separated only by unassigned characters (category 'Cn') that the
    function rejects are merged for efficiency. A range always starts and
    ends on an accepted codepoint.

    Range endpoints that are syntactically special inside a character class
    (backslash, brackets, caret, hyphen) are backslash-escaped, so that the
    result is a valid class no matter what `accept_func` accepts.
    """
    def class_char(codepoint):
        # Render one endpoint of a range, escaping it if it would otherwise
        # be interpreted as character-class syntax.
        char = chr(codepoint)
        if char in '\\]^-[':
            return '\\' + char
        return char

    # in_range is True if the current codepoint might be in a range that
    # the regex will accept
    in_range = False
    ranges = []
    for codepoint, category in enumerate(CATEGORIES):
        if accept_func(codepoint):
            if in_range:
                # Extend the open range up to this accepted codepoint.
                ranges[-1][1] = codepoint
            else:
                ranges.append([codepoint, codepoint])
                in_range = True
        elif category != 'Cn':
            # A rejected, assigned character ends the range. Rejected
            # unassigned characters leave it open so it can be merged with
            # the next accepted codepoint.
            in_range = False

    return '[%s]' % ''.join(
        '%s-%s' % (class_char(start), class_char(end))
        for start, end in ranges
    )
def cache_regex_from_func(filename, func):
    """
    Generate a regex character class from `func`, a function that accepts a
    numerical codepoint and returns True or False, and cache it in the data
    path at `filename`.

    The file is written as UTF-8 regardless of the platform's default
    encoding, because the character class consists mostly of non-ASCII
    characters.
    """
    # Build the regex before opening the file, so that an error while
    # generating it does not leave a truncated cache file behind.
    pattern = func_to_regex(func)
    with (DATA_PATH / filename).open(mode='w', encoding='utf-8') as file:
        file.write(pattern)
def _is_emoji_codepoint(i):
    """
    Decide whether the numerical codepoint `i` is (likely) an emoji.

    A codepoint qualifies when ftfy's chardata table puts it in class '3'
    (Unicode 'So' characters, as future-proofed by ftfy), it is at or above
    U+2600 (which leaves out lower symbols such as ©), and it is not the
    replacement character U+FFFD.
    """
    in_symbol_class = chardata.CHAR_CLASS_STRING[i] == '3'
    if not in_symbol_class:
        return False
    return i >= 0x2600 and i != 0xfffd
def _is_non_punct_codepoint(i):
    """
    Decide whether the numerical codepoint `i` falls outside all of these
    major Unicode classes:

    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters

    Symbols, emoji included, are therefore treated like punctuation here;
    callers that want to accept emoji should add them separately.
    """
    major_class = CATEGORIES[i][0]
    return major_class not in ('P', 'S', 'Z', 'C')
def _is_combining_mark_codepoint(i):
    """
    Decide whether the numerical codepoint `i` is a combining mark, that is,
    whether its Unicode category begins with 'M'.
    """
    category = CATEGORIES[i]
    return category.startswith('M')
if __name__ == '__main__':
    # Regenerate each cached character-class file from its predicate.
    for cache_name, predicate in (
        ('emoji.txt', _is_emoji_codepoint),
        ('non_punct.txt', _is_non_punct_codepoint),
        ('combining_mark.txt', _is_combining_mark_codepoint),
    ):
        cache_regex_from_func(cache_name, predicate)

View File

@ -26,14 +26,14 @@ classifiers = [
current_dir = os.path.dirname(__file__) current_dir = os.path.dirname(__file__)
README_contents = open(os.path.join(current_dir, 'README.md')).read() README_contents = open(os.path.join(current_dir, 'README.md')).read()
doclines = README_contents.split("\n") doclines = README_contents.split("\n")
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes'] dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
if sys.version_info < (3, 4): if sys.version_info < (3, 4):
dependencies.append('pathlib') dependencies.append('pathlib')
setup( setup(
name="wordfreq", name="wordfreq",
version='1.0', version='1.1',
maintainer='Luminoso Technologies, Inc.', maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com', maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',

View File

@ -95,13 +95,17 @@ def test_failed_cB_conversion():
def test_tokenization(): def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the # We preserve apostrophes within words, so "can't" is a single word in the
# data # data
eq_(tokenize("can't", 'en'), ["can't"]) eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
# Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'),
['anything', 'is', 'possible', 'at', 'zombo.com'])
# Splits occur after symbols, and at splitting punctuation such as hyphens.
eq_(tokenize('😂test', 'en'), ['😂', 'test']) eq_(tokenize('😂test', 'en'), ['😂', 'test'])
# We do split at other punctuation, causing the word-combining rule to eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
# apply.
eq_(tokenize("can.t", 'en'), ['can', 't'])
def test_casefolding(): def test_casefolding():
@ -110,11 +114,11 @@ def test_casefolding():
def test_phrase_freq(): def test_phrase_freq():
plant = word_frequency("plan.t", 'en') ff = word_frequency("flip-flop", 'en')
assert_greater(plant, 0) assert_greater(ff, 0)
assert_almost_equal( assert_almost_equal(
1.0 / plant, 1.0 / ff,
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en') 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
) )
@ -134,8 +138,8 @@ def test_not_really_random():
def test_not_enough_ascii(): def test_not_enough_ascii():
random_ascii_words(lang='zh') random_ascii_words(lang='zh')
def test_ar():
def test_ar():
# Remove tatweels # Remove tatweels
eq_( eq_(
tokenize('متــــــــعب', 'ar'), tokenize('متــــــــعب', 'ar'),
@ -152,3 +156,16 @@ def test_ar():
tokenize('\ufefb', 'ar'), # An Arabic ligature... tokenize('\ufefb', 'ar'), # An Arabic ligature...
['\u0644\u0627'] # ...that is affected by NFKC normalization ['\u0644\u0627'] # ...that is affected by NFKC normalization
) )
def test_ideographic_fallback():
# Try tokenizing Chinese text -- it should remain stuck together.
eq_(tokenize('中国文字', 'zh'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.
ja_text = 'ひらがなカタカナromaji'
eq_(
tokenize(ja_text, 'en'),
['ひらがな', 'カタカナ', 'romaji']
)

View File

@ -1,14 +1,13 @@
from wordfreq.tokens import tokenize, simple_tokenize
from pkg_resources import resource_filename from pkg_resources import resource_filename
from functools import lru_cache from functools import lru_cache
import langcodes import langcodes
import msgpack import msgpack
import re
import gzip import gzip
import itertools import itertools
import pathlib import pathlib
import random import random
import logging import logging
import unicodedata
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000 CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def load_range(filename):
"""
Load a file from the data path.
"""
with (DATA_PATH / filename).open() as file:
return file.read()
EMOJI_RANGE = load_range('emoji.txt') # simple_tokenize is imported so that other things can import it from here.
NON_PUNCT_RANGE = load_range('non_punct.txt') # Suppress the pyflakes warning.
COMBINING_MARK_RANGE = load_range('combining_mark.txt') simple_tokenize = simple_tokenize
COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
def simple_tokenize(text):
"""
A simple tokenizer that can be applied to most languages.
It considers a word to be made of a sequence of 'token characters', an
overly inclusive range that includes letters, Han characters, emoji, and a
bunch of miscellaneous whatnot, but excludes most punctuation and
whitespace.
The single complication for the sake of English is that apostrophes are not
considered part of the token if they appear on the edge of the character
sequence, but they are if they appear internally. "cats'" is not a token,
but "cat's" is.
"""
return [token.casefold() for token in TOKEN_RE.findall(text)]
mecab_tokenize = None
def tokenize(text, lang):
"""
Tokenize this text in a way that's straightforward but appropriate for
the language.
So far, this means that Japanese is handled by mecab_tokenize, and
everything else is handled by simple_tokenize. Additionally, Arabic commas
and combining marks are removed.
Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data.
"""
if lang == 'ja':
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
if lang == 'ar':
text = standardize_arabic(text)
return simple_tokenize(text)
def standardize_arabic(text):
"""
Standardizes arabic text by removing combining marks and tatweels.
"""
return unicodedata.normalize(
'NFKC',
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
)
def read_cBpack(filename): def read_cBpack(filename):

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1 +0,0 @@
[̀-ͯ҃-҉֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ׇؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݊ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࡙࠭-࡛ࣤ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঁ-ঃ়-়া-্ৗ-ৗৢ-ৣਁ-ਃ਼-ੑੰ-ੱੵ-ઃ઼-઼ા-્ૢ-ૣଁ-ଃ଼-଼ା-ୗୢ-ୣஂ-ஂா-்ௗ-ௗఁ-ఃా-ౖౢ-ౣಂ-ಃ಼-಼ಾ-ೖೢ-ೣം-ഃാ-്ൗ-ൗൢ-ൣං-ඃ්-ෳั-ัิ-ฺ็-๎ັ-ັິ-ຼ່-ໍ༘-༙༵-༵༷-༹༷-༹༾-༿ཱ-྄྆-྇ྍ-ྼ࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፝-፟ᜒ-᜔ᜲ-᜴ᝒ-ᝓᝲ-ᝳ឴-៓៝-៝᠋-᠍ᢩ-ᢩᤠ-᤻ᦰ-ᧀᧈ-ᧉᨗ-ᨛᩕ-᩿ᬀ-ᬄ᬴-᭄᭫-᭳ᮀ-ᮂᮡ-ᮭ᯦-᯳ᰤ-᰷᳐-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿⃐-⃰⳯-⵿⳱-⵿ⷠ-〪ⷿ-゙〯-゚꙯-꙲ꙴ-꙽ꚟ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧꢀ-ꢁꢴ-꣄꣠-꣱ꤦ-꤭ꥇ-꥓ꦀ-ꦃ꦳-꧀ꨩ-ꨶꩃ-ꩃꩌ-ꩍꩻ-ꩻꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꫶ꯣ-ꯪ꯬-꯭ﬞ-ﬞ︀-️︠-𐇽︦-𐇽𐨁-𐨏𐨸-𐨿𑀀-𑀂𑀸-𑁆𑂀-𑂂𑂰-𑂺𑄀-𑄂𑄧-𑄴𑆀-𑆂𑆳-𑇀𑚫-𑚷𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠄀-󠇯]

View File

@ -1 +0,0 @@
[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯑⳥-⳪⺀-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖭅𛲜-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍖🀀-🃿🄍-🣿]

View File

@ -1 +0,0 @@
[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙա-և֑-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠭ࡀ-࡛ࢠ-ॣ०-९ॱ-ৱ৴-৹ਁ-૯ଁ-୯ୱ-௲ఁ-౾ಂ-൵ൺ-ෳก-ฺเ-๎๐-๙ກ-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-ྼ࿆-࿆က-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏᎠ-Ᏼᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛮ-᜴ᝀ-៓ៗ-ៗៜ-៹᠋-᠍᠐-᤻᥆-᧚ᨀ-ᨛᨠ-᪙ᪧ-ᪧᬀ-᭙᭫-᭳ᮀ-᯳ᰀ-᰷᱀-ᱽ᳐-᳔᳒-ᾼι-ιῂ-ῌῐ-Ίῠ-Ῥῲ-ῼ⁰-⁹ⁿ-₉ₐ-ₜ⃐-⃰ℂ-ℂℇ-ℇℊ----ℤΩ-Ωℨ---ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↉①-⒛⓪-⓿❶-➓Ⰰ-ⳤⳫ-ⳳ⳽-⳽ⴀ-ⵯ⵿-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼ぁ-゚ゝ-ゟァ-ヺー-ㆎ㆒-㆕ㆠ-ㆺㇰ-ㇿ㈠-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶵一-ꒌꓐ-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠰-꠵ꡀ-ꡳꢀ-꣄꣐-ꣷꣻ-꤭ꤰ-꥓ꥠ-꧀ꧏ-꧙ꨀ-꩙ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-ퟻ豈-ﬨשׁ-ﮱﯓ-ﴽﵐ-ﷻ︀-️︠-︦ﹰ-ﻼ0---zヲ-ᅵ𐀀-𐃺𐄇-𐄳𐅀-𐅸𐆊-𐆊𐇽-𐎝𐎠-𐏏𐏑-𐡕𐡘-𐤛𐤠-𐤹𐦀-𐩇𐩠-𐩾𐬀-𐬵𐭀-𑁆𑁒-𑂺𑃐-𑄿𑆀-𑇄𑇐-𒑢𓀀-𛀁𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍠-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞺻🄀-🄊𠀀-𪘀󠄀-󠇯]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,4 +1,5 @@
import MeCab import MeCab
import unicodedata
# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a # Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
@ -14,6 +15,7 @@ def mecab_tokenize(text):
contains the same table that the command-line version of MeCab would output. contains the same table that the command-line version of MeCab would output.
We find the tokens in the first column of this table. We find the tokens in the first column of this table.
""" """
text = unicodedata.normalize('NFKC', text.strip())
return [line.split('\t')[0] return [line.split('\t')[0]
for line in MECAB_ANALYZER.parse(text.strip()).split('\n') for line in MECAB_ANALYZER.parse(text).split('\n')
if line != '' and line != 'EOS'] if line != '' and line != 'EOS']

114
wordfreq/tokens.py Normal file
View File

@ -0,0 +1,114 @@
import regex
import unicodedata
# Pattern that matches one token at a time; `simple_tokenize` applies it with
# `findall`. It is compiled in verbose mode, so the explanation of its two
# alternatives lives inside the pattern itself. The WORD flag is what makes
# \B follow the Unicode word-segmentation rules, per the pattern's own notes.
TOKEN_RE = regex.compile(r"""
# Case 1: a special case for Chinese and Japanese
# -----------------------------------------------
# When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
# (\p{Script=Hiragana}), we allow a sequence of those characters to be
# glued together as a single token. Without this case, the standard rule
# (case 2) would make each character a separate token. This would be the
# correct behavior for word-wrapping, but a messy failure mode for NLP
# tokenization.
#
# It is, of course, better to use a tokenizer that is designed for Chinese
# or Japanese text. This is effectively a fallback for when the wrong
# tokenizer is used.
#
# This rule is listed first so that it takes precedence.
[\p{IsIdeo}\p{Script=Hiragana}]+ |
# Case 2: standard Unicode segmentation
# -------------------------------------
# The start of the token must be 'word-like', not punctuation or whitespace
# or various other things. However, we allow characters of category So
# (Symbol - Other) because many of these are emoji, which can convey
# meaning.
[\w\p{So}]
# The rest of the token matches characters that are not any sort of space
# (\S) and do not cause word breaks according to the Unicode word
# segmentation heuristic (\B).
(?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)

# Matches a single character that `remove_arabic_marks` deletes: any
# nonspacing mark (category Mn) or the Arabic tatweel.
ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text):
    """
    Split `text` into a list of case-folded tokens using TOKEN_RE, a
    straightforward, Unicode-aware token expression.

    The text is first normalized to NFC. Each match of TOKEN_RE then has
    apostrophes stripped from its ends and is case-folded.

    The expression mostly relies on the word boundary matching of the
    `regex` module, which implements the rules of Unicode Annex #29,
    including the refinement that splits words between apostrophes and
    vowels in order to separate tokens such as the French article «l'».
    On top of that, the expression is customized as follows:

    - Runs of Chinese or Japanese characters (specifically, Han ideograms
      and hiragana) are kept together, relatively untokenized, rather than
      being split into one token per character.

    - Only tokens that start with a word-like character, or with a
      miscellaneous symbol such as an emoji, are produced.

    - Every kind of space ends a token, even the "non-breaking" ones.
    """
    normalized = unicodedata.normalize('NFC', text)
    tokens = []
    for raw_token in TOKEN_RE.findall(normalized):
        tokens.append(raw_token.strip("'").casefold())
    return tokens
def remove_arabic_marks(text):
    """
    Return `text` with the decorations matched by ARABIC_MARK_RE deleted:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify a
      word.
    """
    undecorated = ARABIC_MARK_RE.sub('', text)
    return undecorated
# Placeholder for the Japanese tokenizer. `tokenize` replaces it with the
# real function from wordfreq.mecab the first time Japanese text is seen.
mecab_tokenize = None


def tokenize(text, lang):
    """
    Split `text` into tokens in a way that is relatively simple but suited
    to the language code `lang`.

    What happens depends on `lang`:

    - 'ja': the text is handed to `mecab_tokenize` from wordfreq.mecab,
      which is imported on first use, and its result is returned as is.
    - 'ar': the text is normalized with NFKC and passed through
      `remove_arabic_marks` (dropping combining marks and tatweels) before
      being tokenized like any other language.
    - Anything else, Chinese included: the text goes to `simple_tokenize`,
      whose regex mostly implements the Word Segmentation section of
      Unicode Annex #29 and case-folds the tokens. Chinese is presumed to
      already be tokenized. (Sorry. It's hard.)

    Chinese or Japanese text that is not tagged as Japanese is only split at
    punctuation and script boundaries, which yields untokenized globs of
    characters that probably represent many words. See `simple_tokenize`
    for details.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    global mecab_tokenize

    if lang == 'ja':
        # Import lazily, so the MeCab dependency is only needed when
        # Japanese text is actually tokenized.
        if mecab_tokenize is None:
            from wordfreq.mecab import mecab_tokenize
        return mecab_tokenize(text)

    if lang == 'ar':
        compatibility_form = unicodedata.normalize('NFKC', text)
        text = remove_arabic_marks(compatibility_form)

    return simple_tokenize(text)

View File

@ -6,7 +6,7 @@ def test_tokenizer_1():
text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."' text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
tokens = [ tokens = [
'this', 'is', 'a', 'test', 'she', 'said', 'this', 'is', 'a', 'test', 'she', 'said',
'and', "i'll", 'bet', "y'all", '3', '50', 'that', 'and', "i'll", 'bet', "y", "all", '3.50', 'that',
'it', "won't", 'fail', 'it', "won't", 'fail',
] ]
result = cld2_surface_tokenizer(text) result = cld2_surface_tokenizer(text)

View File

@ -0,0 +1,20 @@
from wordfreq_builder.word_counts import URL_RE
from nose.tools import eq_
def check_url(url):
    """
    Assert that URL_RE matches at the start of `url` and that the match
    covers the entire string.
    """
    found = URL_RE.match(url)
    assert found
    matched_span = found.span()
    eq_(matched_span, (0, len(url)))
def test_url_re():
    """
    Generate one `check_url` case per sample URL, then verify that an ftp://
    address is not matched by URL_RE.
    """
    fully_matched_urls = (
        # URLs like this are all over the Arabic Wikipedia. Here's one with
        # the student ID blanked out.
        'http://www.ju.edu.jo/alumnicard/0000000.aspx',
        'https://example.com/űnicode.html',
        'http://☃.net',
    )
    for sample in fully_matched_urls:
        yield check_url, sample

    assert not URL_RE.match('ftp://127.0.0.1')

View File

@ -123,7 +123,6 @@ def google_books_deps(dirname_in):
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
languages): languages):
lines = [] lines = []
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,

View File

@ -1,7 +1,6 @@
from html.entities import name2codepoint from wordfreq import tokenize
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
from ftfy.fixes import unescape_html from ftfy.fixes import unescape_html
import re import regex
import pycld2 import pycld2
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
@ -11,19 +10,22 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
'\x0e-\x1f', '\x0e-\x1f',
'\x7f-\x9f', '\x7f-\x9f',
'\ud800-\udfff', '\ud800-\udfff',
'\ufdd0-\ufdef' '\ufdd0-\ufdef',
'\N{HANGUL FILLER}',
'\N{HANGUL CHOSEONG FILLER}',
'\N{HANGUL JUNGSEONG FILLER}'
] + ] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)] [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
) )
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE) CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE)) TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+') TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
def cld2_surface_tokenizer(text): def cld2_surface_tokenizer(text):
""" """
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
""" """
text = unescape_html(text) text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text) text = TWITTER_HANDLE_RE.sub('', text)
@ -35,7 +37,7 @@ def cld2_surface_tokenizer(text):
def cld2_detect_language(text): def cld2_detect_language(text):
""" """
Uses CLD2 to detect the language Uses CLD2 to detect the language.
""" """
# Format of pycld2.detect: # Format of pycld2.detect:
# (Confident in result: bool, # (Confident in result: bool,
@ -45,16 +47,19 @@ def cld2_detect_language(text):
# Language code: str # Language code: str
# Percent of text in this language: float # Percent of text in this language: float
# Confidence score: float)) # Confidence score: float))
text = CLD2_BAD_CHARS_RE.sub('', text) text = CLD2_BAD_CHARS_RE.sub('', text)
return pycld2.detect(text)[2][0][1] return pycld2.detect(text)[2][0][1]
def tokenize_twitter(in_filename, out_prefix, tokenizer): def tokenize_twitter(in_filename, out_prefix, tokenizer):
""" """
Process a file by running it through the given tokenizer, sorting the Process a file by running it through the Twitter-specific tokenizer,
results by the language of each line, and inserting newlines which uses cld2 for language detection, and removes Twitter handles
to mark the token boundaries. and t.co URLs.
Produces output files that are separated by language, with newlines
between the tokens.
""" """
out_files = {} out_files = {}
with open(in_filename, encoding='utf-8') as in_file: with open(in_filename, encoding='utf-8') as in_file:

View File

@ -6,6 +6,12 @@ import math
import csv import csv
import msgpack import msgpack
import gzip import gzip
import regex
# Match common cases of URLs: the schema http:// or https:// followed by
# non-whitespace characters.
URL_RE = regex.compile(r'https?://(?:\S)+')
def count_tokens(filename): def count_tokens(filename):
@ -13,11 +19,13 @@ def count_tokens(filename):
Count tokens that appear in a file, running each line through our Count tokens that appear in a file, running each line through our
simple tokenizer. simple tokenizer.
Unicode errors in the input data will become token boundaries. URLs will be skipped, and Unicode errors will become separate tokens
containing '�' (U+FFFD, the Unicode replacement character).
""" """
counts = defaultdict(int) counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile: with open(filename, encoding='utf-8', errors='replace') as infile:
for line in infile: for line in infile:
line = URL_RE.sub('', line.strip())
for token in simple_tokenize(line): for token in simple_tokenize(line):
counts[token] += 1 counts[token] += 1