handle multi-word stems correctly

This commit is contained in:
Rob Speer 2015-04-29 13:45:53 -04:00
parent 60a7c4d1ec
commit 70c9e99ee4

View File

@ -10,12 +10,21 @@ def rosette_tokenizer(text):
# I'm aware this doesn't do the right things with multi-word stems.
# Wordfreq doesn't either. And wordfreq isn't designed to look up
# multiple words anyway.
return [stem + '|' + lang for (stem, pos, span) in analysis]
tokens = []
for (stem, pos, span) in analysis:
for subtoken in stem.split(' '):
tokens.append(subtoken + '|' + lang)
return tokens
def rosette_surface_tokenizer(text):
    """
    Tokenize `text` into surface-form tokens tagged with a language code.

    The text is passed to the Rosette analyzer, which returns a sequence of
    `(stem, pos, span)` entries together with a `lang` value. For each entry,
    the slice of the original text delimited by `span[0]` and `span[1]` is
    taken as the surface form. That slice is split on single spaces, so an
    entry covering several words yields one token per word instead of one
    token containing spaces.

    Returns a list of strings of the form `<subtoken>|<lang>`.
    """
    analysis, lang = ROSETTE.rosette.analyze(text)
    tokens = []
    for (stem, pos, span) in analysis:
        # Use the text as it was written, not the analyzer's stem.
        surface_text = text[span[0]:span[1]]
        # A single span may cover multiple space-separated words; emit each
        # word as its own token.
        for subtoken in surface_text.split(' '):
            tokens.append(subtoken + '|' + lang)
    return tokens
def treebank_surface_tokenizer(text):