Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH][WIP] Add UDPipe Lemmatizer #367

Merged
merged 7 commits into from
Sep 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 95 additions & 1 deletion orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import os
import json
import ufal.udpipe as udpipe
import serverfiles
from nltk import stem
from Orange.misc.environ import data_dir


from orangecontrib.text.misc import wait_nltk_data

__all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer',
'SnowballStemmer', 'DictionaryLookupNormalizer']
'SnowballStemmer', 'DictionaryLookupNormalizer',
'UDPipeLemmatizer']


class BaseNormalizer:
Expand Down Expand Up @@ -79,3 +86,90 @@ def language(self):
def language(self, value):
    """Set the stemming language and rebuild the underlying NLTK stemmer.

    NOTE(review): the ``@language.setter`` decorator appears to be just
    above this view — confirm against the full file.
    """
    self._language = value
    # Recreate the stemmer so subsequent calls use the new language;
    # NLTK expects lowercase language names.
    self.normalizer = stem.SnowballStemmer(self.language.lower())


def language_to_name(language):
    """Convert a display language name to its model-name prefix.

    E.g. ``'Slovenian sst'`` -> ``'sloveniansstud'``.
    """
    compact = language.replace(' ', '').lower()
    return '{}ud'.format(compact)


def file_to_name(file):
    """Normalize a model file name by stripping dashes and underscores."""
    for separator in ('-', '_'):
        file = file.replace(separator, '')
    return file


def file_to_language(file):
    """Derive a display language name from a model file name.

    Cuts the name off just before the first ``'ud'`` marker, turns
    separators into spaces and capitalizes the result, e.g.
    ``'slovenian-sst-ud-2.0-170801.udpipe'`` -> ``'Slovenian sst'``.
    """
    prefix = file[:file.find('ud') - 1]
    prefix = prefix.replace('-', ' ').replace('_', ' ')
    return prefix.capitalize()


class UDPipeModels:
    """Accessor for UDPipe model files hosted on the Orange file server.

    Models are downloaded on demand and cached in the user's local data
    directory via :mod:`serverfiles`.
    """
    server_url = "http://file.biolab.si/files/udpipe/"

    def __init__(self):
        self.local_data = os.path.join(data_dir(versioned=False), 'udpipe/')
        self.serverfiles = serverfiles.ServerFiles(self.server_url)
        self.localfiles = serverfiles.LocalFiles(
            self.local_data, serverfiles=self.serverfiles)
        self._supported_languages = []

    def __getitem__(self, language):
        """Return the local path of the model for *language*, downloading
        it from the server when it is not cached yet."""
        model_file = self._find_file(language_to_name(language))
        return self.localfiles.localpath_download(model_file)

    def _find_file(self, language):
        # Server entries are tuples whose first element is the file name;
        # pick the first whose normalized name starts with the prefix.
        names = (entry[0] for entry in self.serverfiles.listfiles())
        return next(name for name in names
                    if file_to_name(name).startswith(language))

    @property
    def supported_languages(self):
        """Display names of all models currently listed on the server.

        NOTE(review): each access performs a server request.
        """
        self._supported_languages = [file_to_language(entry[0])
                                     for entry in self.serverfiles.listfiles()]
        return self._supported_languages


class UDPipeLemmatizer(BaseNormalizer):
    """Normalizer that lemmatizes tokens with pre-trained UDPipe models.

    NOTE(review): ``supported_languages`` is evaluated at class-definition
    time and queries the model server — importing this module therefore
    needs network access; confirm this is acceptable.
    """
    name = 'UDPipe Lemmatizer'
    str_format = '{self.name} ({self.language})'
    models = UDPipeModels()
    supported_languages = models.supported_languages

    def __init__(self, language='English'):
        self._language = language
        # Model is loaded lazily on first use; loading may trigger a
        # (slow) download of the model file.
        self.model = None
        # 'epe' output is JSON, so lemmas can be read with json.loads
        # instead of parsing CoNLL-U.
        self.output_format = udpipe.OutputFormat.newOutputFormat('epe')
        # When True, callers should use normalize_doc() so UDPipe does
        # its own tokenization of whole documents.
        self.use_tokenizer = False

    def load_model(self):
        """Load the model for the current language (no-op if loaded)."""
        if self.model is None:
            self.model = udpipe.Model.load(self.models[self._language])

    def normalize(self, token):
        """Return the lemma of a single *token* string."""
        self.load_model()
        sentence = udpipe.Sentence()
        sentence.addWord(token)
        self.model.tag(sentence, self.model.DEFAULT)
        output = self.output_format.writeSentence(sentence)
        # Single word in, single node out: its lemma is the result.
        return json.loads(output)['nodes'][0]['properties']['lemma']

    def normalize_doc(self, document):
        """Tokenize *document* with UDPipe and return its lemmas as a list."""
        self.load_model()
        tokens = []
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        tokenizer.setText(document)
        error = udpipe.ProcessingError()
        sentence = udpipe.Sentence()
        while tokenizer.nextSentence(sentence, error):
            self.model.tag(sentence, self.model.DEFAULT)
            output = self.output_format.writeSentence(sentence)
            # A fresh Sentence per iteration — presumably required so
            # words don't accumulate across sentences; confirm with the
            # udpipe API docs.
            sentence = udpipe.Sentence()
            tokens.extend([t['properties']['lemma']
                           for t in json.loads(output)['nodes']])
        return tokens

    @property
    def language(self):
        return self._language

    @language.setter
    def language(self, value):
        self._language = value
        # Drop the cached model so the next call loads the new language.
        self.model = None
12 changes: 10 additions & 2 deletions orangecontrib/text/preprocess/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def process_document(self, document):
tokens = BASE_TOKENIZER.tokenize(document)

if self.normalizer:
tokens = self.normalizer(tokens)
if getattr(self.normalizer, 'use_tokenizer', False):
tokens = self.normalizer.normalize_doc(document)
else:
tokens = self.normalizer(tokens)

for filter in self.filters:
tokens = filter(tokens)
Expand Down Expand Up @@ -131,7 +134,10 @@ def __str__(self):
return '\n'.join(['{}: {}'.format(name, value) for name, value in self.report()])

def report(self):
return (
if getattr(self.normalizer, 'use_tokenizer', False):
self.tokenizer = \
'UDPipe Tokenizer ({})'.format(self.normalizer.language)
rep = (
('Transformers', ', '.join(str(tr) for tr in self.transformers)
if self.transformers else None),
('Tokenizer', str(self.tokenizer) if self.tokenizer else None),
Expand All @@ -144,6 +150,8 @@ def report(self):
else None),
('Pos tagger', str(self.pos_tagger) if self.pos_tagger else None),
)
del self.tokenizer
return rep


base_preprocessor = Preprocessor(transformers=BASE_TRANSFORMERS,
Expand Down
39 changes: 39 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tempfile
import unittest
import os.path

import itertools
import nltk
Expand All @@ -9,6 +10,8 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.preprocess import Preprocessor
from orangecontrib.text.preprocess.normalize import file_to_language, \
file_to_name, language_to_name, UDPipeModels


def counted(f):
Expand Down Expand Up @@ -168,6 +171,20 @@ def test_snowball(self):
token = 'voudrais'
self.assertEqual(stemmer(token), nltk.SnowballStemmer(language='french').stem(token))

def test_udpipe(self):
    """Test udpipe token lemmatization"""
    lemmatizer = preprocess.UDPipeLemmatizer()
    lemmatizer.language = 'Slovenian'
    # 'sem' is a form of the verb 'biti' (to be).
    self.assertEqual(lemmatizer('sem'), 'biti')

def test_udpipe_doc(self):
    """Test udpipe lemmatization with its own tokenization """
    lemmatizer = preprocess.UDPipeLemmatizer()
    lemmatizer.language = 'Slovenian'
    lemmatizer.use_tokenizer = True
    expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
    self.assertListEqual(
        lemmatizer.normalize_doc('Gori na gori hiša gori'), expected)

def test_porter_with_bad_input(self):
stemmer = preprocess.PorterStemmer()
self.assertRaises(TypeError, stemmer, 10)
Expand All @@ -177,6 +194,28 @@ def test_lookup_normalize(self):
self.assertEqual(dln.normalize('aka'), 'also known as')


class UDPipeModelsTests(unittest.TestCase):
    def test_label_transform(self):
        """Test helper functions for label transformation"""
        file = 'slovenian-sst-ud-2.0-170801.udpipe'
        self.assertEqual(file_to_language(file), 'Slovenian sst')
        self.assertEqual(file_to_name(file), 'sloveniansstud2.0170801.udpipe')
        self.assertEqual(language_to_name('Slovenian sst'), 'sloveniansstud')

    def test_udpipe_model(self):
        """Test udpipe models listing and loading from the server."""
        models = UDPipeModels()
        self.assertIn('Slovenian', models.supported_languages)
        # Don't pin the exact model count (the original asserted 68):
        # it changes whenever models are published on the server, which
        # made the test brittle. A non-empty listing is what matters.
        self.assertGreater(len(models.supported_languages), 0)

        # Requesting a language downloads its model into local_data.
        local_file = os.path.join(models.local_data,
                                  'slovenian-ud-2.0-170801.udpipe')
        model = models['Slovenian']
        self.assertEqual(model, local_file)
        self.assertTrue(os.path.isfile(local_file))


class FilteringTests(unittest.TestCase):

def setUp(self):
Expand Down
40 changes: 37 additions & 3 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,21 +252,41 @@ class NormalizationModule(SingleMethodModule):
preprocess.PorterStemmer,
preprocess.SnowballStemmer,
preprocess.WordNetLemmatizer,
preprocess.UDPipeLemmatizer,
]

SNOWBALL = 1
UDPIPE = 3

snowball_language = settings.Setting('English')
udpipe_language = settings.Setting('English')
udpipe_tokenizer = settings.Setting(False)

def __init__(self, master):
    """Build the per-method option widgets (Snowball and UDPipe rows)."""
    super().__init__(master)

    # --- Snowball stemmer row: language selector ---
    snowball_label = gui.label(self, self, 'Language:')
    snowball_label.setAlignment(Qt.AlignRight)
    self.method_layout.addWidget(snowball_label, self.SNOWBALL, 1)
    snowball_combo = widgets.ComboBox(
        self, 'snowball_language',
        items=preprocess.SnowballStemmer.supported_languages)
    snowball_combo.currentIndexChanged.connect(self.change_language)
    self.method_layout.addWidget(snowball_combo, self.SNOWBALL, 2)
    self.methods[self.SNOWBALL].language = self.snowball_language

    # --- UDPipe lemmatizer row: tokenizer checkbox + language selector ---
    self.udpipe_tokenizer_box = QCheckBox(
        "UDPipe tokenizer", self, checked=self.udpipe_tokenizer)
    self.udpipe_tokenizer_box.stateChanged.connect(self.change_tokenizer)
    self.method_layout.addWidget(self.udpipe_tokenizer_box, self.UDPIPE, 1)
    udpipe_label = gui.label(self, self, 'Language:')
    udpipe_label.setAlignment(Qt.AlignRight)
    self.method_layout.addWidget(udpipe_label, self.UDPIPE, 2)
    udpipe_combo = widgets.ComboBox(
        self, 'udpipe_language',
        items=preprocess.UDPipeLemmatizer.supported_languages)
    udpipe_combo.currentIndexChanged.connect(self.change_language)
    self.method_layout.addWidget(udpipe_combo, self.UDPIPE, 3)
    # Restore persisted settings onto the method instance.
    self.methods[self.UDPIPE].language = self.udpipe_language
    self.methods[self.UDPIPE].use_tokenizer = self.udpipe_tokenizer

def change_language(self):
if self.methods[self.SNOWBALL].language != self.snowball_language:
Expand All @@ -275,6 +295,20 @@ def change_language(self):
if self.method_index == self.SNOWBALL:
self.change_signal.emit()

if self.methods[self.UDPIPE].language != self.udpipe_language:
self.methods[self.UDPIPE].language = self.udpipe_language

if self.method_index == self.UDPIPE:
self.change_signal.emit()

def change_tokenizer(self):
    """Propagate the UDPipe-tokenizer checkbox into the setting and,
    when it actually changed, into the method; re-emit only if UDPipe
    is the active method."""
    checked = self.udpipe_tokenizer_box.isChecked()
    self.udpipe_tokenizer = checked
    if self.methods[self.UDPIPE].use_tokenizer != checked:
        self.methods[self.UDPIPE].use_tokenizer = checked

        if self.method_index == self.UDPIPE:
            self.change_signal.emit()


class TransformationModule(MultipleMethodModule):
attribute = 'transformers'
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ odfpy>=1.3.5
docx2txt>=0.6
lxml
biopython # Enables Pubmed widget.
ufal.udpipe