Skip to content

Commit

Permalink
preprocess: add support for all UDPipe models
Browse files Browse the repository at this point in the history
In addition to the code changes, all UDPipe models were added to the
server.
  • Loading branch information
robertcv committed Sep 3, 2018
1 parent 256efb0 commit 9d28c03
Showing 1 changed file with 21 additions and 9 deletions.
30 changes: 21 additions & 9 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ def language(self, value):
self.normalizer = stem.SnowballStemmer(self.language.lower())


def language_to_name(language):
    """Convert a display language name into a normalized model-name prefix.

    The name is lower-cased, spaces are removed and the ``ud`` marker is
    appended, e.g. ``'Ancient greek proiel'`` -> ``'ancientgreekproielud'``.
    The result is intended to be compared against separator-free model
    file names.

    Args:
        language: Human-readable language name, e.g. ``'English'``.

    Returns:
        The lower-case, space-free name followed by ``'ud'``.
    """
    return language.lower().replace(' ', '') + 'ud'


def file_to_name(file):
    """Strip the ``-`` and ``_`` separators from a model file name.

    Args:
        file: Model file name, e.g. ``'english-ud-2.0-170801.udpipe'``.

    Returns:
        The file name with every ``-`` and ``_`` removed; all other
        characters (including dots and the extension) are kept.
    """
    return file.replace('-', '').replace('_', '')


def file_to_language(file):
    """Derive a human-readable language name from a model file name.

    Everything before the ``ud`` marker is taken as the language part;
    ``-`` and ``_`` are turned into spaces and the result is capitalized,
    e.g. ``'ancient_greek-proiel-ud-2.0-170801.udpipe'`` ->
    ``'Ancient greek proiel'``.

    Args:
        file: Model file name.

    Returns:
        The capitalized language name. If the file name contains no ``ud``
        marker at all, the whole name is used instead of being truncated.
    """
    import re

    # Prefer the marker as a separate token (``-ud-``, ``_ud_``, ...), so an
    # "ud" that is part of the language name itself does not cut the
    # name short.
    marker = re.search(r'[-_]ud(?=[-_.]|$)', file)
    if marker:
        end = marker.start()
    else:
        # Fall back to the first "ud" and drop the single character in front
        # of it. str.find returns -1 when "ud" is absent, which must not be
        # used as a slice bound, so keep the full name in that case.
        index = file.find('ud')
        end = index - 1 if index > 0 else len(file)
    return file[:end].replace('-', ' ').replace('_', ' ').capitalize()


class UDPipeModels:
server_url = "http://file.biolab.si/files/udpipe/"

Expand All @@ -99,30 +112,29 @@ def __init__(self):
self._supported_languages = []

def __getitem__(self, language):
file_name = self._find_file(language)
file_name = self._find_file(language_to_name(language))
return self.localfiles.localpath_download(file_name)

def _find_file(self, language):
return list(filter(lambda f: f.startswith(language),
map(lambda f: f[0],
self.serverfiles.listfiles())))[0]
return next(filter(lambda f: file_to_name(f).startswith(language),
map(lambda f: f[0], self.serverfiles.listfiles())))

@property
def supported_languages(self):
self._supported_languages = [f[0].split('-')[0]
for f in self.serverfiles.listfiles()]
self._supported_languages = list(map(lambda f: file_to_language(f[0]),
self.serverfiles.listfiles()))
return self._supported_languages


class UDPipeLemmatizer(BaseNormalizer):
name = 'UDPipe Lemmatizer'
str_format = '{self.name} ({self.language})'
models = UDPipeModels()
supported_languages = [l.capitalize() for l in models.supported_languages]
supported_languages = models.supported_languages

def __init__(self, language='English'):
self._language = language
self.model = udpipe.Model.load(self.models[self._language.lower()])
self.model = udpipe.Model.load(self.models[self._language])
self.output_format = udpipe.OutputFormat.newOutputFormat('epe')
self.use_tokenizer = False

Expand Down Expand Up @@ -160,4 +172,4 @@ def language(self):
@language.setter
def language(self, value):
self._language = value
self.model = udpipe.Model.load(self.models[self._language.lower()])
self.model = udpipe.Model.load(self.models[self._language])

0 comments on commit 9d28c03

Please sign in to comment.