From c7c771f9591c7dd1046e68cbea5aa5aaac7c5094 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 14:25:05 +0200 Subject: [PATCH] POS: Adapt to async NLTK data download --- orangecontrib/text/tag/__init__.py | 10 ++++----- orangecontrib/text/tag/pos.py | 24 ++++++++++++++++------ orangecontrib/text/widgets/owpreprocess.py | 14 +++++++++---- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/orangecontrib/text/tag/__init__.py b/orangecontrib/text/tag/__init__.py index 12f4c854e..ffb43ab5e 100644 --- a/orangecontrib/text/tag/__init__.py +++ b/orangecontrib/text/tag/__init__.py @@ -5,17 +5,15 @@ This module provides a default `pos_tagger` that can be used for POSTagging an English corpus:: >>> from orangecontrib.text.corpus import Corpus - >>> from orangecontrib.text.tag import pos_tagger + >>> from orangecontrib.text.tag import AveragedPerceptronTagger >>> corpus = Corpus.from_file('deerwester.tab') - >>> tagged_corpus = pos_tagger.tag_corpus(corpus) + >>> tagger = AveragedPerceptronTagger() + >>> tagged_corpus = tagger.tag_corpus(corpus) >>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly ['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS'] >>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents ['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS'] - """ -from .pos import POSTagger, StanfordPOSTagger, taggers - -pos_tagger = taggers[0] +from .pos import * diff --git a/orangecontrib/text/tag/pos.py b/orangecontrib/text/tag/pos.py index e126783ee..a804bfdd1 100644 --- a/orangecontrib/text/tag/pos.py +++ b/orangecontrib/text/tag/pos.py @@ -2,8 +2,10 @@ import numpy as np from orangecontrib.text.util import chunkable +from orangecontrib.text.misc import wait_nltk_data -nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger']) + +__all__ = ['AveragedPerceptronTagger', 'MaxEntTagger', 
'StanfordPOSTagger'] class POSTagger: @@ -62,8 +64,18 @@ def __str__(self): return "{} (model: {})".format(self.name, self._stanford_model) -taggers = [ - POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'), - POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'), - 'Treebank POS Tagger (MaxEnt)'), -] +class AveragedPerceptronTagger(POSTagger): + name = 'Averaged Perceptron Tagger' + + @wait_nltk_data + def __init__(self): + super().__init__(nltk.PerceptronTagger(), self.name) + + +class MaxEntTagger(POSTagger): + name = 'Treebank POS Tagger (MaxEnt)' + + @wait_nltk_data + def __init__(self): + tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle') + super().__init__(tagger, self.name) diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index f38adf05b..6334b1003 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -12,8 +12,8 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus from orangecontrib.text.misc import nltk_data_dir -from orangecontrib.text.tag import StanfordPOSTagger -from orangecontrib.text.tag import taggers +from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \ + MaxEntTagger from orangecontrib.text.widgets.utils import widgets, ResourceLoader from orangecontrib.text.widgets.utils.concurrent import asynchronous @@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule): attribute = 'pos_tagger' enabled = settings.Setting(False) - STANFORD = len(taggers) stanford = settings.SettingProvider(ResourceLoader) - methods = taggers + [StanfordPOSTagger] + methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger] + STANFORD = 2 + initialize_methods = False def setup_method_layout(self): super().setup_method_layout() + # initialize all methods except StanfordPOSTagger + # cannot be done in superclass due to 
StanfordPOSTagger + for i, method in enumerate(self.methods[:self.STANFORD]): + self.methods[i] = method() + self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)', provider_format='Java file (*.jar)', model_button_label='Model', provider_button_label='Tagger')