Skip to content

Commit

Permalink
POS: Adapt to async NLTK data download
Browse files Browse the repository at this point in the history
  • Loading branch information
nikicc committed Aug 4, 2017
1 parent cc978e8 commit c7c771f
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 16 deletions.
10 changes: 4 additions & 6 deletions orangecontrib/text/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
This module provides a default `pos_tagger` that can be used for POSTagging an English corpus::
>>> from orangecontrib.text.corpus import Corpus
>>> from orangecontrib.text.tag import pos_tagger
>>> from orangecontrib.text.tag import AveragedPerceptronTagger
>>> corpus = Corpus.from_file('deerwester.tab')
>>> tagged_corpus = pos_tagger.tag_corpus(corpus)
>>> tagger = AveragedPerceptronTagger()
>>> tagged_corpus = tagger.tag_corpus(corpus)
>>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly
['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS']
>>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents
['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS']
"""

from .pos import POSTagger, StanfordPOSTagger, taggers

pos_tagger = taggers[0]
from .pos import *
24 changes: 18 additions & 6 deletions orangecontrib/text/tag/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import numpy as np

from orangecontrib.text.util import chunkable
from orangecontrib.text.misc import wait_nltk_data

nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger'])

__all__ = ['AveragedPerceptronTagger', 'MaxEntTagger', 'StanfordPOSTagger']


class POSTagger:
Expand Down Expand Up @@ -62,8 +64,18 @@ def __str__(self):
return "{} (model: {})".format(self.name, self._stanford_model)


taggers = [
POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
'Treebank POS Tagger (MaxEnt)'),
]
class AveragedPerceptronTagger(POSTagger):
    """POS tagger backed by NLTK's ``PerceptronTagger``.

    The underlying NLTK tagger is built when an instance is created, not
    when this module is imported. The constructor is wrapped in
    ``wait_nltk_data``; presumably that defers construction until the
    required NLTK data is available -- confirm in
    ``orangecontrib.text.misc``.
    """

    # Display name, also forwarded to ``POSTagger.__init__``.
    name = 'Averaged Perceptron Tagger'

    @wait_nltk_data
    def __init__(self):
        perceptron = nltk.PerceptronTagger()
        super().__init__(perceptron, self.name)


class MaxEntTagger(POSTagger):
    """POS tagger backed by NLTK's pickled MaxEnt Treebank model.

    The model is loaded through ``nltk.data.load`` when an instance is
    created, not when this module is imported. The constructor is wrapped
    in ``wait_nltk_data``; presumably that defers loading until the
    required NLTK data is available -- confirm in
    ``orangecontrib.text.misc``.
    """

    # Display name, also forwarded to ``POSTagger.__init__``.
    name = 'Treebank POS Tagger (MaxEnt)'

    @wait_nltk_data
    def __init__(self):
        # Resource path resolved by NLTK's data loader.
        model_resource = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        maxent_model = nltk.data.load(model_resource)
        super().__init__(maxent_model, self.name)
14 changes: 10 additions & 4 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.tag import StanfordPOSTagger
from orangecontrib.text.tag import taggers
from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \
MaxEntTagger
from orangecontrib.text.widgets.utils import widgets, ResourceLoader
from orangecontrib.text.widgets.utils.concurrent import asynchronous

Expand Down Expand Up @@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule):
attribute = 'pos_tagger'
enabled = settings.Setting(False)

STANFORD = len(taggers)
stanford = settings.SettingProvider(ResourceLoader)

methods = taggers + [StanfordPOSTagger]
methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger]
STANFORD = 2

initialize_methods = False

def setup_method_layout(self):
super().setup_method_layout()
# initialize all methods except StanfordPOSTagger
# cannot be done in superclass due to StanfordPOSTagger
for i, method in enumerate(self.methods[:self.STANFORD]):
self.methods[i] = method()

self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)',
provider_format='Java file (*.jar)',
model_button_label='Model', provider_button_label='Tagger')
Expand Down

0 comments on commit c7c771f

Please sign in to comment.