Skip to content

Commit

Permalink
POS: Adapt to async NLTK data download
Browse files Browse the repository at this point in the history
  • Loading branch information
nikicc committed Sep 1, 2017
1 parent cc978e8 commit f95c48b
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 23 deletions.
10 changes: 4 additions & 6 deletions orangecontrib/text/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
This module provides POS taggers, such as `AveragedPerceptronTagger`, that can be used for POS-tagging an English corpus::
>>> from orangecontrib.text.corpus import Corpus
>>> from orangecontrib.text.tag import pos_tagger
>>> from orangecontrib.text.tag import AveragedPerceptronTagger
>>> corpus = Corpus.from_file('deerwester.tab')
>>> tagged_corpus = pos_tagger.tag_corpus(corpus)
>>> tagger = AveragedPerceptronTagger()
>>> tagged_corpus = tagger.tag_corpus(corpus)
>>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly
['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS']
>>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents
['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS']
"""

from .pos import POSTagger, StanfordPOSTagger, taggers

pos_tagger = taggers[0]
from .pos import *
24 changes: 18 additions & 6 deletions orangecontrib/text/tag/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import numpy as np

from orangecontrib.text.util import chunkable
from orangecontrib.text.misc import wait_nltk_data

nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger'])

__all__ = ['POSTagger', 'StanfordPOSTagger', 'AveragedPerceptronTagger', 'MaxEntTagger']


class POSTagger:
Expand Down Expand Up @@ -62,8 +64,18 @@ def __str__(self):
return "{} (model: {})".format(self.name, self._stanford_model)


taggers = [
POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
'Treebank POS Tagger (MaxEnt)'),
]
class AveragedPerceptronTagger(POSTagger):
    """POS tagger that wraps a freshly constructed ``nltk.PerceptronTagger``.

    The constructor is decorated with ``wait_nltk_data``; presumably this
    delays construction until the asynchronous NLTK data download has
    finished -- confirm against ``orangecontrib.text.misc``.
    """

    # Human-readable label; also handed to ``POSTagger.__init__`` as the name.
    name = 'Averaged Perceptron Tagger'

    @wait_nltk_data
    def __init__(self):
        # Build the NLTK model here rather than at import time, so that
        # importing this module does not itself instantiate the tagger.
        perceptron = nltk.PerceptronTagger()
        super().__init__(perceptron, self.name)


class MaxEntTagger(POSTagger):
    """POS tagger that wraps the pickled MaxEnt Treebank model loaded via NLTK.

    The constructor is decorated with ``wait_nltk_data``; presumably this
    delays construction until the asynchronous NLTK data download has
    finished -- confirm against ``orangecontrib.text.misc``.
    """

    # Human-readable label; also handed to ``POSTagger.__init__`` as the name.
    name = 'Treebank POS Tagger (MaxEnt)'

    @wait_nltk_data
    def __init__(self):
        # The model is read from the NLTK data directory when the instance is
        # created, not when this module is imported.
        super().__init__(
            nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
            self.name,
        )
13 changes: 8 additions & 5 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.tag import pos_tagger
from orangecontrib.text.tag import AveragedPerceptronTagger


class CorpusTests(unittest.TestCase):
def setUp(self):
self.pos_tagger = AveragedPerceptronTagger()

def test_init_preserve_shape_of_empty_x(self):
c = Corpus.from_file('book-excerpts')
d = c.domain
Expand Down Expand Up @@ -66,7 +69,7 @@ def test_extend(self):
c2 = c[:5]
self.assertEqual(len(c2), 5)
n = len(c)
pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c)
self.assertIsNot(c._tokens, None)
self.assertIsNot(c.pos_tags, None)
self.assertIs(c2._tokens, None)
Expand All @@ -77,8 +80,8 @@ def test_extend(self):
self.assertIs(c._tokens, None)
self.assertIs(c.pos_tags, None)

pos_tagger.tag_corpus(c)
pos_tagger.tag_corpus(c2)
self.pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c2)
c.extend(c2)
self.assertEqual(len(c), n + 10)
self.assertEqual(len(c._tokens), n + 10)
Expand Down Expand Up @@ -330,7 +333,7 @@ def test_ngrams_iter(self):
self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0])
self.assertIn('-'.join(ngram), list(c.ngrams_iterator(join_with='-'))[0])

pos_tagger.tag_corpus(c)
self.pos_tagger.tag_corpus(c)
c.ngram_range = (1, 1)
for doc in c.ngrams_iterator(join_with='_', include_postags=True):
for token in doc:
Expand Down
4 changes: 2 additions & 2 deletions orangecontrib/text/tests/test_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class POSTaggerTests(unittest.TestCase):
def test_POSTagger(self):
corpus = Corpus.from_file('deerwester')
tagger = tag.pos_tagger
tagger = tag.AveragedPerceptronTagger()
result = tagger.tag_corpus(corpus)
self.assertTrue(hasattr(result, 'pos_tags'))
# for token in itertools.chain(*result.tokens):
Expand All @@ -33,7 +33,7 @@ def test_str(self):

def test_preprocess(self):
pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer('\w+'),
pos_tagger=tag.taggers[0])
pos_tagger=tag.AveragedPerceptronTagger())
corpus = Corpus.from_file('deerwester')
pr(corpus, inplace=True)
self.assertIsNotNone(corpus.pos_tags)
14 changes: 10 additions & 4 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.tag import StanfordPOSTagger
from orangecontrib.text.tag import taggers
from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \
MaxEntTagger
from orangecontrib.text.widgets.utils import widgets, ResourceLoader
from orangecontrib.text.widgets.utils.concurrent import asynchronous

Expand Down Expand Up @@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule):
attribute = 'pos_tagger'
enabled = settings.Setting(False)

STANFORD = len(taggers)
stanford = settings.SettingProvider(ResourceLoader)

methods = taggers + [StanfordPOSTagger]
methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger]
STANFORD = 2

initialize_methods = False

def setup_method_layout(self):
super().setup_method_layout()
# initialize all methods except StanfordPOSTagger
# cannot be done in superclass due to StanfordPOSTagger
for i, method in enumerate(self.methods[:self.STANFORD]):
self.methods[i] = method()

self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)',
provider_format='Java file (*.jar)',
model_button_label='Model', provider_button_label='Tagger')
Expand Down

0 comments on commit f95c48b

Please sign in to comment.