From f95c48b93ddaf6b489725ad7f648ce3e95f253c5 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 4 Aug 2017 14:25:05 +0200 Subject: [PATCH] POS: Adapt to async NLTK data download --- orangecontrib/text/tag/__init__.py | 10 ++++----- orangecontrib/text/tag/pos.py | 24 ++++++++++++++++------ orangecontrib/text/tests/test_corpus.py | 13 +++++++----- orangecontrib/text/tests/test_tags.py | 4 ++-- orangecontrib/text/widgets/owpreprocess.py | 14 +++++++++---- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/orangecontrib/text/tag/__init__.py b/orangecontrib/text/tag/__init__.py index 12f4c854e..ffb43ab5e 100644 --- a/orangecontrib/text/tag/__init__.py +++ b/orangecontrib/text/tag/__init__.py @@ -5,17 +5,15 @@ This module provides a default `pos_tagger` that can be used for POSTagging an English corpus:: >>> from orangecontrib.text.corpus import Corpus - >>> from orangecontrib.text.tag import pos_tagger + >>> from orangecontrib.text.tag import AveragedPerceptronTagger >>> corpus = Corpus.from_file('deerwester.tab') - >>> tagged_corpus = pos_tagger.tag_corpus(corpus) + >>> tagger = AveragedPerceptronTagger() + >>> tagged_corpus = tagger.tag_corpus(corpus) >>> tagged_corpus.pos_tags[0] # you can use `pos_tags` attribute to access tags directly ['JJ', 'NN', 'NN', 'IN', 'NN', 'NN', 'NN', 'NNS'] >>> next(tagged_corpus.ngrams_iterator(include_postags=True)) # or `ngrams_iterator` to iterate over documents ['human_JJ', 'machine_NN', 'interface_NN', 'for_IN', 'lab_NN', 'abc_NN', 'computer_NN', 'applications_NNS'] - """ -from .pos import POSTagger, StanfordPOSTagger, taggers - -pos_tagger = taggers[0] +from .pos import * diff --git a/orangecontrib/text/tag/pos.py b/orangecontrib/text/tag/pos.py index e126783ee..ff39e2343 100644 --- a/orangecontrib/text/tag/pos.py +++ b/orangecontrib/text/tag/pos.py @@ -2,8 +2,10 @@ import numpy as np from orangecontrib.text.util import chunkable +from orangecontrib.text.misc import wait_nltk_data 
-nltk.download(['averaged_perceptron_tagger', 'maxent_treebank_pos_tagger']) + +__all__ = ['POSTagger', 'StanfordPOSTagger', 'AveragedPerceptronTagger', 'MaxEntTagger'] class POSTagger: @@ -62,8 +64,18 @@ def __str__(self): return "{} (model: {})".format(self.name, self._stanford_model) -taggers = [ - POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'), - POSTagger(nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'), - 'Treebank POS Tagger (MaxEnt)'), -] +class AveragedPerceptronTagger(POSTagger): + name = 'Averaged Perceptron Tagger' + + @wait_nltk_data + def __init__(self): + super().__init__(nltk.PerceptronTagger(), self.name) + + +class MaxEntTagger(POSTagger): + name = 'Treebank POS Tagger (MaxEnt)' + + @wait_nltk_data + def __init__(self): + tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle') + super().__init__(tagger, self.name) diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py index b9fc48086..55576059a 100644 --- a/orangecontrib/text/tests/test_corpus.py +++ b/orangecontrib/text/tests/test_corpus.py @@ -10,10 +10,13 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus -from orangecontrib.text.tag import pos_tagger +from orangecontrib.text.tag import AveragedPerceptronTagger class CorpusTests(unittest.TestCase): + def setUp(self): + self.pos_tagger = AveragedPerceptronTagger() + def test_init_preserve_shape_of_empty_x(self): c = Corpus.from_file('book-excerpts') d = c.domain @@ -66,7 +69,7 @@ def test_extend(self): c2 = c[:5] self.assertEqual(len(c2), 5) n = len(c) - pos_tagger.tag_corpus(c) + self.pos_tagger.tag_corpus(c) self.assertIsNot(c._tokens, None) self.assertIsNot(c.pos_tags, None) self.assertIs(c2._tokens, None) @@ -77,8 +80,8 @@ def test_extend(self): self.assertIs(c._tokens, None) self.assertIs(c.pos_tags, None) - pos_tagger.tag_corpus(c) - pos_tagger.tag_corpus(c2) + self.pos_tagger.tag_corpus(c) + 
self.pos_tagger.tag_corpus(c2) c.extend(c2) self.assertEqual(len(c), n + 10) self.assertEqual(len(c._tokens), n + 10) @@ -330,7 +333,7 @@ def test_ngrams_iter(self): self.assertIn(ngram, list(c.ngrams_iterator(join_with=None))[0]) self.assertIn('-'.join(ngram), list(c.ngrams_iterator(join_with='-'))[0]) - pos_tagger.tag_corpus(c) + self.pos_tagger.tag_corpus(c) c.ngram_range = (1, 1) for doc in c.ngrams_iterator(join_with='_', include_postags=True): for token in doc: diff --git a/orangecontrib/text/tests/test_tags.py b/orangecontrib/text/tests/test_tags.py index 34d272e9d..1300bf764 100644 --- a/orangecontrib/text/tests/test_tags.py +++ b/orangecontrib/text/tests/test_tags.py @@ -10,7 +10,7 @@ class POSTaggerTests(unittest.TestCase): def test_POSTagger(self): corpus = Corpus.from_file('deerwester') - tagger = tag.pos_tagger + tagger = tag.AveragedPerceptronTagger() result = tagger.tag_corpus(corpus) self.assertTrue(hasattr(result, 'pos_tags')) # for token in itertools.chain(*result.tokens): @@ -33,7 +33,7 @@ def test_str(self): def test_preprocess(self): pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer('\w+'), - pos_tagger=tag.taggers[0]) + pos_tagger=tag.AveragedPerceptronTagger()) corpus = Corpus.from_file('deerwester') pr(corpus, inplace=True) self.assertIsNotNone(corpus.pos_tags) diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index f38adf05b..6334b1003 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -12,8 +12,8 @@ from orangecontrib.text import preprocess from orangecontrib.text.corpus import Corpus from orangecontrib.text.misc import nltk_data_dir -from orangecontrib.text.tag import StanfordPOSTagger -from orangecontrib.text.tag import taggers +from orangecontrib.text.tag import StanfordPOSTagger, AveragedPerceptronTagger, \ + MaxEntTagger from orangecontrib.text.widgets.utils import widgets, ResourceLoader from 
orangecontrib.text.widgets.utils.concurrent import asynchronous @@ -453,14 +453,20 @@ class POSTaggingModule(SingleMethodModule): attribute = 'pos_tagger' enabled = settings.Setting(False) - STANFORD = len(taggers) stanford = settings.SettingProvider(ResourceLoader) - methods = taggers + [StanfordPOSTagger] + methods = [AveragedPerceptronTagger, MaxEntTagger, StanfordPOSTagger] + STANFORD = 2 + initialize_methods = False def setup_method_layout(self): super().setup_method_layout() + # initialize all methods except StanfordPOSTagger + # cannot be done in superclass due to StanfordPOSTagger + for i, method in enumerate(self.methods[:self.STANFORD]): + self.methods[i] = method() + self.stanford = ResourceLoader(widget=self.master, model_format='Stanford model (*.model *.tagger)', provider_format='Java file (*.jar)', model_button_label='Model', provider_button_label='Tagger')