From a1e7c74e457dfc1b49e6b962f348bbb3f9c791c4 Mon Sep 17 00:00:00 2001 From: Dian Li Date: Mon, 5 Apr 2021 18:17:01 -0400 Subject: [PATCH 1/5] add infer_batch_size to AllenNLPProcessor --- forte_wrapper/allennlp/allennlp_processors.py | 55 +++++++++++-------- setup.py | 1 + 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index c49198b..445a9e2 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -14,7 +14,8 @@ import itertools import logging -from typing import List, Dict +import more_itertools +from typing import Dict, List from allennlp.predictors import Predictor @@ -116,6 +117,8 @@ def default_configs(cls): multiple models are loaded, cuda devices are assigned in a round robin fashion. E.g. [0, -1] -> first model uses gpu 0 but second model uses cpu. + - infer_batch_size: batch size used for model inference. + A value <= 0 means no limit. """ config = super().default_configs() config.update({ @@ -126,33 +129,41 @@ def default_configs(cls): 'stanford_url': MODEL2URL['stanford'], 'srl_url': MODEL2URL['srl'], 'universal_url': MODEL2URL['universal'], - 'cuda_devices': [-1] + 'cuda_devices': [-1], + 'infer_batch_size': 0 }) return config def _process(self, input_pack: DataPack): # handle existing entries self._process_existing_entries(input_pack) - sentences = [_ for _ in input_pack.get(Sentence)] - inputs = [{"sentence": s.text} for s in sentences] - results = {k: p.predict_batch_json(inputs) - for k, p in self.predictor.items()} - for i in range(len(sentences)): - result = {} - for key in self.predictor.keys(): - if key == 'srl': - result.update( - parse_allennlp_srl_results(results[key][i]["verbs"]) - ) - else: - result.update(results[key][i]) - if "tokenize" in self.configs.processors: - # creating new tokens and dependencies - tokens = self._create_tokens(input_pack, sentences[i], result) - if "depparse" in self.configs.processors: - self._create_dependencies(input_pack, tokens, result) - if 'srl' in self.configs.processors: - self._create_srl(input_pack, tokens, result) + + batch_size = self.configs['infer_batch_size'] + if batch_size <= 0: + batches = [input_pack.get(Sentence)] + else: + batches = more_itertools.chunked(input_pack.get(Sentence), + batch_size, strict=False) + for sentences in batches: + inputs = [{"sentence": s.text} for s in sentences] + results = {k: p.predict_batch_json(inputs) + for k, p in self.predictor.items()} + for i, sentence in enumerate(sentences): + result = {} + for key in self.predictor.keys(): + if key == 'srl': + result.update(parse_allennlp_srl_results( + results[key][i]["verbs"] + )) + else: + result.update(results[key][i]) + if "tokenize" in self.configs.processors: + # creating new tokens and dependencies + tokens = self._create_tokens(input_pack, sentence, result) + if "depparse" in self.configs.processors: + self._create_dependencies(input_pack, tokens, result) + if 'srl' in self.configs.processors: + self._create_srl(input_pack, tokens, result) def _process_existing_entries(self, input_pack): tokens_exist = any(True for _ in input_pack.get(Token)) diff --git a/setup.py b/setup.py index 95d9ada..186a2a9 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ install_requires=[ 'forte @ git+ssh://git@github.com/asyml/forte.git' + 'more-itertools>=8.0.0' ], extras_require={ 'nltk': ['nltk==3.4.5'], From ce4b18003ecae49403f8eeb95254ff238db5fbb1 Mon Sep 17 00:00:00 2001 From: Dian Li Date: Tue, 6 Apr 2021 11:39:06 -0400 Subject: [PATCH 2/5] type hint --- forte_wrapper/allennlp/allennlp_processors.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 445a9e2..3d1a11d 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -15,7 +15,7 @@ import itertools import logging import more_itertools -from typing import Dict, List +from typing import Any, Dict, Iterable, Iterator, List from allennlp.predictors import Predictor @@ -138,18 +138,22 @@ def _process(self, input_pack: DataPack): # handle existing entries self._process_existing_entries(input_pack) - batch_size = self.configs['infer_batch_size'] + batch_size: int = self.configs['infer_batch_size'] + batches: Iterator[Iterable[Sentence]] if batch_size <= 0: - batches = [input_pack.get(Sentence)] + batches = iter([input_pack.get(Sentence)]) else: - batches = more_itertools.chunked(input_pack.get(Sentence), - batch_size, strict=False) + batches = more_itertools.chunked( + input_pack.get(Sentence), batch_size) for sentences in batches: - inputs = [{"sentence": s.text} for s in sentences] - results = {k: p.predict_batch_json(inputs) - for k, p in self.predictor.items()} + inputs: List[Dict[str, str]] = [{"sentence": s.text} + for s in sentences] + results: Dict[str, List[Dict[str, Any]]] = { + k: p.predict_batch_json(inputs) + for k, p in self.predictor.items() + } for i, sentence in enumerate(sentences): - result = {} + result: Dict[str, List[str]] = {} for key in self.predictor.keys(): if key == 'srl': result.update(parse_allennlp_srl_results( From 0b4230ec134cdcc5a8591650b79a3e2c7e88d2b8 Mon Sep 17 00:00:00 2001 From: Dian Li Date: Tue, 6 Apr 2021 11:43:43 -0400 Subject: [PATCH 3/5] fix build --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 186a2a9..f462c13 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ platforms='any', install_requires=[ - 'forte @ git+ssh://git@github.com/asyml/forte.git' + 'forte @ git+ssh://git@github.com/asyml/forte.git', 'more-itertools>=8.0.0' ], extras_require={ From 1ab008d7450ef2569aff06f7e546111f3bc68aa5 Mon Sep 17 00:00:00 2001 From: Dian Li Date: Tue, 6 Apr 2021 16:07:34 -0400 Subject: [PATCH 4/5] fix lint --- forte_wrapper/allennlp/allennlp_processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index 402a375..d2bab1b 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -14,8 +14,8 @@ import itertools import logging -import more_itertools from typing import Any, Dict, Iterable, Iterator, List +import more_itertools from allennlp.predictors import Predictor @@ -154,7 +154,7 @@ def _process(self, input_pack: DataPack): } for i, sent in enumerate(sentences): result: Dict[str, List[str]] = {} - for key in self.predictor.keys(): + for key in self.predictor: if key == 'srl': result.update(parse_allennlp_srl_results( results[key][i]["verbs"] From 3843bd1f0e3dab8a3335a71dce84593c2f45afd7 Mon Sep 17 00:00:00 2001 From: Dian Li Date: Tue, 6 Apr 2021 16:57:51 -0400 Subject: [PATCH 5/5] update doc --- forte_wrapper/allennlp/allennlp_processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte_wrapper/allennlp/allennlp_processors.py b/forte_wrapper/allennlp/allennlp_processors.py index d2bab1b..53b87fa 100644 --- a/forte_wrapper/allennlp/allennlp_processors.py +++ b/forte_wrapper/allennlp/allennlp_processors.py @@ -117,8 +117,8 @@ def default_configs(cls): multiple models are loaded, cuda devices are assigned in a round robin fashion. E.g. [0, -1] -> first model uses gpu 0 but second model uses cpu. - - infer_batch_size: batch size used for model inference. - A value <= 0 means no limit. + - infer_batch_size: maximum number of sentences passed in as a + batch to model's predict function. A value <= 0 means no limit. """ config = super().default_configs() config.update({