add infer_batch_size to AllenNLPProcessor #8

Merged (6 commits) on Apr 6, 2021
59 changes: 37 additions & 22 deletions forte_wrapper/allennlp/allennlp_processors.py
@@ -14,7 +14,8 @@

 import itertools
 import logging
-from typing import List, Dict
+from typing import Any, Dict, Iterable, Iterator, List
+import more_itertools

 from allennlp.predictors import Predictor
@@ -116,6 +117,8 @@ def default_configs(cls):
           multiple models are loaded, cuda devices are assigned in a
           round robin fashion. E.g. [0, -1] -> first model uses gpu 0
           but second model uses cpu.
+        - infer_batch_size: maximum number of sentences passed in as a
+          batch to the model's predict function. A value <= 0 means no limit.
         """
         config = super().default_configs()
         config.update({
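
Because the new key defaults to 0, existing pipelines keep the old single-batch behavior unless they opt in. A quick check of the merged defaults (hypothetical snippet; the import path is an assumption):

# Hypothetical check of the merged defaults; the import path is an assumption.
from forte_wrapper.allennlp.allennlp_processors import AllenNLPProcessor

config = AllenNLPProcessor.default_configs()
print(config["infer_batch_size"])  # 0 -> no limit, i.e. one batch per pack
print(config["cuda_devices"])      # [-1] -> run on CPU
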
@@ -126,33 +129,45 @@
             'stanford_url': MODEL2URL['stanford'],
             'srl_url': MODEL2URL['srl'],
             'universal_url': MODEL2URL['universal'],
-            'cuda_devices': [-1]
+            'cuda_devices': [-1],
+            'infer_batch_size': 0
         })
         return config

     def _process(self, input_pack: DataPack):
         # handle existing entries
         self._process_existing_entries(input_pack)
-        sentences = input_pack.get(Sentence)
-        inputs = [{"sentence": s.text} for s in sentences]
-        results = {k: p.predict_batch_json(inputs)
-                   for k, p in self.predictor.items()}
-        for i, sent in enumerate(sentences):
-            result = {}
-            for key in self.predictor:
-                if key == 'srl':
-                    result.update(
-                        parse_allennlp_srl_results(results[key][i]["verbs"])
-                    )
-                else:
-                    result.update(results[key][i])
-            if "tokenize" in self.configs.processors:
-                # creating new tokens and dependencies
-                tokens = self._create_tokens(input_pack, sent, result)
-                if "depparse" in self.configs.processors:
-                    self._create_dependencies(input_pack, tokens, result)
-                if 'srl' in self.configs.processors:
-                    self._create_srl(input_pack, tokens, result)
+        batch_size: int = self.configs['infer_batch_size']
+        batches: Iterator[Iterable[Sentence]]
+        if batch_size <= 0:
+            batches = iter([input_pack.get(Sentence)])
+        else:
+            batches = more_itertools.chunked(
+                input_pack.get(Sentence), batch_size)
+        for sentences in batches:
+            inputs: List[Dict[str, str]] = [{"sentence": s.text}
+                                            for s in sentences]
+            results: Dict[str, List[Dict[str, Any]]] = {
+                k: p.predict_batch_json(inputs)
+                for k, p in self.predictor.items()
+            }
+            for i, sent in enumerate(sentences):
+                result: Dict[str, List[str]] = {}
+                for key in self.predictor:
+                    if key == 'srl':
+                        result.update(parse_allennlp_srl_results(
+                            results[key][i]["verbs"]
+                        ))
+                    else:
+                        result.update(results[key][i])
+                if "tokenize" in self.configs.processors:
+                    # creating new tokens and dependencies
+                    tokens = self._create_tokens(input_pack, sent, result)
+                    if "depparse" in self.configs.processors:
+                        self._create_dependencies(input_pack, tokens, result)
+                    if 'srl' in self.configs.processors:
+                        self._create_srl(input_pack, tokens, result)

     def _process_existing_entries(self, input_pack):
         tokens_exist = any(True for _ in input_pack.get(Token))
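
The heart of the change is the batching logic above: when infer_batch_size <= 0, the whole sentence iterator is wrapped as a single batch (the old behavior); otherwise more_itertools.chunked splits it into lists of at most batch_size sentences. A minimal standalone sketch of that behavior, using illustrative names that are not part of the PR:

# Standalone sketch of the batching logic (illustrative names, not from the PR).
from typing import Iterable, Iterator

import more_itertools

def make_batches(items: Iterable[str], batch_size: int) -> Iterator[Iterable[str]]:
    # batch_size <= 0 means "no limit": everything goes out as one batch,
    # mirroring the iter([input_pack.get(Sentence)]) branch in _process.
    if batch_size <= 0:
        return iter([items])
    # more_itertools.chunked lazily yields lists of at most batch_size items.
    return more_itertools.chunked(items, batch_size)

sentences = ["s1", "s2", "s3", "s4", "s5"]
print([list(b) for b in make_batches(sentences, 2)])
# [['s1', 's2'], ['s3', 's4'], ['s5']]
print([list(b) for b in make_batches(sentences, 0)])
# [['s1', 's2', 's3', 's4', 's5']]

Each chunk is fully materialized before it is handed to predict_batch_json, so the size of any single model call is bounded by batch_size rather than by the document length.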
3 changes: 2 additions & 1 deletion setup.py
@@ -22,7 +22,8 @@
     platforms='any',

     install_requires=[
-        'forte @ git+https://[email protected]/asyml/forte.git'
+        'forte @ git+https://[email protected]/asyml/forte.git',
+        'more-itertools>=8.0.0'
     ],
     extras_require={
         'nltk': ['nltk==3.4.5'],
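
For completeness, a hypothetical usage sketch (not part of this PR) showing how the new option would be set when the processor is added to a Forte pipeline; the import paths and the reader choice are assumptions that may vary across versions:

# Hypothetical usage sketch; import paths and the reader are assumptions.
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte_wrapper.allennlp.allennlp_processors import AllenNLPProcessor

pipeline = Pipeline()
pipeline.set_reader(StringReader())
pipeline.add(AllenNLPProcessor(), config={
    "processors": "tokenize,depparse",
    # Cap each predict_batch_json call at 32 sentences; 0 restores the
    # previous behavior of sending all sentences in a single batch.
    "infer_batch_size": 32,
})
pipeline.initialize()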