add infer_batch_size to AllenNLPProcessor #8

Merged (6 commits) on Apr 6, 2021
59 changes: 37 additions & 22 deletions forte_wrapper/allennlp/allennlp_processors.py
@@ -14,7 +14,8 @@

 import itertools
 import logging
-from typing import List, Dict
+from typing import Any, Dict, Iterable, Iterator, List
+import more_itertools

 from allennlp.predictors import Predictor
@@ -116,6 +117,8 @@ def default_configs(cls):
           multiple models are loaded, cuda devices are assigned in a
           round robin fashion. E.g. [0, -1] -> first model uses gpu 0
           but second model uses cpu.
+        - infer_batch_size: maximum number of sentences passed in as a
+          batch to the model's predict function. A value <= 0 means no limit.
         """
         config = super().default_configs()
         config.update({
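
Because the new key defaults to 0, existing pipelines keep the old single-batch behavior unless they opt in. A quick check of the merged defaults (hypothetical snippet; the import path is an assumption):

# Hypothetical check of the merged defaults; the import path is an assumption.
from forte_wrapper.allennlp.allennlp_processors import AllenNLPProcessor

config = AllenNLPProcessor.default_configs()
print(config["infer_batch_size"])  # 0 -> no limit, i.e. one batch per pack
print(config["cuda_devices"])      # [-1] -> run on CPU
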
@@ -126,33 +129,45 @@
             'stanford_url': MODEL2URL['stanford'],
             'srl_url': MODEL2URL['srl'],
             'universal_url': MODEL2URL['universal'],
-            'cuda_devices': [-1]
+            'cuda_devices': [-1],
+            'infer_batch_size': 0
         })
         return config

     def _process(self, input_pack: DataPack):
         # handle existing entries
         self._process_existing_entries(input_pack)
-        sentences = input_pack.get(Sentence)
-        inputs = [{"sentence": s.text} for s in sentences]
-        results = {k: p.predict_batch_json(inputs)
-                   for k, p in self.predictor.items()}
-        for i, sent in enumerate(sentences):
-            result = {}
-            for key in self.predictor:
-                if key == 'srl':
-                    result.update(
-                        parse_allennlp_srl_results(results[key][i]["verbs"])
-                    )
-                else:
-                    result.update(results[key][i])
-            if "tokenize" in self.configs.processors:
-                # creating new tokens and dependencies
-                tokens = self._create_tokens(input_pack, sent, result)
-                if "depparse" in self.configs.processors:
-                    self._create_dependencies(input_pack, tokens, result)
-                if 'srl' in self.configs.processors:
-                    self._create_srl(input_pack, tokens, result)
+        batch_size: int = self.configs['infer_batch_size']
+        batches: Iterator[Iterable[Sentence]]
+        if batch_size <= 0:
+            batches = iter([input_pack.get(Sentence)])
+        else:
+            batches = more_itertools.chunked(
+                input_pack.get(Sentence), batch_size)
+        for sentences in batches:
+            inputs: List[Dict[str, str]] = [{"sentence": s.text}
+                                            for s in sentences]
+            results: Dict[str, List[Dict[str, Any]]] = {
+                k: p.predict_batch_json(inputs)
+                for k, p in self.predictor.items()
+            }
+            for i, sent in enumerate(sentences):
+                result: Dict[str, List[str]] = {}
+                for key in self.predictor:
+                    if key == 'srl':
+                        result.update(parse_allennlp_srl_results(
+                            results[key][i]["verbs"]
+                        ))
+                    else:
+                        result.update(results[key][i])
+                if "tokenize" in self.configs.processors:
+                    # creating new tokens and dependencies
+                    tokens = self._create_tokens(input_pack, sent, result)
+                    if "depparse" in self.configs.processors:
+                        self._create_dependencies(input_pack, tokens, result)
+                    if 'srl' in self.configs.processors:
+                        self._create_srl(input_pack, tokens, result)

     def _process_existing_entries(self, input_pack):
         tokens_exist = any(True for _ in input_pack.get(Token))
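
The heart of the change is the batching logic above: when infer_batch_size <= 0, the whole sentence iterator is wrapped as a single batch (the old behavior); otherwise more_itertools.chunked splits it into lists of at most batch_size sentences. A minimal standalone sketch of that behavior, using illustrative names that are not part of the PR:

# Standalone sketch of the batching logic (illustrative names, not from the PR).
from typing import Iterable, Iterator

import more_itertools

def make_batches(items: Iterable[str], batch_size: int) -> Iterator[Iterable[str]]:
    # batch_size <= 0 means "no limit": everything goes out as one batch,
    # mirroring the iter([input_pack.get(Sentence)]) branch in _process.
    if batch_size <= 0:
        return iter([items])
    # more_itertools.chunked lazily yields lists of at most batch_size items.
    return more_itertools.chunked(items, batch_size)

sentences = ["s1", "s2", "s3", "s4", "s5"]
print([list(b) for b in make_batches(sentences, 2)])
# [['s1', 's2'], ['s3', 's4'], ['s5']]
print([list(b) for b in make_batches(sentences, 0)])
# [['s1', 's2', 's3', 's4', 's5']]

Each chunk is fully materialized before it is handed to predict_batch_json, so the size of any single model call is bounded by batch_size rather than by the document length.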
3 changes: 2 additions & 1 deletion setup.py
@@ -22,7 +22,8 @@
     platforms='any',

     install_requires=[
-        'forte @ git+https://[email protected]/asyml/forte.git'
+        'forte @ git+https://[email protected]/asyml/forte.git',
+        'more-itertools>=8.0.0'
     ],
     extras_require={
         'nltk': ['nltk==3.4.5'],
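
For completeness, a hypothetical usage sketch (not part of this PR) showing how the new option would be set when the processor is added to a Forte pipeline; the import paths and the reader choice are assumptions that may vary across versions:

# Hypothetical usage sketch; import paths and the reader are assumptions.
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte_wrapper.allennlp.allennlp_processors import AllenNLPProcessor

pipeline = Pipeline()
pipeline.set_reader(StringReader())
pipeline.add(AllenNLPProcessor(), config={
    "processors": "tokenize,depparse",
    # Cap each predict_batch_json call at 32 sentences; 0 restores the
    # previous behavior of sending all sentences in a single batch.
    "infer_batch_size": 32,
})
pipeline.initialize()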