-
Notifications
You must be signed in to change notification settings - Fork 25
/
input_example.py
128 lines (99 loc) · 3.49 KB
/
input_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
from typing import List, Optional, Any, Dict, Union
from torch.utils.data.dataset import Dataset
@dataclass
class EntityType:
    """
    An entity type in a dataset.

    Equality is the dataclass-generated field comparison (``short`` and
    ``natural``); the hash depends on ``short`` only, so equal instances
    always hash alike.
    """
    short: Optional[str] = None    # short identifier; the only field used for hashing
    natural: Optional[str] = None  # string to use in input/output sentences

    # Defined explicitly: @dataclass with the default eq=True would otherwise
    # set __hash__ to None and make instances unusable as dict keys / set items.
    def __hash__(self) -> int:
        return hash(self.short)
@dataclass
class RelationType:
    """
    A relation type in a dataset.

    Equality is the dataclass-generated field comparison (``short`` and
    ``natural``); the hash depends on ``short`` only, so equal instances
    always hash alike.
    """
    short: Optional[str] = None    # short identifier; the only field used for hashing
    natural: Optional[str] = None  # string to use in input/output sentences

    # Defined explicitly: @dataclass with the default eq=True would otherwise
    # set __hash__ to None and make instances unusable as dict keys / set items.
    def __hash__(self) -> int:
        return hash(self.short)
@dataclass
class Entity:
    """
    An entity in a training/test example.

    Equality is the dataclass-generated comparison over all four fields; the
    hash is computed from ``(id, start, end)`` and does not include ``type``.
    """
    start: int                         # start index in the sentence
    end: int                           # end index in the sentence
    type: Optional[EntityType] = None  # entity type
    id: Optional[int] = None           # id in the current training/test example

    def to_tuple(self) -> tuple:
        """
        Return the entity as ``(type.natural, start, end)``.

        Raises AttributeError when ``type`` is None, since the natural name
        is read from the entity type.
        """
        return self.type.natural, self.start, self.end

    # Defined explicitly: @dataclass with the default eq=True would otherwise
    # set __hash__ to None and make instances unusable as dict keys / set items.
    def __hash__(self) -> int:
        return hash((self.id, self.start, self.end))
@dataclass
class Relation:
    """
    An (asymmetric) relation in a training/test example.

    All three fields are required; no explicit ``__hash__`` is defined here.
    """
    type: RelationType  # relation type
    head: Entity        # head of the relation
    tail: Entity        # tail of the relation

    def to_tuple(self) -> tuple:
        """
        Return the relation as ``(type.natural, head.to_tuple(), tail.to_tuple())``.
        """
        return self.type.natural, self.head.to_tuple(), self.tail.to_tuple()
@dataclass
class Intent:
    """
    The intent of an utterance.

    Equality is the dataclass-generated field comparison (``short`` and
    ``natural``); the hash depends on ``short`` only, so equal instances
    always hash alike.
    """
    short: Optional[str] = None    # short identifier; the only field used for hashing
    natural: Optional[str] = None  # natural-language form of the intent

    # Defined explicitly: @dataclass with the default eq=True would otherwise
    # set __hash__ to None and make instances unusable as dict keys / set items.
    def __hash__(self) -> int:
        return hash(self.short)
@dataclass
class InputExample:
    """
    A single training/test example.

    Only ``id`` and ``tokens`` are required. Every other field defaults to
    None and is grouped below by the task it is used for.
    """
    id: str                             # unique id in the dataset
    tokens: List[str]                   # list of tokens (words)
    dataset: Optional[Dataset] = None   # dataset this example belongs to

    # entity-relation extraction
    entities: Optional[List[Entity]] = None     # list of entities
    relations: Optional[List[Relation]] = None  # list of relations

    # intent of the utterance
    intent: Optional[Intent] = None

    # event extraction
    triggers: Optional[List[Entity]] = None     # list of event triggers

    # SRL
    sentence_level_entities: Optional[List[Entity]] = None

    # coreference resolution
    document_id: Optional[str] = None           # the id of the document this example belongs to
    chunk_id: Optional[int] = None              # position in the list of chunks
    offset: Optional[int] = None                # offset of this example in the document
    groups: Optional[List[List[Entity]]] = None  # groups of entities

    # DST
    belief_state: Optional[Union[Dict[str, Any], str]] = None
    utterance_tokens: Optional[str] = None
@dataclass
class CorefDocument:
    """
    A document for the coreference resolution task.

    It has several input examples corresponding to chunks of the document.
    All fields are required.
    """
    id: str                     # unique id in the dataset
    tokens: List[str]           # list of tokens (words)
    # list of chunks for this document (the offset is an attribute of the InputExample)
    chunks: List[InputExample]
    # list of the centers of the chunks (useful to find the chunk with largest context)
    chunk_centers: List[int]
    groups: List[List[Entity]]  # coreference groups
@dataclass
class InputFeatures:
    """
    A single set of features of data.

    Property names are the same names as the corresponding inputs to a model.
    ``input_ids`` and ``attention_mask`` are required; ``label_ids`` is
    optional and defaults to None.
    """
    input_ids: List[int]
    attention_mask: List[int]
    label_ids: Optional[List[int]] = None