Skip to content

Commit

Permalink
Modified drugprot.py to align with BRAT format
Browse files (browse the repository at this point in the history)
* split the document-id prefix off the entity ids and relation ids stored in metadata in the example-to-document methods

* adjusted the document-to-example methods to match (they re-attach the document-id prefix)

* adjusted the related test methods to align with the new format
  • Loading branch information
kai-car committed Aug 12, 2024
1 parent 81779cb commit ea1a6c8
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 46 deletions.
57 changes: 39 additions & 18 deletions dataset_builders/pie/drugprot/drugprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,24 +41,31 @@ def example2drugprot(example: Dict[str, Any]) -> DrugprotDocument:
id=example["document_id"],
metadata=metadata,
)

for span in example["entities"]:
labeled_span = LabeledSpan(
start=span["offset"][0],
end=span["offset"][1],
label=span["type"],
)
document.entities.append(labeled_span)
document.metadata["entity_ids"].append(span["id"])
id2labeled_span[span["id"]] = labeled_span
entity_id = span["id"].split("_")[1]
document.metadata["entity_ids"].append(entity_id)
id2labeled_span[entity_id] = labeled_span

for relation in example["relations"]:
arg1_id = relation["arg1_id"].split("_")[1]
arg2_id = relation["arg2_id"].split("_")[1]
document.relations.append(
BinaryRelation(
head=id2labeled_span[relation["arg1_id"]],
tail=id2labeled_span[relation["arg2_id"]],
head=id2labeled_span[arg1_id],
tail=id2labeled_span[arg2_id],
label=relation["type"],
)
)
document.metadata["relation_ids"].append(relation["id"])
relation_id = "R" + relation["id"].split("_")[1]
document.metadata["relation_ids"].append(relation_id)

return document


Expand Down Expand Up @@ -89,17 +96,23 @@ def example2drugprot_bigbio(example: Dict[str, Any]) -> DrugprotBigbioDocument:
label=span["type"],
)
document.entities.append(labeled_span)
document.metadata["entity_ids"].append(span["id"])
id2labeled_span[span["id"]] = labeled_span
entity_id = span["id"].split("_")[1]
document.metadata["entity_ids"].append(entity_id)
id2labeled_span[entity_id] = labeled_span

for relation in example["relations"]:
arg1_id = relation["arg1_id"].split("_")[1]
arg2_id = relation["arg2_id"].split("_")[1]
document.relations.append(
BinaryRelation(
head=id2labeled_span[relation["arg1_id"]],
tail=id2labeled_span[relation["arg2_id"]],
head=id2labeled_span[arg1_id],
tail=id2labeled_span[arg2_id],
label=relation["type"],
)
)
document.metadata["relation_ids"].append(relation["id"])
relation_id = "R" + relation["id"].split("_")[1]
document.metadata["relation_ids"].append(relation_id)

return document


Expand All @@ -108,7 +121,7 @@ def drugprot2example(doc: DrugprotDocument) -> Dict[str, Any]:
for i, entity in enumerate(doc.entities):
entities.append(
{
"id": doc.metadata["entity_ids"][i],
"id": doc.id + "_" + doc.metadata["entity_ids"][i],
"type": entity.label,
"text": doc.text[entity.start : entity.end],
"offset": [entity.start, entity.end],
Expand All @@ -119,9 +132,13 @@ def drugprot2example(doc: DrugprotDocument) -> Dict[str, Any]:
for i, relation in enumerate(doc.relations):
relations.append(
{
"id": doc.metadata["relation_ids"][i],
"arg1_id": doc.metadata["entity_ids"][doc.entities.index(relation.head)],
"arg2_id": doc.metadata["entity_ids"][doc.entities.index(relation.tail)],
"id": doc.id + "_" + doc.metadata["relation_ids"][i][1:],
"arg1_id": doc.id
+ "_"
+ doc.metadata["entity_ids"][doc.entities.index(relation.head)],
"arg2_id": doc.id
+ "_"
+ doc.metadata["entity_ids"][doc.entities.index(relation.tail)],
"type": relation.label,
}
)
Expand All @@ -141,7 +158,7 @@ def drugprot_bigbio2example(doc: DrugprotBigbioDocument) -> Dict[str, Any]:
for i, entity in enumerate(doc.entities):
entities.append(
{
"id": doc.metadata["entity_ids"][i],
"id": doc.id + "_" + doc.metadata["entity_ids"][i],
"normalized": [],
"offsets": [[entity.start, entity.end]],
"type": entity.label,
Expand All @@ -153,9 +170,13 @@ def drugprot_bigbio2example(doc: DrugprotBigbioDocument) -> Dict[str, Any]:
for i, relation in enumerate(doc.relations):
relations.append(
{
"id": doc.metadata["relation_ids"][i],
"arg1_id": doc.metadata["entity_ids"][doc.entities.index(relation.head)],
"arg2_id": doc.metadata["entity_ids"][doc.entities.index(relation.tail)],
"id": doc.id + "_" + doc.metadata["relation_ids"][i][1:],
"arg1_id": doc.id
+ "_"
+ doc.metadata["entity_ids"][doc.entities.index(relation.head)],
"arg2_id": doc.id
+ "_"
+ doc.metadata["entity_ids"][doc.entities.index(relation.tail)],
"normalized": [],
"type": relation.label,
}
Expand Down
58 changes: 30 additions & 28 deletions tests/dataset_builders/pie/drugprot/test_drugprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,22 +365,23 @@ def test_example_to_document(document, dataset_variant):
("GENE-Y", "RDH12"),
("GENE-N", "retinol dehydrogenase"),
]
# check entity ids
# check metadata
assert document.metadata["entity_ids"] == [
"17512723_T1",
"17512723_T2",
"17512723_T3",
"17512723_T4",
"17512723_T5",
"17512723_T6",
"17512723_T7",
"17512723_T8",
"17512723_T9",
"17512723_T10",
"17512723_T11",
"17512723_T12",
"17512723_T13",
"T1",
"T2",
"T3",
"T4",
"T5",
"T6",
"T7",
"T8",
"T9",
"T10",
"T11",
"T12",
"T13",
]
assert document.metadata["relation_ids"] == ["R0"]

# check the relations
assert document.relations.resolve() == [
Expand Down Expand Up @@ -511,22 +512,23 @@ def test_converted_document(converted_document, converted_document_type):
("GENE-Y", "RDH12"),
("GENE-N", "retinol dehydrogenase"),
]
# check entity ids
# check metadata
assert converted_document.metadata["entity_ids"] == [
"17512723_T1",
"17512723_T2",
"17512723_T3",
"17512723_T4",
"17512723_T5",
"17512723_T6",
"17512723_T7",
"17512723_T8",
"17512723_T9",
"17512723_T10",
"17512723_T11",
"17512723_T12",
"17512723_T13",
"T1",
"T2",
"T3",
"T4",
"T5",
"T6",
"T7",
"T8",
"T9",
"T10",
"T11",
"T12",
"T13",
]
assert converted_document.metadata["relation_ids"] == ["R0"]

# check the relations
assert converted_document.binary_relations.resolve() == [
Expand Down

0 comments on commit ea1a6c8

Please sign in to comment.