Skip to content

Commit

Permalink
Chemprot dataset [WIP] (#138)
Browse files Browse the repository at this point in the history
* Created new branch for chemprot dataset

* Working on document_to_example

* Exchanged example split from "test" to "sample"

* Tested pie dataset

* Document converters + tests

* Final adjustments

* Adjusted according to review

* Adjusted according to review

* Wrote README.md

* Wrote README.md
  • Loading branch information
kai-car authored Jul 18, 2024
1 parent 1a19cdf commit 5bdcaca
Show file tree
Hide file tree
Showing 4 changed files with 707 additions and 0 deletions.
48 changes: 48 additions & 0 deletions dataset_builders/pie/chemprot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# PIE Dataset Card for "ChemProt"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[ChemProt Huggingface dataset loading script](https://huggingface.co/datasets/bigbio/chemprot).

## Data Schema

Three versions of the dataset are supported: `chemprot_full_source`, `chemprot_shared_task_eval_source`, and `chemprot_bigbio_kb`.

#### `ChemprotDocument` for `chemprot_full_source` and `chemprot_shared_task_eval_source`

defines the following fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)

and the following annotation layers:

- `entities` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `entities`)

#### `ChemprotBigbioDocument` for `chemprot_bigbio_kb`

defines the following fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)

and the following annotation layers:

- `passages` (annotation type: `LabeledSpan`, target: `text`)
- `entities` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `entities`)

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) for the annotation
type definitions.

## Document Converters

The dataset provides predefined document converters for the following target document types:

- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations` for `ChemprotDocument`
- `pie_modules.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions` for `ChemprotBigbioDocument`

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) for the document type
definitions.
273 changes: 273 additions & 0 deletions dataset_builders/pie/chemprot/chemprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from dataclasses import dataclass
from typing import Any, Dict

import datasets
from pytorch_ie import Document
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.documents import (
AnnotationLayer,
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
annotation_field,
)

from pie_datasets import GeneratorBasedBuilder


@dataclass
class ChemprotDocument(TextBasedDocument):
    """Document type for the two source-schema configurations.

    Used by ``chemprot_full_source`` and ``chemprot_shared_task_eval_source``.
    """

    # Labeled spans; this layer targets the document ``text``.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Binary relations; this layer targets the ``entities`` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")


@dataclass
class ChemprotBigbioDocument(TextBasedDocument):
    """Document type for the ``chemprot_bigbio_kb`` configuration."""

    # Labeled spans for passages; this layer targets the document ``text``.
    passages: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Labeled spans for entities; this layer targets the document ``text``.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Binary relations; this layer targets the ``entities`` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")


def example_to_chemprot_doc(example) -> ChemprotDocument:
    """Convert one source-schema example into a ``ChemprotDocument``.

    The example is read as a mapping with ``text``, ``pmid``, a column-wise
    ``entities`` mapping (``id``, ``offsets``, ``type``) and a column-wise
    ``relations`` mapping (``arg1``, ``arg2``, ``type``).

    The original entity ids are stored, in entity order, under
    ``metadata["entity_ids"]`` so that they can be restored later.

    Raises:
        KeyError: if a relation argument is not among the entity ids.
    """
    doc = ChemprotDocument(
        text=example["text"],
        id=example["pmid"],
        metadata={"entity_ids": []},
    )

    # Maps the source entity id to the span created for it, so relation
    # arguments (given as ids) can be resolved to annotation objects.
    spans_by_id: Dict[str, LabeledSpan] = {}

    entity_columns = example["entities"]
    for position, entity_id in enumerate(entity_columns["id"]):
        offsets = entity_columns["offsets"][position]
        span = LabeledSpan(
            start=offsets[0],
            end=offsets[1],
            label=entity_columns["type"][position],
        )
        doc.entities.append(span)
        doc.metadata["entity_ids"].append(entity_id)
        spans_by_id[entity_id] = span

    relation_columns = example["relations"]
    for position, relation_type in enumerate(relation_columns["type"]):
        head_span = spans_by_id[relation_columns["arg1"][position]]
        tail_span = spans_by_id[relation_columns["arg2"][position]]
        doc.relations.append(
            BinaryRelation(head=head_span, tail=tail_span, label=relation_type)
        )

    return doc


def example_to_chemprot_bigbio_doc(example) -> ChemprotBigbioDocument:
    """Convert one BigBio-kb example into a ``ChemprotBigbioDocument``.

    The document text is built by joining the text pieces of every passage
    (and then the passages themselves) with single spaces. For passages and
    entities only the first offset pair (``offsets[0]``) is used.

    The example ``id``, the entity ids, and the ``[arg1_id, arg2_id]`` pair of
    every relation are kept in the document metadata (``id``, ``entity_ids``,
    ``relation_ids``).

    Raises:
        KeyError: if a relation argument id is not among the entity ids.
    """
    passage_texts = [" ".join(passage["text"]) for passage in example["passages"]]

    doc = ChemprotBigbioDocument(
        text=" ".join(passage_texts),
        id=example["document_id"],
        metadata={"id": example["id"], "entity_ids": [], "relation_ids": []},
    )

    for passage in example["passages"]:
        first_offsets = passage["offsets"][0]
        doc.passages.append(
            LabeledSpan(start=first_offsets[0], end=first_offsets[1], label=passage["type"])
        )

    # Maps the BigBio entity id to the span created for it, so relation
    # arguments (given as ids) can be resolved to annotation objects.
    spans_by_id: Dict[str, LabeledSpan] = {}
    for entity in example["entities"]:
        first_offsets = entity["offsets"][0]
        span = LabeledSpan(start=first_offsets[0], end=first_offsets[1], label=entity["type"])
        doc.entities.append(span)
        doc.metadata["entity_ids"].append(entity["id"])
        spans_by_id[entity["id"]] = span

    for relation in example["relations"]:
        head_id = relation["arg1_id"]
        tail_id = relation["arg2_id"]
        doc.relations.append(
            BinaryRelation(
                head=spans_by_id[head_id],
                tail=spans_by_id[tail_id],
                label=relation["type"],
            )
        )
        doc.metadata["relation_ids"].append([head_id, tail_id])

    return doc


def chemprot_doc_to_example(doc: ChemprotDocument) -> Dict[str, Any]:
    """Convert a ``ChemprotDocument`` back into a source-schema example.

    Entity ids are read from ``doc.metadata["entity_ids"]`` and paired with
    ``doc.entities`` by position. Relation arguments are written as the ids
    of their head and tail entities.

    Args:
        doc: the document to convert; ``metadata["entity_ids"]`` must hold
            one id per entity, in entity order.

    Returns:
        A dict with ``text``, ``pmid``, and column-wise ``entities``
        (``id``, ``offsets``, ``text``, ``type``) and ``relations``
        (``arg1``, ``arg2``, ``type``) mappings.

    Raises:
        ValueError: if two entities compare equal, because relation
            arguments could then not be mapped back to a unique id.
        KeyError: if a relation head or tail is not one of the paired
            entities.
    """
    entities: Dict[str, list] = {
        "id": [],
        "offsets": [],
        "text": [],
        "type": [],
    }
    relations: Dict[str, list] = {
        "arg1": [],
        "arg2": [],
        "type": [],
    }

    # Reverse lookup (entity annotation -> source id) used to serialize the
    # relation arguments. Kept separate from any id -> entity mapping so the
    # duplicate check below tests exactly what it claims to test.
    entity2entity_id: Dict[Any, str] = {}

    # NOTE: zip stops at the shorter of the two sequences.
    for entity_id, entity in zip(doc.metadata["entity_ids"], doc.entities):
        if entity in entity2entity_id:
            raise ValueError("Entity already exists in entity2entity_id")
        entity2entity_id[entity] = entity_id

        entities["id"].append(entity_id)
        entities["offsets"].append([entity.start, entity.end])
        entities["text"].append(doc.text[entity.start : entity.end])
        entities["type"].append(entity.label)

    for relation in doc.relations:
        relations["arg1"].append(entity2entity_id[relation.head])
        relations["arg2"].append(entity2entity_id[relation.tail])
        relations["type"].append(relation.label)

    return {
        "text": doc.text,
        "pmid": doc.id,
        "entities": entities,
        "relations": relations,
    }


def chemprot_bigbio_doc_to_example(doc: ChemprotBigbioDocument) -> Dict[str, Any]:
    """Convert a ``ChemprotBigbioDocument`` back into a BigBio-kb example.

    Ids are regenerated from a running counter that starts at
    ``int(doc.metadata["id"])`` and is incremented once per passage, entity
    and relation, in that order. Passages and relations receive the counter
    value as their id; entities keep the id stored in
    ``doc.metadata["entity_ids"]`` (paired with ``doc.entities`` by position),
    but still advance the counter so that relation ids continue after them.

    Args:
        doc: the document to convert; ``metadata`` must provide ``id``
            (convertible to ``int``) and ``entity_ids``.

    Returns:
        A dict with ``id``, ``document_id``, ``passages``, ``entities``,
        ``events``, ``coreferences`` and ``relations``. ``events``,
        ``coreferences`` and all ``normalized`` fields are empty lists.

    Raises:
        ValueError: if two entities compare equal, because relation
            arguments could then not be mapped back to a unique id.
        KeyError: if a relation head or tail is not one of the paired
            entities.
    """
    # Running id counter; named so that it does not shadow the builtin ``id``.
    next_id = int(doc.metadata["id"])

    passages = []
    for passage in doc.passages:
        next_id += 1
        passages.append(
            {
                "id": str(next_id),
                "offsets": [[passage.start, passage.end]],
                "text": [doc.text[passage.start : passage.end]],
                "type": passage.label,
            }
        )

    entities = []
    # Reverse lookup (entity annotation -> BigBio id) used to serialize the
    # relation arguments.
    entity2entity_id: Dict[Any, str] = {}

    # NOTE: zip stops at the shorter of the two sequences.
    for entity_id, entity in zip(doc.metadata["entity_ids"], doc.entities):
        # The stored id is used for the entity itself; the counter is advanced
        # anyway so the relation ids below do not collide with entity ids.
        next_id += 1

        # The guard must look at the entity -> id mapping: a second, equal
        # entity would otherwise silently overwrite the first one's id and
        # relations would be serialized with the wrong argument id.
        if entity in entity2entity_id:
            raise ValueError("Entity already exists in entity2entity_id")
        entity2entity_id[entity] = entity_id

        entities.append(
            {
                "id": entity_id,
                "normalized": [],
                "offsets": [[entity.start, entity.end]],
                "text": [doc.text[entity.start : entity.end]],
                "type": entity.label,
            }
        )

    relations = []
    for relation in doc.relations:
        next_id += 1
        relations.append(
            {
                "id": str(next_id),
                "arg1_id": entity2entity_id[relation.head],
                "arg2_id": entity2entity_id[relation.tail],
                "type": relation.label,
                "normalized": [],
            }
        )

    return {
        "id": doc.metadata["id"],
        "document_id": doc.id,
        "passages": passages,
        "entities": entities,
        "events": [],
        "coreferences": [],
        "relations": relations,
    }


class Chemprot(GeneratorBasedBuilder):
    """Dataset builder wrapping the ``bigbio/chemprot`` base dataset.

    Three configurations are declared. The two ``*_source`` configurations
    share ``ChemprotDocument``; ``chemprot_bigbio_kb`` uses
    ``ChemprotBigbioDocument``.
    """

    # Note ChemprotDocument is used twice
    DOCUMENT_TYPES = {
        "chemprot_full_source": ChemprotDocument,
        "chemprot_bigbio_kb": ChemprotBigbioDocument,
        "chemprot_shared_task_eval_source": ChemprotDocument,
    }

    BASE_DATASET_PATH = "bigbio/chemprot"
    BASE_DATASET_REVISION = "86afccf3ccc614f817a7fad0692bf62fbc5ce469"

    # One config per (name, description) pair, all at version 1.0.0.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=config_name,
            version=datasets.Version("1.0.0"),
            description=config_description,
        )
        for config_name, config_description in (
            ("chemprot_full_source", "ChemProt full source version"),
            ("chemprot_bigbio_kb", "ChemProt BigBio kb version"),
            ("chemprot_shared_task_eval_source", "ChemProt shared task eval source version"),
        )
    ]

    @property
    def document_converters(self):
        """Return the converter mapping for the active configuration.

        The result maps a target document type to a dict that renames this
        dataset's annotation layers to the layer names of the target type.

        Raises:
            ValueError: if the configuration name is not one of the three
                declared configurations.
        """
        config_name = self.config.name

        if config_name in ("chemprot_full_source", "chemprot_shared_task_eval_source"):
            return {
                TextDocumentWithLabeledSpansAndBinaryRelations: {
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }

        if config_name == "chemprot_bigbio_kb":
            return {
                TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions: {
                    "passages": "labeled_partitions",
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }

        raise ValueError(f"Unknown dataset name: {self.config.name}")

    def _generate_document(self, example, **kwargs):
        """Convert a base-dataset example into a document.

        ``chemprot_bigbio_kb`` examples go through the BigBio converter; every
        other configuration name is handled by the source converter.
        """
        if self.config.name != "chemprot_bigbio_kb":
            return example_to_chemprot_doc(example)
        return example_to_chemprot_bigbio_doc(example)

    def _generate_example(self, document: Document, **kwargs) -> Dict[str, Any]:
        """Convert a document back into a base-dataset example.

        The converter is chosen by the document's type.

        Raises:
            ValueError: if the document is neither a ``ChemprotBigbioDocument``
                nor a ``ChemprotDocument``.
        """
        if isinstance(document, ChemprotBigbioDocument):
            return chemprot_bigbio_doc_to_example(document)
        if isinstance(document, ChemprotDocument):
            return chemprot_doc_to_example(document)
        raise ValueError(f"Unknown document type: {type(document)}")
1 change: 1 addition & 0 deletions dataset_builders/pie/chemprot/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pie-datasets>=0.6.0,<0.11.0
Loading

0 comments on commit 5bdcaca

Please sign in to comment.