ArneBinder · kai-car · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/dataset_builders/pie/biorel/README.md b/dataset_builders/pie/biorel/README.md
@@ -0,0 +1,34 @@
+# PIE Dataset Card for "BioRel"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[BioRel Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/BioRel).
+
+## Data Schema
+
+The document type for this dataset is `BioRelDocument` which defines the following data fields:
+
+- `text` (str)
+
+and the following annotation layers:
+
+- `entities` (annotation type: `SpanWithIdAndName`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `entities`)
+
+`SpanWithIdAndName` is a custom annotation type that extends the standard `Span` annotation with the following data fields:
+
+- `id` (str, for entity identification)
+- `name` (str, entity string between span start and end)
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) and
+[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation
+type definitions.
+
+## Document Converters
+
+The dataset provides predefined document converters for the following target document types:
+
+- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations`
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) and
+[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/biorel/biorel.py b/dataset_builders/pie/biorel/biorel.py
@@ -0,0 +1,135 @@
+import dataclasses
+import logging
+from typing import Any
+
+import datasets
+from pytorch_ie import AnnotationLayer, annotation_field
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
+
+from pie_datasets import ArrowBasedBuilder, GeneratorBasedBuilder
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass(frozen=True)
+class SpanWithIdAndName(Span):
+    """A `Span` that additionally carries an `id` and a `name`."""
+
+    id: str
+    name: str
+
+    def resolve(self) -> Any:
+        """Return the tuple `(id, name, <result of Span.resolve()>)`."""
+        resolved_span = super().resolve()
+        return self.id, self.name, resolved_span
+
+
+@dataclasses.dataclass
+class BioRelDocument(TextBasedDocument):
+    """Text-based document type for BioRel with an entity layer and a relation layer."""
+
+    # entity spans; this layer targets the document `text`
+    entities: AnnotationLayer[SpanWithIdAndName] = annotation_field(target="text")
+    # binary relations; this layer targets the `entities` layer
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+def example_to_document(example):
+    """Build a `BioRelDocument` from a single BioRel example.
+
+    Reads `example["text"]`, the head (`"h"`) and tail (`"t"`) arguments (each providing
+    `"id"`, `"name"` and a `"pos"` sequence whose first two items become span start and end)
+    and `example["relation"]`. The returned document holds the head and the tail entity
+    (in this order) and one relation between them.
+    """
+
+    def _argument_span(key):
+        # "h" and "t" are read with the same keys, so both go through this helper
+        argument = example[key]
+        return SpanWithIdAndName(
+            id=argument["id"],
+            name=argument["name"],
+            start=argument["pos"][0],
+            end=argument["pos"][1],
+        )
+
+    document = BioRelDocument(text=example["text"])
+    head, tail = _argument_span("h"), _argument_span("t")
+    document.entities.extend([head, tail])
+    document.relations.append(
+        BinaryRelation(head=head, tail=tail, label=example["relation"])
+    )
+    return document
+
+
+def document_to_example(document):
+    """Convert a `BioRelDocument` back into a BioRel example dict.
+
+    Head and tail are taken from the document's single relation instead of from the
+    position of the entities in the `entities` layer, so the result does not depend on the
+    order in which the entities were added to the document.
+
+    Args:
+        document: object with `text` and `relations`; the single relation provides `label`,
+            `head` and `tail`, and both arguments provide `id`, `name`, `start` and `end`.
+
+    Returns:
+        A dict with the keys "text", "relation", "h" and "t", where "h" and "t" are dicts
+        with the keys "id", "name" and "pos" (`[start, end]`).
+
+    Raises:
+        ValueError: If the document does not contain exactly one relation.
+    """
+    # same constraint (and message) as the document converter in this module
+    if len(document.relations) != 1:
+        raise ValueError(f"Expected exactly one relation, got {len(document.relations)}")
+    relation = document.relations[0]
+
+    def _argument(span):
+        return {"id": span.id, "name": span.name, "pos": [span.start, span.end]}
+
+    return {
+        "text": document.text,
+        "relation": relation.label,
+        "h": _argument(relation.head),
+        "t": _argument(relation.tail),
+    }
+
+
+def convert_to_text_document_with_labeled_spans_and_binary_relations(
+    document: BioRelDocument,
+) -> TextDocumentWithLabeledSpansAndBinaryRelations:
+    """Convert a `BioRelDocument` to `TextDocumentWithLabeledSpansAndBinaryRelations`.
+
+    Each entity becomes a `LabeledSpan` with the fixed label "ENTITY"; the single relation
+    is re-created between the new spans. The entity ids and names are stored in the
+    metadata of the result under "entity_ids" and "entity_names".
+
+    Raises:
+        ValueError: If the document does not contain exactly one relation.
+    """
+    result = TextDocumentWithLabeledSpansAndBinaryRelations(text=document.text)
+    entity_to_span = {}
+
+    for entity in document.entities:
+        span = LabeledSpan(start=entity.start, end=entity.end, label="ENTITY")
+        result.labeled_spans.append(span)
+
+        # warn if the string form of the new span differs from the recorded entity name
+        if str(span) != entity.name:
+            logger.warning(
+                f"Expected labeled span text to be '{entity.name}', got '{span}'"
+            )
+
+        # remember which new span replaces which original entity
+        entity_to_span[entity] = span
+
+    relation_count = len(document.relations)
+    if relation_count != 1:
+        raise ValueError(f"Expected exactly one relation, got {relation_count}")
+    source_relation = document.relations[0]
+
+    result.binary_relations.append(
+        BinaryRelation(
+            head=entity_to_span[source_relation.head],
+            tail=entity_to_span[source_relation.tail],
+            label=source_relation.label,
+        )
+    )
+    result.metadata["entity_ids"] = [entity.id for entity in document.entities]
+    result.metadata["entity_names"] = [entity.name for entity in document.entities]
+
+    return result
+
+
+class BioRelConfig(datasets.BuilderConfig):
+    """BuilderConfig for BioRel; adds nothing on top of `datasets.BuilderConfig`."""
+
+    pass
+
+
+class BioRel(ArrowBasedBuilder):
+    """PIE dataset builder for BioRel on top of the "DFKI-SLT/BioRel" base dataset.
+
+    Examples are converted to `BioRelDocument`s and back via the module-level functions
+    `example_to_document` and `document_to_example`.
+    """
+
+    # document type produced by this builder
+    DOCUMENT_TYPE = BioRelDocument
+    # base dataset path and the revision of it that is used
+    BASE_DATASET_PATH = "DFKI-SLT/BioRel"
+    BASE_DATASET_REVISION = "e4869c484c582cfbc7ead10d4d421bd4b275fa4e"
+    # BASE_CONFIG_KWARGS_DICT = None
+
+    # a single configuration named "biorel"
+    BUILDER_CONFIGS = [
+        BioRelConfig(
+            name="biorel",
+            version=datasets.Version("1.0.0"),
+            description="BioRel dataset",
+        )
+    ]
+
+    # maps a target document type to the function that converts a BioRelDocument to it
+    DOCUMENT_CONVERTERS = {
+        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
+    }
+
+    def _generate_document(self, example, **kwargs):
+        """Convert one base-dataset example to a `BioRelDocument`; `kwargs` are ignored."""
+        return example_to_document(example)
+
+    def _generate_example(self, document: BioRelDocument, **kwargs):
+        """Convert a `BioRelDocument` back to an example dict; `kwargs` are ignored."""
+        return document_to_example(document)
diff --git a/dataset_builders/pie/biorel/requirements.txt b/dataset_builders/pie/biorel/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.6.0,<0.11.0
diff --git a/dataset_builders/pie/chemprot/README.md b/dataset_builders/pie/chemprot/README.md
diff --git a/dataset_builders/pie/chemprot/chemprot.py b/dataset_builders/pie/chemprot/chemprot.py
@@ -0,0 +1,175 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Union
+
+import datasets
+from pytorch_ie import Document
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan
+from pytorch_ie.documents import AnnotationLayer, TextBasedDocument, annotation_field
+
+from pie_datasets import GeneratorBasedBuilder
+
+
+@dataclass
+class ChemprotDocument(TextBasedDocument):
+    """Document type with labeled entity spans and binary relations between them."""
+
+    # used by chemprot_full_source and chemprot_shared_task_eval_source
+    # entity spans; this layer targets the document `text`
+    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # binary relations; this layer targets the `entities` layer
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+@dataclass
+class ChemprotBigbioDocument(TextBasedDocument):
+    """Document type with passage spans, entity spans and binary relations."""
+
+    # TODO: check if correct
+    # passage spans; this layer targets the document `text`
+    passages: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # entity spans; this layer targets the document `text`
+    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # binary relations; this layer targets the `entities` layer
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+def example_to_chemprot_doc(example) -> ChemprotDocument:
+    """Build a `ChemprotDocument` from a column-oriented ChemProt example.
+
+    `example["entities"]` and `example["relations"]` are dicts of parallel lists. Each
+    entity becomes a `LabeledSpan` (start/end from its offsets, label from its type); its id
+    is recorded, in entity order, in `metadata["entity_ids"]`. Each relation becomes a
+    `BinaryRelation` whose head and tail are looked up via the "arg1" / "arg2" entity ids.
+    """
+    doc = ChemprotDocument(
+        text=example["text"],
+        id=example["pmid"],
+        metadata={"entity_ids": []},
+    )
+    span_by_id: Dict[str, LabeledSpan] = {}
+
+    source_entities = example["entities"]
+    for idx, entity_id in enumerate(source_entities["id"]):
+        # entities have "text" field: already included through the offset?
+        offsets = source_entities["offsets"][idx]
+        span = LabeledSpan(
+            start=offsets[0],
+            end=offsets[1],
+            label=source_entities["type"][idx],
+        )
+        doc.entities.append(span)
+        doc.metadata["entity_ids"].append(entity_id)
+        span_by_id[entity_id] = span
+
+    source_relations = example["relations"]
+    for idx, relation_type in enumerate(source_relations["type"]):
+        doc.relations.append(
+            BinaryRelation(
+                head=span_by_id[source_relations["arg1"][idx]],
+                tail=span_by_id[source_relations["arg2"][idx]],
+                label=relation_type,
+            )
+        )
+
+    return doc
+
+
+def example_to_chemprot_bigbio_doc(example) -> ChemprotBigbioDocument:
+    """Build a `ChemprotBigbioDocument` from a row-oriented (BigBio kb) example.
+
+    The document text is the text of all passages joined with single spaces. Passages and
+    entities become `LabeledSpan`s built from the first offset pair of each item; entity
+    ids are recorded, in entity order, in `metadata["entity_ids"]`. Relations become
+    `BinaryRelation`s whose arguments are looked up via "arg1_id" / "arg2_id".
+    """
+    passage_texts = [" ".join(passage["text"]) for passage in example["passages"]]
+    doc = ChemprotBigbioDocument(
+        text=" ".join(passage_texts),
+        id=example["document_id"],
+        metadata={"entity_ids": []},
+    )
+    span_by_id: Dict[str, LabeledSpan] = {}
+
+    # NOTE(review): offsets are used unchanged although the text is re-assembled with
+    # single spaces - presumably they line up for this dataset; TODO confirm.
+    for passage in example["passages"]:
+        first_offsets = passage["offsets"][0]
+        passage_span = LabeledSpan(
+            start=first_offsets[0],
+            end=first_offsets[1],
+            label=passage["type"],
+        )
+        doc.passages.append(passage_span)
+
+    for entity in example["entities"]:
+        # entities have "text" field: already included through the offset?
+        # NOTE(review): only the first offset pair of an entity is used.
+        first_offsets = entity["offsets"][0]
+        entity_span = LabeledSpan(
+            start=first_offsets[0],
+            end=first_offsets[1],
+            label=entity["type"],
+        )
+        doc.entities.append(entity_span)
+        doc.metadata["entity_ids"].append(entity["id"])
+        span_by_id[entity["id"]] = entity_span
+
+    for relation in example["relations"]:
+        head_span = span_by_id[relation["arg1_id"]]
+        tail_span = span_by_id[relation["arg2_id"]]
+        doc.relations.append(
+            BinaryRelation(head=head_span, tail=tail_span, label=relation["type"])
+        )
+
+    return doc
+
+
+def chemprot_doc_to_example(doc: ChemprotDocument) -> Dict[str, Any]:
+    """Convert a `ChemprotDocument` back into a column-oriented ChemProt example.
+
+    The entity ids are read from `doc.metadata["entity_ids"]`, which is expected to be
+    aligned with `doc.entities` (same length, same order). The relation arguments "arg1" and
+    "arg2" are the ids of the head and tail entity of each relation.
+
+    Args:
+        doc: The document to convert.
+
+    Returns:
+        A dict with the keys "text", "pmid", "entities" (parallel lists "id", "offsets",
+        "text", "type") and "relations" (parallel lists "arg1", "arg2", "type").
+
+    Raises:
+        ValueError: If the number of stored entity ids differs from the number of entities.
+        KeyError: If `metadata` has no "entity_ids" entry or a relation argument is not
+            contained in `doc.entities`.
+    """
+    # copy the id list so that the returned example does not alias the document metadata
+    entity_ids = list(doc.metadata["entity_ids"])
+    doc_entities = list(doc.entities)
+    if len(entity_ids) != len(doc_entities):
+        raise ValueError(
+            f"Expected one entity id per entity, got {len(entity_ids)} ids "
+            f"for {len(doc_entities)} entities"
+        )
+
+    # map each span to its id; if two spans compare equal, the first id is kept
+    span_to_id: Dict[Any, Any] = {}
+    for entity, entity_id in zip(doc_entities, entity_ids):
+        span_to_id.setdefault(entity, entity_id)
+
+    entities = {
+        "id": entity_ids,
+        "offsets": [[entity.start, entity.end] for entity in doc_entities],
+        "text": [doc.text[entity.start : entity.end] for entity in doc_entities],
+        "type": [entity.label for entity in doc_entities],
+    }
+
+    doc_relations = list(doc.relations)
+    relations = {
+        "arg1": [span_to_id[relation.head] for relation in doc_relations],
+        "arg2": [span_to_id[relation.tail] for relation in doc_relations],
+        "type": [relation.label for relation in doc_relations],
+    }
+
+    return {
+        "text": doc.text,
+        "pmid": doc.id,
+        "entities": entities,
+        "relations": relations,
+    }
+
+
+class ChemprotConfig(datasets.BuilderConfig):
+    """BuilderConfig for ChemProt; adds nothing on top of `datasets.BuilderConfig`."""
+
+    pass
+
+
+class Chemprot(GeneratorBasedBuilder):
+    """PIE dataset builder for ChemProt on top of the "bigbio/chemprot" base dataset.
+
+    The two source-style configurations produce `ChemprotDocument`s, the BigBio kb
+    configuration produces `ChemprotBigbioDocument`s.
+    """
+
+    DOCUMENT_TYPES = {  # Note ChemprotDocument is used twice
+        "chemprot_full_source": ChemprotDocument,
+        "chemprot_bigbio_kb": ChemprotBigbioDocument,
+        "chemprot_shared_task_eval_source": ChemprotDocument,
+    }
+
+    BASE_DATASET_PATH = "bigbio/chemprot"
+    BASE_DATASET_REVISION = "86afccf3ccc614f817a7fad0692bf62fbc5ce469"
+
+    BUILDER_CONFIGS = [
+        ChemprotConfig(
+            name="chemprot_full_source",
+            version=datasets.Version("1.0.0"),
+            description="ChemProt full source version",
+        ),
+        ChemprotConfig(
+            name="chemprot_bigbio_kb",
+            version=datasets.Version("1.0.0"),
+            description="ChemProt BigBio kb version",
+        ),
+        ChemprotConfig(
+            name="chemprot_shared_task_eval_source",
+            version=datasets.Version("1.0.0"),
+            description="ChemProt shared task eval source version",
+        ),
+    ]
+
+    def _generate_document(self, example, **kwargs):
+        """Convert one base-dataset example to the document type of the active config."""
+        if self.config.name == "chemprot_bigbio_kb":
+            return example_to_chemprot_bigbio_doc(example)
+        else:
+            return example_to_chemprot_doc(example)
+
+    def _generate_example(self, document: Document, **kwargs) -> Dict[str, Any]:
+        """Convert a document back to an example of the base dataset.
+
+        `ChemprotDocument`s are handed to `chemprot_doc_to_example`. This replaces a bare
+        `pass`, which returned `None` despite the declared return type.
+
+        Raises:
+            NotImplementedError: If `document` is a `ChemprotBigbioDocument`, for which
+                this module provides no inverse conversion.
+        """
+        if isinstance(document, ChemprotBigbioDocument):
+            raise NotImplementedError(
+                "Converting a ChemprotBigbioDocument back to an example is not implemented"
+            )
+        return chemprot_doc_to_example(document)
diff --git a/dataset_builders/pie/chemprot/requirements.txt b/dataset_builders/pie/chemprot/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.6.0,<0.11.0