Skip to content

Commit

Permalink
Tbga dataset (#140)
Browse files Browse the repository at this point in the history
* initial commit

* Test hf dataset

* Test example to document

* document to example and pie dataset

* Adjusted test

* Document Converters

* Test adjustment

* Added README.md

* initial commit

* Test hf dataset

* Test example to document

* document to example and pie dataset

* Adjusted test

* Document Converters

* Test adjustment

* Added README.md

* Update tests/dataset_builders/pie/tbga/test_tbga.py

Co-authored-by: ArneBinder <[email protected]>

---------

Co-authored-by: ArneBinder <[email protected]>
  • Loading branch information
kai-car and ArneBinder authored Jul 19, 2024
1 parent 5bdcaca commit ae49e5d
Show file tree
Hide file tree
Showing 4 changed files with 418 additions and 0 deletions.
34 changes: 34 additions & 0 deletions dataset_builders/pie/tbga/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# PIE Dataset Card for "TBGA"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[TBGA Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/tbga).

## Data Schema

The document type for this dataset is `TbgaDocument` which defines the following data fields:

- `text` (str)

and the following annotation layers:

- `entities` (annotation type: `SpanWithIdAndName`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `entities`)

`SpanWithIdAndName` is a custom annotation type that extends the standard `Span` with the following data fields:

- `id` (str, for entity identification)
- `name` (str, entity string between span start and end)

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) and
[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation
type definitions.

## Document Converters

The dataset provides predefined document converters for the following target document types:

- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations`

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) and
[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
definitions.
1 change: 1 addition & 0 deletions dataset_builders/pie/tbga/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pie-datasets>=0.6.0,<0.11.0
119 changes: 119 additions & 0 deletions dataset_builders/pie/tbga/tbga.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import dataclasses
from typing import Any

import datasets
from pytorch_ie import AnnotationLayer, annotation_field
from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
from pytorch_ie.documents import (
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
)

from pie_datasets import ArrowBasedBuilder, GeneratorBasedBuilder


@dataclasses.dataclass(frozen=True)
class SpanWithIdAndName(Span):
    """A ``Span`` annotation that additionally carries an ``id`` and a ``name``."""

    # Identifier of the entity, stored as a string.
    id: str
    # Name of the entity.
    name: str

    def resolve(self) -> Any:
        """Return the tuple ``(id, name, <result of the parent resolve()>)``."""
        resolved_span = super().resolve()
        return (self.id, self.name, resolved_span)


@dataclasses.dataclass
class TbgaDocument(TextBasedDocument):
    """Text-based document with an ``entities`` layer and a ``relations`` layer."""

    # Entity spans (with id and name); the layer targets the ``text`` field.
    entities: AnnotationLayer[SpanWithIdAndName] = annotation_field(target="text")
    # Binary relations; the layer targets the ``entities`` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")


def example_to_document(example) -> TbgaDocument:
    """Convert one example of the base dataset into a ``TbgaDocument``.

    The example is read through the keys ``"text"``, ``"relation"``, ``"h"``
    (head) and ``"t"`` (tail). Each of ``"h"``/``"t"`` provides ``"id"``,
    ``"name"`` and ``"pos"``, where ``pos[0]`` is the start offset and
    ``pos[1]`` the length of the mention.

    Returns:
        A document holding two entities (head first, then tail) and a single
        relation from head to tail labeled with ``example["relation"]``.
    """
    document = TbgaDocument(text=example["text"])

    head_raw = example["h"]
    tail_raw = example["t"]
    head_offset = head_raw["pos"][0]
    tail_offset = tail_raw["pos"][0]

    head = SpanWithIdAndName(
        # the original dataset stores the head id as an integer, but a string is required
        id=str(head_raw["id"]),
        name=head_raw["name"],
        start=head_offset,
        # "pos" holds (start, length), so the end offset is start + length
        end=head_offset + head_raw["pos"][1],
    )
    tail = SpanWithIdAndName(
        # the tail id is passed through unchanged (presumably already a string)
        id=tail_raw["id"],
        name=tail_raw["name"],
        start=tail_offset,
        end=tail_offset + tail_raw["pos"][1],
    )
    document.entities.extend([head, tail])

    document.relations.append(
        BinaryRelation(head=head, tail=tail, label=example["relation"])
    )
    return document


def document_to_example(document):
    """Convert a document back into the dict layout of the base dataset.

    The first entity is treated as the head and the second as the tail; the
    label is taken from the first relation. The head id is cast to ``int``,
    the tail id is emitted unchanged, and each ``"pos"`` entry is written as
    ``[start, length]``.
    """
    head_span = document.entities[0]
    tail_span = document.entities[1]

    head_entry = {
        "id": int(head_span.id),
        "name": head_span.name,
        "pos": [head_span.start, head_span.end - head_span.start],
    }
    tail_entry = {
        "id": tail_span.id,
        "name": tail_span.name,
        "pos": [tail_span.start, tail_span.end - tail_span.start],
    }

    example = {"text": document.text}
    example["relation"] = document.relations[0].label
    example["h"] = head_entry
    example["t"] = tail_entry
    return example


def convert_to_text_document_with_labeled_spans_and_binary_relations(
    document: TbgaDocument,
) -> TextDocumentWithLabeledSpansAndBinaryRelations:
    """Convert a ``TbgaDocument`` into a generic labeled-span / binary-relation document.

    Every entity becomes a ``LabeledSpan`` with the fixed label ``"ENTITY"``.
    The single relation is re-created between the corresponding new spans.
    Entity ids and names, which ``LabeledSpan`` is not given here, are stored
    in the result's metadata under ``"entity_ids"`` and ``"entity_names"``,
    in entity order.

    Raises:
        ValueError: If the document does not contain exactly one relation.
    """
    converted = TextDocumentWithLabeledSpansAndBinaryRelations(text=document.text)

    # Maps each source entity to the labeled span created for it.
    span_mapping = {}
    for source_span in document.entities:  # in our case two entities (head and tail)
        new_span = LabeledSpan(start=source_span.start, end=source_span.end, label="ENTITY")
        converted.labeled_spans.append(new_span)
        span_mapping[source_span] = new_span

    # Exactly one relation between the two entities is expected.
    if len(document.relations) != 1:
        raise ValueError(f"Expected exactly one relation, got {len(document.relations)}")
    source_relation = document.relations[0]

    converted.binary_relations.append(
        BinaryRelation(
            head=span_mapping[source_relation.head],
            tail=span_mapping[source_relation.tail],
            label=source_relation.label,
        )
    )

    converted.metadata["entity_ids"] = [span.id for span in document.entities]
    converted.metadata["entity_names"] = [span.name for span in document.entities]
    return converted


class Tbga(ArrowBasedBuilder):
    """Dataset builder that produces ``TbgaDocument`` objects from ``DFKI-SLT/tbga``."""

    DOCUMENT_TYPE = TbgaDocument

    # Base dataset and the exact revision of it that this builder uses.
    BASE_DATASET_PATH = "DFKI-SLT/tbga"
    BASE_DATASET_REVISION = "78575b79aa1c6ff7712bfa0f0eb0e3d01d80e9bc"

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            description="TBGA dataset",
            version=datasets.Version("1.0.0"),
        ),
    ]

    # Target document type -> converter function.
    DOCUMENT_CONVERTERS = {
        TextDocumentWithLabeledSpansAndBinaryRelations: (
            convert_to_text_document_with_labeled_spans_and_binary_relations
        ),
    }

    def _generate_document(self, example, **kwargs):
        """Build a document from a base dataset example; extra kwargs are ignored."""
        document = example_to_document(example)
        return document

    def _generate_example(self, document, **kwargs):
        """Build a base dataset example from a document; extra kwargs are ignored."""
        example = document_to_example(document)
        return example
Loading

0 comments on commit ae49e5d

Please sign in to comment.