Skip to content

Commit

Permalink
Merge pull request #15 from ArneBinder/add_dataset_cdcp
Browse files Browse the repository at this point in the history
Add AM dataset - CDCP
  • Loading branch information
ArneBinder authored Nov 9, 2023
2 parents d887d18 + 4f74a05 commit fd0c954
Show file tree
Hide file tree
Showing 8 changed files with 626 additions and 4 deletions.
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,8 @@ repos:
- id: codespell
args:
- --skip=logs/**,data/**,tests/fixtures/**
# hist: required for plotext.hist()
# ba: denotes beginning of an encoding with label as 'a'. More details at src/pie_utils/sequence_tagging/ill_formed.py
- --ignore-words-list=hist,ba
# arbitral: this is a legal term and used in example data (cdcp dataset)
- --ignore-words-list=arbitral

# python static type checking
- repo: https://github.com/pre-commit/mirrors-mypy
Expand Down
29 changes: 29 additions & 0 deletions dataset_builders/pie/cdcp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# PIE Dataset Card for "CDCP"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[CDCP Hugging Face dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).

## Data Schema

The document type for this dataset is `CDCPDocument` which defines the following data fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)

and the following annotation layers:

- `propositions` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
- `urls` (annotation type: `Attribute`, target: `propositions`)

See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation type definitions.

## Document Converters

The dataset provides document converters for the following target document types:

- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`

See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
definitions.
142 changes: 142 additions & 0 deletions dataset_builders/pie/cdcp/cdcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import dataclasses
import logging
from typing import Any, Callable, Dict, List, Optional

import datasets
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import (
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
)

from pie_datasets import GeneratorBasedBuilder
from pie_datasets.document.processing.text_span_trimmer import trim_text_spans

log = logging.getLogger(__name__)


def dl2ld(dict_of_lists):
    """Convert a dict of equally long lists into a list of dicts.

    Each resulting dict holds, for one position, the value of every key.
    Because the rows are produced with ``zip``, the result is as long as
    the shortest list.
    """
    keys = tuple(dict_of_lists)
    rows = zip(*dict_of_lists.values())
    return [dict(zip(keys, row)) for row in rows]


def ld2dl(list_of_dicts, keys: Optional[List[str]] = None):
    """Convert a list of dicts into a dict of lists (the inverse of ``dl2ld``).

    Args:
        list_of_dicts: Iterable of dicts that each contain the requested keys.
        keys: Keys to collect. If ``None``, the keys of the first dict are
            used; an empty input then yields an empty dict.

    Returns:
        A dict mapping each key to the list of its values, in input order.

    Raises:
        KeyError: If a requested key is missing from one of the dicts.
    """
    # Materialize once: the input is traversed once per key, which would
    # silently exhaust a one-shot iterator after the first key.
    rows = list(list_of_dicts)
    if keys is None:
        # The signature advertises ``keys=None``; iterating ``None`` directly
        # would raise a TypeError, so fall back to the first row's keys.
        keys = list(rows[0]) if rows else []
    return {key: [row[key] for row in rows] for key in keys}


# NOTE(review): pie_datasets.document.types also appears to provide an
# ``Attribute`` annotation type — confirm whether it could be reused here.
@dataclasses.dataclass(eq=True, frozen=True)
class Attribute(Annotation):
    """Annotation that attaches a string value to another annotation."""

    # the attribute's content (e.g. a URL string in the CDCP document)
    value: str
    # the annotation this attribute belongs to
    annotation: Annotation


@dataclasses.dataclass
class CDCPDocument(TextBasedDocument):
    """Text document type for CDCP with propositions, relations and URLs."""

    # labeled spans defined over the document text
    propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
    # binary relations whose arguments are entries of the propositions layer
    relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
    # URL attributes attached to entries of the propositions layer
    urls: AnnotationList[Attribute] = annotation_field(target="propositions")


def example_to_document(
    example: Dict[str, Any],
    relation_label: datasets.ClassLabel,
    proposition_label: datasets.ClassLabel,
):
    """Build a ``CDCPDocument`` from one example of the base dataset.

    Args:
        example: Dict with the keys ``id``, ``text``, ``propositions`` and
            ``relations``; the latter two are dicts of lists.
        relation_label: Class label used to map relation label ids to strings.
        proposition_label: Class label used to map proposition label ids to
            strings.

    Returns:
        The document with its ``propositions``, ``urls`` and ``relations``
        layers filled.
    """
    doc = CDCPDocument(id=example["id"], text=example["text"])

    for prop in dl2ld(example["propositions"]):
        span = LabeledSpan(
            start=prop["start"],
            end=prop["end"],
            label=proposition_label.int2str(prop["label"]),
        )
        doc.propositions.append(span)
        # An empty (or missing) url means the proposition has none.
        url_value = prop.get("url", "")
        if url_value != "":
            doc.urls.append(Attribute(annotation=span, value=url_value))

    for rel in dl2ld(example["relations"]):
        # head / tail are positions in the propositions layer
        head_span = doc.propositions[rel["head"]]
        tail_span = doc.propositions[rel["tail"]]
        doc.relations.append(
            BinaryRelation(
                head=head_span,
                tail=tail_span,
                label=relation_label.int2str(rel["label"]),
            )
        )

    return doc


def document_to_example(
    document: CDCPDocument,
    relation_label: datasets.ClassLabel,
    proposition_label: datasets.ClassLabel,
) -> Dict[str, Any]:
    """Convert a ``CDCPDocument`` back into the base dataset example format.

    Args:
        document: The document to serialize.
        relation_label: Class label used to map relation label strings to ids.
        proposition_label: Class label used to map proposition label strings
            to ids.

    Returns:
        Dict with ``id``, ``text``, ``propositions`` (dict of lists with
        ``start``, ``end``, ``label``, ``url``) and ``relations`` (dict of
        lists with ``head``, ``tail``, ``label``). ``url`` is the empty string
        for propositions without a URL attribute.

    Raises:
        KeyError: If a URL attribute or a relation refers to an annotation
            that is not part of ``document.propositions``.
    """
    result: Dict[str, Any] = {"id": document.id, "text": document.text}

    # Keep the per-proposition dicts in a positional list rather than a dict
    # keyed by the proposition: propositions that compare equal would
    # otherwise collapse into one entry, and the relation indices below would
    # no longer match the emitted list.
    proposition_dicts = []
    proposition2idx = {}
    for idx, proposition in enumerate(document.propositions):
        proposition_dicts.append(
            {
                "start": proposition.start,
                "end": proposition.end,
                "label": proposition_label.str2int(proposition.label),
                "url": "",
            }
        )
        proposition2idx[proposition] = idx

    for url in document.urls:
        proposition_dicts[proposition2idx[url.annotation]]["url"] = url.value

    result["propositions"] = ld2dl(
        proposition_dicts, keys=["start", "end", "label", "url"]
    )

    relations = [
        {
            "head": proposition2idx[relation.head],
            "tail": proposition2idx[relation.tail],
            "label": relation_label.str2int(relation.label),
        }
        for relation in document.relations
    ]
    result["relations"] = ld2dl(relations, keys=["head", "tail", "label"])

    return result


def convert_to_text_document_with_labeled_spans_and_binary_relations(
    document: CDCPDocument,
    verbose: bool = True,
) -> TextDocumentWithLabeledSpansAndBinaryRelations:
    """Convert a ``CDCPDocument`` to the generic span/relation document type.

    The ``propositions`` layer is mapped to ``labeled_spans`` and the
    ``relations`` layer to ``binary_relations``; the ``urls`` layer is not
    part of the field mapping. The ``labeled_spans`` layer is then passed
    through ``trim_text_spans`` (presumably this strips surrounding
    whitespace from the spans — confirm against its implementation).

    Args:
        document: The document to convert.
        verbose: Forwarded to ``trim_text_spans``.

    Returns:
        The converted document as returned by ``trim_text_spans``.
    """
    layer_renames = {"propositions": "labeled_spans", "relations": "binary_relations"}
    converted = document.as_type(
        TextDocumentWithLabeledSpansAndBinaryRelations,
        field_mapping=layer_renames,
    )
    return trim_text_spans(converted, layer="labeled_spans", verbose=verbose)


class CDCP(GeneratorBasedBuilder):
    """PIE dataset builder wrapping the ``DFKI-SLT/cdcp`` base dataset."""

    DOCUMENT_TYPE = CDCPDocument

    # target document type -> converter function
    DOCUMENT_CONVERTERS = {
        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
    }

    BASE_DATASET_PATH = "DFKI-SLT/cdcp"

    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]

    DEFAULT_CONFIG_NAME = "default"  # type: ignore

    def _generate_document_kwargs(self, dataset):
        """Collect the label features that ``_generate_document`` needs."""
        features = dataset.features
        relation_label = features["relations"].feature["label"]
        proposition_label = features["propositions"].feature["label"]
        return {
            "relation_label": relation_label,
            "proposition_label": proposition_label,
        }

    def _generate_document(self, example, relation_label, proposition_label):
        """Turn one base dataset example into a ``CDCPDocument``."""
        document = example_to_document(
            example,
            relation_label=relation_label,
            proposition_label=proposition_label,
        )
        return document
1 change: 1 addition & 0 deletions dataset_builders/pie/cdcp/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pie-datasets>=0.3.0
12 changes: 11 additions & 1 deletion src/pie_datasets/document/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument
from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument


@dataclasses.dataclass(eq=True, frozen=True)
Expand All @@ -28,3 +28,13 @@ class BratDocumentWithMergedSpans(TextBasedDocument):
relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
span_attributes: AnnotationList[Attribute] = annotation_field(target="spans")
relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations")


@dataclasses.dataclass
class TokenDocumentWithLabeledSpans(TokenBasedDocument):
    """Token-based document with one layer of labeled spans."""

    # labeled spans defined over the document's tokens
    labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")


@dataclasses.dataclass
class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
    """Token-based document with labeled spans and binary relations."""

    # binary relations whose arguments are entries of the labeled_spans layer
    binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")
Empty file.
Loading

0 comments on commit fd0c954

Please sign in to comment.