Merge pull request #15 from ArneBinder/add_dataset_cdcp

Add AM dataset - CDCP
ArneBinder · Nov 9, 2023 · fd0c954 · fd0c954
2 parents d887d18 + 4f74a05
commit fd0c954
Show file tree

Hide file tree

Showing 8 changed files with 626 additions and 4 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -80,9 +80,8 @@ repos:
       - id: codespell
         args:
           - --skip=logs/**,data/**,tests/fixtures/**
-          # hist: required for plotext.hist()
-          # ba: denotes beginning of an encoding with label as 'a'. More details at src/pie_utils/sequence_tagging/ill_formed.py
-          - --ignore-words-list=hist,ba
+          # arbitral: this is a legal term and used in example data (cdcp dataset)
+          - --ignore-words-list=arbitral
 
   # python static type checking
   - repo: https://github.com/pre-commit/mirrors-mypy

diff --git a/dataset_builders/pie/cdcp/README.md b/dataset_builders/pie/cdcp/README.md
@@ -0,0 +1,29 @@
+# PIE Dataset Card for "CDCP"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[CDCP Hugging Face dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).
+
+## Data Schema
+
+The document type for this dataset is `CDCPDocument` which defines the following data fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, optional)
+
+and the following annotation layers:
+
+- `propositions` (annotation type: `LabeledSpan`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
+- `urls` (annotation type: `Attribute`, target: `propositions`)
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the definitions of `LabeledSpan` and `BinaryRelation`; `Attribute` is defined in this dataset's builder script (`cdcp.py`).
+
+## Document Converters
+
+The dataset provides document converters for the following target document types:
+
+- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/cdcp/cdcp.py b/dataset_builders/pie/cdcp/cdcp.py
@@ -0,0 +1,142 @@
+import dataclasses
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import datasets
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan
+from pytorch_ie.core import Annotation, AnnotationList, annotation_field
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
+
+from pie_datasets import GeneratorBasedBuilder
+from pie_datasets.document.processing.text_span_trimmer import trim_text_spans
+
+# Module-level logger named after this module; nothing else in this file's hunk references it.
+log = logging.getLogger(__name__)
+
+
+def dl2ld(dict_of_lists):
+    """Convert a dict of lists into a list of dicts.
+
+    Entry ``i`` of the result maps every key of ``dict_of_lists`` to the
+    ``i``-th element of that key's list. The columns are combined with
+    ``zip``, so the result is only as long as the shortest list.
+    """
+    field_names = list(dict_of_lists)
+    rows = zip(*dict_of_lists.values())
+    return [dict(zip(field_names, row)) for row in rows]
+
+
+def ld2dl(list_of_dicts, keys: Optional[List[str]] = None) -> Dict[str, List[Any]]:
+    """Convert an iterable of dicts into a dict of lists.
+
+    Args:
+        list_of_dicts: Iterable of dicts that all contain the requested keys.
+        keys: Keys to collect. If ``None``, the keys of the first dict are
+            used; an empty input then yields an empty dict.
+
+    Returns:
+        A dict mapping each key to the list of its values, in input order.
+
+    Raises:
+        KeyError: If one of the dicts lacks a requested key.
+    """
+    # Materialize once: the input is traversed once per key, which would
+    # exhaust a one-shot iterable after the first key.
+    rows = list(list_of_dicts)
+    if keys is None:
+        # Without this fallback the declared default raised a TypeError,
+        # because ``None`` is not iterable.
+        keys = list(rows[0]) if rows else []
+    return {k: [d[k] for d in rows] for k in keys}
+
+
+@dataclasses.dataclass(frozen=True)
+class Attribute(Annotation):
+    """Annotation that attaches a string value to another annotation."""
+
+    # the attribute's content (this file stores a proposition's URL string here)
+    value: str
+    # the annotation that this attribute is attached to
+    annotation: Annotation
+
+
+@dataclasses.dataclass
+class CDCPDocument(TextBasedDocument):
+    """Text-based document type for CDCP with three annotation layers."""
+
+    # labeled spans that target the document text
+    propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
+    # labeled binary relations that target the `propositions` layer
+    relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
+    # URL attributes that target the `propositions` layer
+    urls: AnnotationList[Attribute] = annotation_field(target="propositions")
+
+
+def example_to_document(
+    example: Dict[str, Any],
+    relation_label: datasets.ClassLabel,
+    proposition_label: datasets.ClassLabel,
+):
+    """Build a ``CDCPDocument`` from one example of the base dataset.
+
+    Args:
+        example: Dict with the entries ``"id"``, ``"text"``, ``"propositions"``
+            (dict of lists with ``"start"``, ``"end"``, ``"label"`` and
+            optionally ``"url"``) and ``"relations"`` (dict of lists with
+            ``"head"``, ``"tail"``, ``"label"``).
+        relation_label: Class label used to turn relation label ids into strings.
+        proposition_label: Class label used to turn proposition label ids into
+            strings.
+
+    Returns:
+        The document with its ``propositions``, ``urls`` and ``relations``
+        layers filled.
+    """
+    doc = CDCPDocument(id=example["id"], text=example["text"])
+
+    for prop_entry in dl2ld(example["propositions"]):
+        span = LabeledSpan(
+            start=prop_entry["start"],
+            end=prop_entry["end"],
+            label=proposition_label.int2str(prop_entry["label"]),
+        )
+        doc.propositions.append(span)
+        # An absent or empty url means that there is nothing to attach.
+        if prop_entry.get("url", "") == "":
+            continue
+        doc.urls.append(Attribute(annotation=span, value=prop_entry["url"]))
+
+    for rel_entry in dl2ld(example["relations"]):
+        # head / tail are positions within the propositions layer built above
+        head_span = doc.propositions[rel_entry["head"]]
+        tail_span = doc.propositions[rel_entry["tail"]]
+        doc.relations.append(
+            BinaryRelation(
+                head=head_span,
+                tail=tail_span,
+                label=relation_label.int2str(rel_entry["label"]),
+            )
+        )
+
+    return doc
+
+
+def document_to_example(
+    document: CDCPDocument,
+    relation_label: datasets.ClassLabel,
+    proposition_label: datasets.ClassLabel,
+) -> Dict[str, Any]:
+    """Convert a ``CDCPDocument`` into the dict-of-lists example format.
+
+    Args:
+        document: The document to serialize.
+        relation_label: Class label used to turn relation label strings into ids.
+        proposition_label: Class label used to turn proposition label strings
+            into ids.
+
+    Returns:
+        A dict with ``"id"``, ``"text"``, ``"propositions"`` (lists for
+        ``"start"``, ``"end"``, ``"label"``, ``"url"``) and ``"relations"``
+        (lists for ``"head"``, ``"tail"``, ``"label"``). Propositions without
+        a URL attribute get the empty string as ``"url"``.
+
+    Raises:
+        KeyError: If a URL attribute or a relation refers to an annotation
+            that is not part of ``document.propositions``.
+    """
+    result: Dict[str, Any] = {"id": document.id, "text": document.text}
+
+    # Keep one record per proposition in a positional list. A dict keyed by the
+    # annotation would merge propositions that compare equal, so the emitted
+    # lists could become shorter than the indices that relations refer to.
+    proposition_dicts = []
+    proposition2idx = {}
+    for idx, proposition in enumerate(document.propositions):
+        proposition_dicts.append(
+            {
+                "start": proposition.start,
+                "end": proposition.end,
+                "label": proposition_label.str2int(proposition.label),
+                "url": "",
+            }
+        )
+        proposition2idx[proposition] = idx
+    for url in document.urls:
+        proposition_dicts[proposition2idx[url.annotation]]["url"] = url.value
+
+    result["propositions"] = ld2dl(
+        proposition_dicts, keys=["start", "end", "label", "url"]
+    )
+
+    relations = [
+        {
+            "head": proposition2idx[relation.head],
+            "tail": proposition2idx[relation.tail],
+            "label": relation_label.str2int(relation.label),
+        }
+        for relation in document.relations
+    ]
+    result["relations"] = ld2dl(relations, keys=["head", "tail", "label"])
+
+    return result
+
+
+def convert_to_text_document_with_labeled_spans_and_binary_relations(
+    document: CDCPDocument,
+    verbose: bool = True,
+) -> TextDocumentWithLabeledSpansAndBinaryRelations:
+    """Convert a ``CDCPDocument`` to the generic span-and-relation document type.
+
+    The ``propositions`` layer is mapped to ``labeled_spans`` and ``relations``
+    to ``binary_relations``. The converted document is then passed through
+    ``trim_text_spans`` for the ``labeled_spans`` layer.
+
+    Args:
+        document: The document to convert.
+        verbose: Forwarded unchanged to ``trim_text_spans``.
+
+    Returns:
+        The value returned by ``trim_text_spans``.
+    """
+    layer_renaming = {"propositions": "labeled_spans", "relations": "binary_relations"}
+    converted = document.as_type(
+        TextDocumentWithLabeledSpansAndBinaryRelations,
+        field_mapping=layer_renaming,
+    )
+    return trim_text_spans(converted, layer="labeled_spans", verbose=verbose)
+
+
+class CDCP(GeneratorBasedBuilder):
+    """Dataset builder that wraps the ``DFKI-SLT/cdcp`` base dataset.
+
+    Documents are of type ``CDCPDocument``; one converter to
+    ``TextDocumentWithLabeledSpansAndBinaryRelations`` is registered.
+    """
+
+    DOCUMENT_TYPE = CDCPDocument
+
+    DOCUMENT_CONVERTERS = {
+        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
+    }
+
+    BASE_DATASET_PATH = "DFKI-SLT/cdcp"
+
+    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]
+
+    DEFAULT_CONFIG_NAME = "default"  # type: ignore
+
+    def _generate_document_kwargs(self, dataset):
+        """Return the label features that ``_generate_document`` needs.
+
+        Both values are read from ``dataset.features``: the ``"label"`` feature
+        of the ``"relations"`` entry and of the ``"propositions"`` entry.
+        """
+        features = dataset.features
+        relation_label = features["relations"].feature["label"]
+        proposition_label = features["propositions"].feature["label"]
+        return {
+            "relation_label": relation_label,
+            "proposition_label": proposition_label,
+        }
+
+    def _generate_document(self, example, relation_label, proposition_label):
+        """Convert one example of the base dataset via ``example_to_document``."""
+        label_features = {
+            "relation_label": relation_label,
+            "proposition_label": proposition_label,
+        }
+        return example_to_document(example, **label_features)
diff --git a/dataset_builders/pie/cdcp/requirements.txt b/dataset_builders/pie/cdcp/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.3.0
diff --git a/src/pie_datasets/document/types.py b/src/pie_datasets/document/types.py
@@ -3,7 +3,7 @@
 
 from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
 from pytorch_ie.core import Annotation, AnnotationList, annotation_field
-from pytorch_ie.documents import TextBasedDocument
+from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
 
 
 @dataclasses.dataclass(eq=True, frozen=True)
@@ -28,3 +28,13 @@ class BratDocumentWithMergedSpans(TextBasedDocument):
     relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
     span_attributes: AnnotationList[Attribute] = annotation_field(target="spans")
     relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations")
+
+
+@dataclasses.dataclass
+class TokenDocumentWithLabeledSpans(TokenBasedDocument):
+    """Token-based document with a single layer of labeled spans."""
+
+    # labeled spans that target the `tokens` field
+    labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
+
+
+@dataclasses.dataclass
+class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
+    """Token-based document with labeled spans and binary relations between them."""
+
+    # binary relations that target the inherited `labeled_spans` layer
+    binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")
diff --git a/tests/dataset_builders/pie/__init__.py b/tests/dataset_builders/pie/__init__.py