From c4f14b80ac068959bef6cfbeaa1c693c7a88b693 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Fri, 1 Nov 2024 17:06:20 +0100 Subject: [PATCH 1/3] test label counts for labeled_partitions in sciarg --- tests/dataset_builders/pie/sciarg/test_sciarg.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/dataset_builders/pie/sciarg/test_sciarg.py b/tests/dataset_builders/pie/sciarg/test_sciarg.py index 48104f08..d994023e 100644 --- a/tests/dataset_builders/pie/sciarg/test_sciarg.py +++ b/tests/dataset_builders/pie/sciarg/test_sciarg.py @@ -50,10 +50,12 @@ "supports": 5789, }, "spans": {"background_claim": 3291, "data": 4297, "own_claim": 6004}, + "labeled_partitions": {"Abstract": 39, "H1": 340, "Title": 40}, }, "resolve_parts_of_same": { "relations": {"contradicts": 696, "semantically_same": 44, "supports": 5788}, "spans": {"background_claim": 2752, "data": 4093, "own_claim": 5450}, + "labeled_partitions": {"Abstract": 39, "H1": 340, "Title": 40}, }, } @@ -257,7 +259,7 @@ def converted_dataset(dataset, target_document_type) -> Optional[DatasetDict]: return dataset.to_document_type(target_document_type) -def test_converted_datasets(converted_dataset, dataset_variant): +def test_converted_datasets(converted_dataset, dataset_variant, target_document_type): if converted_dataset is not None: split_sizes = {name: len(ds) for name, ds in converted_dataset.items()} assert split_sizes == SPLIT_SIZES @@ -280,9 +282,13 @@ def test_converted_datasets(converted_dataset, dataset_variant): if TEST_FULL_DATASET: expected_label_counts = { - layer_name_mapping[ln]: value + layer_name_mapping.get(ln, ln): value for ln, value in FULL_LABEL_COUNTS[dataset_variant].items() } + if not issubclass(target_document_type, TextDocumentWithLabeledPartitions): + expected_label_counts = { + k: v for k, v in expected_label_counts.items() if k != "labeled_partitions" + } assert_dataset_label_counts(converted_dataset, expected_label_counts) From 
99505bd5f0ab3f0218927d5ca6398f943e06bcf2 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Fri, 1 Nov 2024 17:54:04 +0100 Subject: [PATCH 2/3] fix and simplify tests --- .../pie/sciarg/test_sciarg.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/tests/dataset_builders/pie/sciarg/test_sciarg.py b/tests/dataset_builders/pie/sciarg/test_sciarg.py index d994023e..1010368b 100644 --- a/tests/dataset_builders/pie/sciarg/test_sciarg.py +++ b/tests/dataset_builders/pie/sciarg/test_sciarg.py @@ -50,14 +50,31 @@ "supports": 5789, }, "spans": {"background_claim": 3291, "data": 4297, "own_claim": 6004}, - "labeled_partitions": {"Abstract": 39, "H1": 340, "Title": 40}, }, "resolve_parts_of_same": { "relations": {"contradicts": 696, "semantically_same": 44, "supports": 5788}, "spans": {"background_claim": 2752, "data": 4093, "own_claim": 5450}, - "labeled_partitions": {"Abstract": 39, "H1": 340, "Title": 40}, }, } +CONVERTED_LAYER_MAPPING = { + "default": { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + "resolve_parts_of_same": { + "spans": "labeled_multi_spans", + "relations": "binary_relations", + }, +} +FULL_LABEL_COUNTS_CONVERTED = { + variant: {CONVERTED_LAYER_MAPPING[variant][ln]: value for ln, value in counts.items()} + for variant, counts in FULL_LABEL_COUNTS.items() +} +LABELED_PARTITION_COUNTS = { + "Abstract": 39, + "H1": 340, + "Title": 40, +} def resolve_annotation(annotation: Annotation) -> Any: @@ -265,30 +282,19 @@ def test_converted_datasets(converted_dataset, dataset_variant, target_document_ assert split_sizes == SPLIT_SIZES if dataset_variant == "default": expected_document_type = TextDocumentWithLabeledSpansAndBinaryRelations - layer_name_mapping = { - "spans": "labeled_spans", - "relations": "binary_relations", - } elif dataset_variant == "resolve_parts_of_same": expected_document_type = TextDocumentWithLabeledMultiSpansAndBinaryRelations - layer_name_mapping = { - "spans": 
"labeled_multi_spans", - "relations": "binary_relations", - } else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") + assert issubclass(converted_dataset.document_type, expected_document_type) assert isinstance(converted_dataset["train"][0], expected_document_type) if TEST_FULL_DATASET: - expected_label_counts = { - layer_name_mapping.get(ln, ln): value - for ln, value in FULL_LABEL_COUNTS[dataset_variant].items() - } - if not issubclass(target_document_type, TextDocumentWithLabeledPartitions): - expected_label_counts = { - k: v for k, v in expected_label_counts.items() if k != "labeled_partitions" - } + # copy to avoid modifying the original dict + expected_label_counts = {**FULL_LABEL_COUNTS_CONVERTED[dataset_variant]} + if issubclass(target_document_type, TextDocumentWithLabeledPartitions): + expected_label_counts["labeled_partitions"] = LABELED_PARTITION_COUNTS assert_dataset_label_counts(converted_dataset, expected_label_counts) From 3086c6a1c315f428fe96adadea18b066d06e847c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Fri, 1 Nov 2024 18:41:05 +0100 Subject: [PATCH 3/3] fix allowing newlines between matching tags (important for abstract) --- dataset_builders/pie/sciarg/sciarg.py | 3 ++- tests/dataset_builders/pie/sciarg/test_sciarg.py | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/dataset_builders/pie/sciarg/sciarg.py b/dataset_builders/pie/sciarg/sciarg.py index dc0e2a7d..002a8fff 100644 --- a/dataset_builders/pie/sciarg/sciarg.py +++ b/dataset_builders/pie/sciarg/sciarg.py @@ -123,7 +123,8 @@ def _generate_document(self, example, **kwargs): def document_converters(self) -> DocumentConvertersType: regex_partitioner = RegexPartitioner( partition_layer_name="labeled_partitions", - pattern="<([^>/]+)>.*</\\1>", + # find matching tags, allow newlines in between (s flag) and capture the tag name + pattern="<([^>/]+)>(?s:.)*?</\\1>", label_group_id=1, label_whitelist=["Title", "Abstract", "H1"], skip_initial_partition=True, diff 
--git a/tests/dataset_builders/pie/sciarg/test_sciarg.py b/tests/dataset_builders/pie/sciarg/test_sciarg.py index 1010368b..3a3ad8e0 100644 --- a/tests/dataset_builders/pie/sciarg/test_sciarg.py +++ b/tests/dataset_builders/pie/sciarg/test_sciarg.py @@ -70,11 +70,7 @@ variant: {CONVERTED_LAYER_MAPPING[variant][ln]: value for ln, value in counts.items()} for variant, counts in FULL_LABEL_COUNTS.items() } -LABELED_PARTITION_COUNTS = { - "Abstract": 39, - "H1": 340, - "Title": 40, -} +LABELED_PARTITION_COUNTS = {"Abstract": 40, "H1": 340, "Title": 40} def resolve_annotation(annotation: Annotation) -> Any: