From f55f5b24a1a9db3b317db0e9ec1dd2db92334736 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 11:57:36 +0100 Subject: [PATCH 01/13] Adding method to the ParserOutput object to get row wise json. --- src/cpr_sdk/parser_models.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index b14bf63..5a069a6 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -3,7 +3,7 @@ from collections import Counter from datetime import date from enum import Enum -from typing import List, Optional, Sequence, Tuple, TypeVar, Union +from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any from cpr_sdk.pipeline_general_models import ( CONTENT_TYPE_HTML, @@ -373,3 +373,28 @@ def from_flat_json(data: dict): unflattened = remove_key_if_all_nested_vals_none(unflattened, "pdf_data") return ParserOutput.model_validate(unflattened) + + def to_passage_level_json(self) -> list[dict[str, Any]]: + """ + Convert the parser output to a passage-level JSON format. + + In passage-level format we have a row for every text block in the document. This + is as for natural language processing tasks we often want to work with text at + the passage level. + """ + if self.text_blocks is None: + return [] + + common_fields_dict = self.model_dump( + exclude={ + "pdf_data": {"text_blocks", "page_metadata"}, + "html_data": {"text_blocks"}, + } + ) + + return [ + common_fields_dict + | block.model_dump(exclude={"text"}) + | {"text": block.to_string(), "block_index": idx} + for idx, block in enumerate(self.text_blocks) + ] From aa2bb0461929625863eeb804bf609966feb24648 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 12:00:06 +0100 Subject: [PATCH 02/13] Bumping the version. --- src/cpr_sdk/version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpr_sdk/version.py b/src/cpr_sdk/version.py index cc9e155..a3ef986 100644 --- a/src/cpr_sdk/version.py +++ b/src/cpr_sdk/version.py @@ -1,6 +1,6 @@ _MAJOR = "1" -_MINOR = "0" -_PATCH = "2" +_MINOR = "1" +_PATCH = "0" _SUFFIX = "" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) From 2e14cf9a18ccc9ffa04c7f4dd6bc30c4ae8d936f Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 15:31:06 +0100 Subject: [PATCH 03/13] Adding unit test. 
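
The test below exercises the new ParserOutput.to_passage_level_json
method end to end. Roughly, the intended call pattern looks like the
following sketch, where parser_output_json stands in for any dict that
validates as a ParserOutput (e.g. one of the test fixtures):

    from cpr_sdk.parser_models import ParserOutput

    # parser_output_json is assumed to be a valid ParserOutput payload,
    # e.g. loaded from a test fixture or a parser output file.
    parser_output = ParserOutput.model_validate(parser_output_json)
    passages = parser_output.to_passage_level_json()

    # One dict per text block: the document-level fields, the block's own
    # fields, the block text flattened to a string, and a "block_index".
    for passage in passages:
        print(passage["block_index"], passage["text"][:80])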
--- tests/test_parser_models.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index a098794..41cf291 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -1,15 +1,13 @@ import pydantic import pytest + from cpr_sdk.parser_models import ( ParserInput, ParserOutput, PDFTextBlock, VerticalFlipError, ) -from cpr_sdk.pipeline_general_models import ( - CONTENT_TYPE_HTML, - CONTENT_TYPE_PDF, -) +from cpr_sdk.pipeline_general_models import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF def test_parser_input_object(parser_output_json_pdf) -> None: @@ -150,3 +148,22 @@ def test_parser_output_object( with pytest.raises(pydantic.ValidationError) as context: ParserOutput.model_validate(parser_output_json_flat) parser_output = ParserOutput.from_flat_json(parser_output_json_flat) + + +def test_to_passage_level_json_method( + parser_output_json_pdf: dict, + parser_output_json_html: dict, +) -> None: + """Test that we can successfully create a passage level array from the text blocks.""" + for parser_output_json in [parser_output_json_pdf, parser_output_json_html]: + parser_output = ParserOutput.model_validate(parser_output_json) + passage_level_array = parser_output.to_passage_level_json() + assert isinstance(passage_level_array, list) + assert len(passage_level_array) > 0 + assert len(passage_level_array) == len(parser_output.text_blocks) + assert all(isinstance(passage, dict) for passage in passage_level_array) + # TODO Check that all the keys are correct + first_doc_keys = set(passage_level_array[0].keys()) + assert all( + set(passage.keys()) == first_doc_keys for passage in passage_level_array + ) From d0e66754cb2bb19906eaaa801d048ba79c55e2f7 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 20:04:13 +0100 Subject: [PATCH 04/13] Updating the tests to check for the correct keys in each row. --- tests/test_parser_models.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 41cf291..18489cb 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -6,6 +6,8 @@ ParserOutput, PDFTextBlock, VerticalFlipError, + HTMLTextBlock, + TextBlock ) from cpr_sdk.pipeline_general_models import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF @@ -158,12 +160,26 @@ def test_to_passage_level_json_method( for parser_output_json in [parser_output_json_pdf, parser_output_json_html]: parser_output = ParserOutput.model_validate(parser_output_json) passage_level_array = parser_output.to_passage_level_json() + assert isinstance(passage_level_array, list) assert len(passage_level_array) > 0 assert len(passage_level_array) == len(parser_output.text_blocks) assert all(isinstance(passage, dict) for passage in passage_level_array) - # TODO Check that all the keys are correct + first_doc_keys = set(passage_level_array[0].keys()) assert all( set(passage.keys()) == first_doc_keys for passage in passage_level_array ) + + expected_model_fields = set( + list(TextBlock.model_fields.keys()) + + list(HTMLTextBlock.model_fields.keys()) + + list(PDFTextBlock.model_fields.keys()) + + list(ParserOutput.model_fields.keys()) + + ["block_index"] + ) + + assert all( + set(passage.keys()) == expected_model_fields + for passage in passage_level_array + ) From b740ccea9a9102e36588981453b9dc2bc3af1de1 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 20:09:20 +0100 Subject: [PATCH 05/13] Refactoring. 
--- tests/test_parser_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 18489cb..315f91b 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -7,7 +7,7 @@ PDFTextBlock, VerticalFlipError, HTMLTextBlock, - TextBlock + TextBlock, ) from cpr_sdk.pipeline_general_models import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF From 359e3142ecf94ac2370553b1af307ae44f09f187 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 20:32:48 +0100 Subject: [PATCH 06/13] Updating to fill missing columns. --- src/cpr_sdk/parser_models.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index 5a069a6..bfbe078 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -4,6 +4,7 @@ from datetime import date from enum import Enum from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any +from pydantic_core.core_schema import PydanticUndefined from cpr_sdk.pipeline_general_models import ( CONTENT_TYPE_HTML, @@ -392,9 +393,33 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: } ) - return [ + passages_array = [ common_fields_dict | block.model_dump(exclude={"text"}) | {"text": block.to_string(), "block_index": idx} for idx, block in enumerate(self.text_blocks) ] + + # HTML data won't contain PDF fields and vice versa, thus we must fill this in. + # We could rely on the hugging face dataset transformation to fill in the + # missing fields, but this is more explicit and provides default values. + class BlockIndex: + default = None + + expected_model_fields = ( + TextBlock.model_fields + | HTMLTextBlock.model_fields + | PDFTextBlock.model_fields + | ParserOutput.model_fields + | {"block_index": BlockIndex} + ) + + passages_array_filled = [] + for passage in passages_array: + for key in expected_model_fields: + if key not in passage: + default = expected_model_fields[key].default + passage[key] = default if default != PydanticUndefined else None + passages_array_filled.append(passage) + + return passages_array From c36edc8061847cb6098207ca68b71d809fe88ab1 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 9 Apr 2024 20:36:47 +0100 Subject: [PATCH 07/13] Moving comment. --- src/cpr_sdk/parser_models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index bfbe078..5bdff0b 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -382,6 +382,10 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: In passage-level format we have a row for every text block in the document. This is as for natural language processing tasks we often want to work with text at the passage level. + + HTML data won't contain PDF fields and vice versa, thus we must fill this in. + We could rely on the hugging face dataset transformation to fill in the missing + fields, but this is more explicit and provides default values. """ if self.text_blocks is None: return [] @@ -400,9 +404,6 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: for idx, block in enumerate(self.text_blocks) ] - # HTML data won't contain PDF fields and vice versa, thus we must fill this in. - # We could rely on the hugging face dataset transformation to fill in the - # missing fields, but this is more explicit and provides default values. 
class BlockIndex: default = None From 8ba4125103ac56edf510c88e5b02a6d196502d9a Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 10 Apr 2024 08:48:46 +0100 Subject: [PATCH 08/13] Pre-commit fix. --- src/cpr_sdk/parser_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index 5bdff0b..f71a1f5 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -4,7 +4,7 @@ from datetime import date from enum import Enum from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any -from pydantic_core.core_schema import PydanticUndefined +from pydantic_core._pydantic_core import PydanticUndefined from cpr_sdk.pipeline_general_models import ( CONTENT_TYPE_HTML, From 0699cbff6a17b01da325a379f6d01ae0e2dd8bfc Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 10 Apr 2024 09:30:09 +0100 Subject: [PATCH 09/13] Updating to new method of filling missing fields. --- src/cpr_sdk/parser_models.py | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index f71a1f5..780be53 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -4,7 +4,6 @@ from datetime import date from enum import Enum from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any -from pydantic_core._pydantic_core import PydanticUndefined from cpr_sdk.pipeline_general_models import ( CONTENT_TYPE_HTML, @@ -404,23 +403,33 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: for idx, block in enumerate(self.text_blocks) ] - class BlockIndex: - default = None - - expected_model_fields = ( - TextBlock.model_fields - | HTMLTextBlock.model_fields - | PDFTextBlock.model_fields - | ParserOutput.model_fields - | {"block_index": BlockIndex} - ) + empty_html_text_block: dict[str, Any] = HTMLTextBlock.model_validate( + { + "text": [], + "text_block_id": "", + "type": BlockType.TEXT, + "type_confidence": 1.0, + } + ).model_dump() + empty_pdf_text_block: dict[str, Any] = PDFTextBlock.model_validate( + { + "text": [], + "text_block_id": "", + "type": BlockType.TEXT, + "type_confidence": 1.0, + "coords": [], + "page_number": 0, + } + ).model_dump() passages_array_filled = [] for passage in passages_array: - for key in expected_model_fields: + for key in empty_html_text_block.keys(): + if key not in passage: + passage[key] = empty_html_text_block[key] + for key in empty_pdf_text_block.keys(): if key not in passage: - default = expected_model_fields[key].default - passage[key] = default if default != PydanticUndefined else None + passage[key] = empty_pdf_text_block[key] passages_array_filled.append(passage) return passages_array From 292dfa934f7df5338013ff5f98b3f753bbea38af Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 10 Apr 2024 10:03:20 +0100 Subject: [PATCH 10/13] Updating tests. 
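
The PDF and HTML fixtures are now validated separately, and the final
assertion pins down that both layouts produce the same key set. The
property under test is roughly the following sketch, where
parser_output_json_pdf and parser_output_json_html are the pytest
fixtures used in this repo's tests:

    from cpr_sdk.parser_models import ParserOutput

    pdf_rows = ParserOutput.model_validate(
        parser_output_json_pdf
    ).to_passage_level_json()
    html_rows = ParserOutput.model_validate(
        parser_output_json_html
    ).to_passage_level_json()

    # Every row carries the union of document-, HTML- and PDF-level fields,
    # so rows from either source share one schema; fields that do not apply
    # (e.g. "coords" or "page_number" on HTML rows) are filled in from an
    # empty placeholder block.
    assert pdf_rows[0].keys() == html_rows[0].keys()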
--- tests/test_parser_models.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 315f91b..2474424 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -157,13 +157,16 @@ def test_to_passage_level_json_method( parser_output_json_html: dict, ) -> None: """Test that we can successfully create a passage level array from the text blocks.""" - for parser_output_json in [parser_output_json_pdf, parser_output_json_html]: - parser_output = ParserOutput.model_validate(parser_output_json) - passage_level_array = parser_output.to_passage_level_json() + parser_output_pdf = ParserOutput.model_validate(parser_output_json_pdf) + passage_level_array_pdf = parser_output_pdf.to_passage_level_json() - assert isinstance(passage_level_array, list) - assert len(passage_level_array) > 0 - assert len(passage_level_array) == len(parser_output.text_blocks) + parser_output_html = ParserOutput.model_validate(parser_output_json_html) + passage_level_array_html = parser_output_html.to_passage_level_json() + + assert len(passage_level_array_pdf) == len(parser_output_pdf.text_blocks) + assert len(passage_level_array_html) == len(parser_output_html.text_blocks) + + for passage_level_array in [passage_level_array_pdf, passage_level_array_html]: assert all(isinstance(passage, dict) for passage in passage_level_array) first_doc_keys = set(passage_level_array[0].keys()) @@ -183,3 +186,11 @@ def test_to_passage_level_json_method( set(passage.keys()) == expected_model_fields for passage in passage_level_array ) + + passage_level_array_pdf_first_doc = passage_level_array_pdf[0] + passage_level_array_html_first_doc = passage_level_array_html[0] + + assert ( + passage_level_array_pdf_first_doc.keys() + == passage_level_array_html_first_doc.keys() + ) From 7b7f9050cf6179f3e830aa044c2bf791016ec1ed Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 10 Apr 2024 10:06:33 +0100 Subject: [PATCH 11/13] Return the correct variable. --- src/cpr_sdk/parser_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index 780be53..cdd486f 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -432,4 +432,4 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: passage[key] = empty_pdf_text_block[key] passages_array_filled.append(passage) - return passages_array + return passages_array_filled From 913bbc76a0847c66f0b718e0d7f340242b2fb12f Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 10 Apr 2024 11:24:22 +0100 Subject: [PATCH 12/13] Updating how we process the data structures from the pydantic base model. --- src/cpr_sdk/parser_models.py | 60 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index cdd486f..c0f9616 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -3,6 +3,7 @@ from collections import Counter from datetime import date from enum import Enum +import json from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any from cpr_sdk.pipeline_general_models import ( @@ -385,42 +386,53 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: HTML data won't contain PDF fields and vice versa, thus we must fill this in. We could rely on the hugging face dataset transformation to fill in the missing fields, but this is more explicit and provides default values. 
+ + The reason we convert from the pydantic BaseModel to a string using the + model_dump_json method and then reloading with json.load is as objects like + Enums and child pydantic objects persist when using the model_dump method. + We don't want these when we push to huggingface. """ if self.text_blocks is None: return [] - common_fields_dict = self.model_dump( - exclude={ - "pdf_data": {"text_blocks", "page_metadata"}, - "html_data": {"text_blocks"}, - } + common_fields_dict = json.loads( + self.model_dump_json( + exclude={ + "pdf_data": {"text_blocks", "page_metadata"}, + "html_data": {"text_blocks"}, + } + ) ) passages_array = [ common_fields_dict - | block.model_dump(exclude={"text"}) + | json.loads(block.model_dump_json(exclude={"text"})) | {"text": block.to_string(), "block_index": idx} for idx, block in enumerate(self.text_blocks) ] - empty_html_text_block: dict[str, Any] = HTMLTextBlock.model_validate( - { - "text": [], - "text_block_id": "", - "type": BlockType.TEXT, - "type_confidence": 1.0, - } - ).model_dump() - empty_pdf_text_block: dict[str, Any] = PDFTextBlock.model_validate( - { - "text": [], - "text_block_id": "", - "type": BlockType.TEXT, - "type_confidence": 1.0, - "coords": [], - "page_number": 0, - } - ).model_dump() + empty_html_text_block: dict[str, Any] = json.loads( + HTMLTextBlock.model_validate( + { + "text": [], + "text_block_id": "", + "type": BlockType.TEXT, + "type_confidence": 1.0, + } + ).model_dump_json() + ) + empty_pdf_text_block: dict[str, Any] = json.loads( + PDFTextBlock.model_validate( + { + "text": [], + "text_block_id": "", + "type": BlockType.TEXT, + "type_confidence": 1.0, + "coords": [], + "page_number": 0, + } + ).model_dump_json() + ) passages_array_filled = [] for passage in passages_array: From f6395fd4cec74899d8daaa10ad3b9ace859efbf3 Mon Sep 17 00:00:00 2001 From: Mark Date: Thu, 11 Apr 2024 09:40:55 +0100 Subject: [PATCH 13/13] Updating to use None instead of default values. --- src/cpr_sdk/parser_models.py | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index c0f9616..08e9df7 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -411,37 +411,17 @@ def to_passage_level_json(self) -> list[dict[str, Any]]: for idx, block in enumerate(self.text_blocks) ] - empty_html_text_block: dict[str, Any] = json.loads( - HTMLTextBlock.model_validate( - { - "text": [], - "text_block_id": "", - "type": BlockType.TEXT, - "type_confidence": 1.0, - } - ).model_dump_json() - ) - empty_pdf_text_block: dict[str, Any] = json.loads( - PDFTextBlock.model_validate( - { - "text": [], - "text_block_id": "", - "type": BlockType.TEXT, - "type_confidence": 1.0, - "coords": [], - "page_number": 0, - } - ).model_dump_json() - ) + empty_html_text_block_keys: list[str] = list(HTMLTextBlock.model_fields.keys()) + empty_pdf_text_block_keys: list[str] = list(PDFTextBlock.model_fields.keys()) passages_array_filled = [] for passage in passages_array: - for key in empty_html_text_block.keys(): + for key in empty_html_text_block_keys: if key not in passage: - passage[key] = empty_html_text_block[key] - for key in empty_pdf_text_block.keys(): + passage[key] = None + for key in empty_pdf_text_block_keys: if key not in passage: - passage[key] = empty_pdf_text_block[key] + passage[key] = None passages_array_filled.append(passage) return passages_array_filled
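
Note on patch 12: the round trip through model_dump_json and json.loads
is there because plain model_dump keeps Enum members and nested pydantic
models in the output, which we do not want in rows pushed to Hugging
Face. A minimal standalone pydantic v2 sketch of the difference (not
code from this repo):

    import json
    from enum import Enum

    from pydantic import BaseModel

    class Kind(str, Enum):
        TEXT = "Text"

    class Block(BaseModel):
        kind: Kind

    block = Block(kind=Kind.TEXT)

    # model_dump keeps the Enum member itself in the dict...
    print(block.model_dump())                   # {'kind': <Kind.TEXT: 'Text'>}

    # ...whereas dumping to JSON and reloading leaves only built-in types.
    print(json.loads(block.model_dump_json()))  # {'kind': 'Text'}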