From e4bb64e4172075066fb51cf7e3db3d7d9e843cf0 Mon Sep 17 00:00:00 2001 From: Conner Panarella Date: Tue, 10 Dec 2024 11:24:40 -0500 Subject: [PATCH] feat: Configurable null character sanitization (#434) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Null characters are currently passed as-is to Postgres despite being unsupported. If it is encountered, it causes the sink to fail as noted here: https://github.com/MeltanoLabs/target-postgres/issues/60 with an error like `ValueError: A string literal cannot contain NUL (0x00) characters.` This PR introduces a new option called `sanitize_null_text_characters` which enables sanitization of these characters. --------- Co-authored-by: Edgar Ramírez Mondragón <16805946+edgarrmondragon@users.noreply.github.com> Co-authored-by: Edgar Ramírez-Mondragón --- README.md | 55 ++++++++++++++++++------------------ target_postgres/connector.py | 9 ++++++ target_postgres/sinks.py | 35 +++++++++++++++++++++-- target_postgres/target.py | 11 ++++++++ 4 files changed, 81 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 9e6ab68..ec15649 100644 --- a/README.md +++ b/README.md @@ -24,33 +24,34 @@ This target is tested with all actively supported [Python](https://devguide.pyth ## Settings -| Setting | Required | Default | Description | -| :------------------------------ | :------- | :--------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| host | False | None | Hostname for postgres instance. | -| port | False | 5432 | The port on which postgres is awaiting connections. | -| user | False | None | User name used to authenticate. | -| password | False | None | Password used to authenticate. | -| database | False | None | Database name. 
| -| use_copy | False | None | Use the COPY command to insert data. This is usually faster than INSERT statements. This option is only available for the postgres+psycopg dialect+driver combination. | -| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | -| activate_version | False | 1 | If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. | -| hard_delete | False | 0 | When activate version is sent from a tap this specefies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false. | -| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work!This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | -| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | -| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. | -| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. 
| -| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. | -| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | -| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | -| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | -| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. 
| -| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | -| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details | -| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | -| ssh_tunnel.username | False | None | Username to connect to bastion host | -| ssh_tunnel.port | False | 22 | Port to connect to bastion host | -| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | -| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | +| Setting | Required | Default | Description | +| :------------------------------ | :------- | :--------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| host | False | None | Hostname for postgres instance. | +| port | False | 5432 | The port on which postgres is awaiting connections. | +| user | False | None | User name used to authenticate. | +| password | False | None | Password used to authenticate. | +| database | False | None | Database name. | +| use_copy | False | None | Use the COPY command to insert data. This is usually faster than INSERT statements. This option is only available for the postgres+psycopg dialect+driver combination. | +| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | +| activate_version | False | 1 | If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. 
| +| hard_delete | False | 0 | When activate version is sent from a tap this specifies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false. | +| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work! This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | +| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | +| sanitize_null_text_characters | False | 0 | If set to true, the target will sanitize null characters in char/text/varchar fields, as they are not supported by Postgres. See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) for more information about chr(0) not being supported. | +| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. | +| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. | +| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. 
| +| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. 
| +| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | +| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details | +| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | +| ssh_tunnel.username | False | None | Username to connect to bastion host | +| ssh_tunnel.port | False | 22 | Port to connect to bastion host | +| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | +| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | A full list of supported settings and capabilities is available by running: `target-postgres --about` diff --git a/target_postgres/connector.py b/target_postgres/connector.py index f627876..540e262 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -112,6 +112,15 @@ def interpret_content_encoding(self) -> bool: """ return self.config.get("interpret_content_encoding", False) + @cached_property + def sanitize_null_text_characters(self) -> bool: + """Whether to sanitize null text characters. + + Returns: + True if the feature is enabled, False otherwise. 
+ """ + return self.config.get("sanitize_null_text_characters", False) + def prepare_table( # type: ignore[override] # noqa: PLR0913 self, full_table_name: str | FullyQualifiedName, diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py index d6959cd..729a8ef 100644 --- a/target_postgres/sinks.py +++ b/target_postgres/sinks.py @@ -119,6 +119,27 @@ def generate_temp_table_name(self): # in postgres, used a guid just in case we are using the same session return f"{str(uuid.uuid4()).replace('-', '_')}" + def sanitize_null_text_characters(self, data): + """Sanitizes null characters by replacing \u0000 with \ufffd.""" + + def replace_null_character(d): + return d.replace("\u0000", "\ufffd") + + if isinstance(data, str): + data = replace_null_character(data) + + elif isinstance(data, dict): + for k in data: + if isinstance(data[k], str): + data[k] = replace_null_character(data[k]) + + elif isinstance(data, list): + for i in range(0, len(data)): + if isinstance(data[i], str): + data[i] = replace_null_character(data[i]) + + return data + def generate_copy_statement( self, full_table_name: str | FullyQualifiedName, @@ -204,7 +225,12 @@ def bulk_insert_records( # type: ignore[override] unique_records: dict[tuple, dict] = {} # pk tuple: values for record in records: insert_record = { - column.name: record.get(column.name) for column in columns + column.name: ( + self.sanitize_null_text_characters(record.get(column.name)) + if self.connector.sanitize_null_text_characters + else record.get(column.name) + ) + for column in columns } # No need to check for a KeyError here because the SDK already # guarantees that all key properties exist in the record. 
@@ -214,7 +240,12 @@ def bulk_insert_records( # type: ignore[override] else: for record in records: insert_record = { - column.name: record.get(column.name) for column in columns + column.name: ( + self.sanitize_null_text_characters(record.get(column.name)) + if self.connector.sanitize_null_text_characters + else record.get(column.name) + ) + for column in columns } data.append(insert_record) diff --git a/target_postgres/target.py b/target_postgres/target.py index 1d4bf5a..b54d7fb 100644 --- a/target_postgres/target.py +++ b/target_postgres/target.py @@ -218,6 +218,17 @@ def __init__( "in an error if the data is not encoded as expected." ), ), + th.Property( + "sanitize_null_text_characters", + th.BooleanType, + default=False, + description=( + "If set to true, the target will sanitize null characters in " + "char/text/varchar fields, as they are not supported by Postgres. " + "See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) " # noqa: E501 + "for more information about chr(0) not being supported." + ), + ), th.Property( "ssl_enable", th.BooleanType,