From d1a758e68d1d4c3e74beb8e3109daa7476e80ab0 Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Wed, 8 Nov 2023 14:50:29 -0800
Subject: [PATCH 1/8] SNOW-870432: use_logical_type for inferring timezone in pandas dfs

---
 src/snowflake/snowpark/session.py | 11 +++++++--
 tests/integ/test_dataframe.py     | 37 +++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py
index 9c9eb714ad4..7a881e3ebf6 100644
--- a/src/snowflake/snowpark/session.py
+++ b/src/snowflake/snowpark/session.py
@@ -1789,6 +1789,7 @@ def write_pandas(
         create_temp_table: bool = False,
         overwrite: bool = False,
         table_type: Literal["", "temp", "temporary", "transient"] = "",
+        use_logical_type: Optional[bool] = None,
     ) -> Table:
         """Writes a pandas DataFrame to a table in Snowflake and returns a
         Snowpark :class:`DataFrame` object referring to the table where the
@@ -1822,8 +1823,12 @@ def write_pandas(
                 then it truncates the table. Note that in both cases (when overwrite is set to ``True``) it will replace
                 the existing contents of the table with that of the passed in Pandas DataFrame.
             table_type: The table type of table to be created. The supported values are: ``temp``, ``temporary``,
-                and ``transient``. An empty string means to create a permanent table. Learn more about table
-                types `here `_.
+                and ``transient``. An empty string means to create a permanent table. Learn more about table types
+                `here `_.
+            use_logical_type: Boolean that specifies whether to use Parquet logical types. With this file format option,
+                Snowflake can interpret Parquet logical types during data loading. To enable Parquet logical types,
+                set ``use_logical_type`` to ``True``. Set it to ``None`` to use Snowflake's default. For more information, see:
+                https://docs.snowflake.com/en/sql-reference/sql/create-file-format

         Example::

@@ -1902,6 +1907,7 @@ def write_pandas(
                 auto_create_table=auto_create_table,
                 overwrite=overwrite,
                 table_type=table_type,
+                use_logical_type=use_logical_type,
             )
         except ProgrammingError as pe:
             if pe.msg.endswith("does not exist"):
@@ -2008,6 +2014,7 @@ def create_dataframe(
                 quote_identifiers=True,
                 auto_create_table=True,
                 table_type="temporary",
+                use_logical_type=True,
             )
             set_api_call_source(t, "Session.create_dataframe[pandas]")
             return t
diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py
index d9c9dae725e..6f99661866d 100644
--- a/tests/integ/test_dataframe.py
+++ b/tests/integ/test_dataframe.py
@@ -68,6 +68,7 @@
     StringType,
     StructField,
     StructType,
+    TimestampTimeZone,
     TimestampType,
     TimeType,
     VariantType,
@@ -1478,6 +1479,41 @@ def test_create_dataframe_with_semi_structured_data_types(session):
     )


+@pytest.mark.skipif(not is_pandas_available, reason="pandas is required")
+def test_create_dataframe_with_pandas_df(session):
+    data = {
+        "pandas_datetime": ["2021-09-30 12:00:00", "2021-09-30 13:00:00"],
+        "date": [pd.to_datetime("2010-1-1"), pd.to_datetime("2011-1-1")],
+        "datetime.datetime": [
+            datetime.datetime(2010, 1, 1),
+            datetime.datetime(2010, 1, 1),
+        ],
+    }
+    pdf = pd.DataFrame(data)
+    pdf["pandas_datetime"] = pd.to_datetime(pdf["pandas_datetime"])
+    df = session.create_dataframe(pdf)
+
+    assert df.schema[0].name == '"pandas_datetime"'
+    assert df.schema[1].name == '"date"'
+    assert df.schema[2].name == '"datetime.datetime"'
+    assert df.schema[0].datatype == TimestampType(TimestampTimeZone.NTZ)
+    assert df.schema[1].datatype == TimestampType(TimestampTimeZone.NTZ)
+    assert df.schema[2].datatype == TimestampType(TimestampTimeZone.NTZ)
+
+    # test with timezone added to timestamp
+    pdf["pandas_datetime"] = pdf["pandas_datetime"].dt.tz_localize("US/Pacific")
+    pdf["date"] = pdf["date"].dt.tz_localize("US/Pacific")
+    pdf["datetime.datetime"] = pdf["datetime.datetime"].dt.tz_localize("US/Pacific")
+    df = session.create_dataframe(pdf)
+
+    assert df.schema[0].name == '"pandas_datetime"'
+    assert df.schema[1].name == '"date"'
+    assert df.schema[2].name == '"datetime.datetime"'
+    assert df.schema[0].datatype == TimestampType(TimestampTimeZone.LTZ)
+    assert df.schema[1].datatype == TimestampType(TimestampTimeZone.LTZ)
+    assert df.schema[2].datatype == TimestampType(TimestampTimeZone.LTZ)
+
+
 def test_create_dataframe_with_dict(session):
     data = {f"snow_{idx + 1}": idx**3 for idx in range(5)}
     expected_names = [name.upper() for name in data.keys()]
@@ -1958,6 +1994,7 @@ def test_case_insensitive_collect(session):
     assert row["p@$$w0rd"] == "test"
     assert row["P@$$W0RD"] == "test"

+
 def test_case_insensitive_local_iterator(session):
     df = session.create_dataframe(
         [["Gordon", 153]], schema=["firstname", "matches_won"]

From beb7bf07fc0538fc7b8b7a7a5495cf3516df272e Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Thu, 9 Nov 2023 15:30:28 -0800
Subject: [PATCH 2/8] changelog and dependency updates

---
 CHANGELOG.md | 14 ++++++++++++++
 setup.py     |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9231212f625..b98a084e164 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,20 @@

 ### New Features

+- Added parameter for `use_logical_type` in `Session.write_pandas` to allow correct inference of pandas timestamp types from parquet files.
+
+### Dependency Updates
+
+- Updated ``snowflake-connector-python`` to 3.4.0.
+
+### Bug Fixes
+
+- Fixed a bug in `Session.create_dataframe` where the snowpark dataframes created using pandas dataframes were not inferring the type for timestamp columns correctly.
+
+## 1.10.0 (2023-11-03)
+
+### New Features
+
 - Added support for managing case sensitivity in `DataFrame.to_local_iterator()`.
 - Added support for specifying vectorized UDTF's input column names by using the optional parameter `input_names` in `UDTFRegistration.register/register_file` and `functions.pandas_udtf`. By default, `RelationalGroupedDataFrame.applyInPandas` will infer the column names from current dataframe schema.
 - Add `sql_error_code` and `raw_message` attributes to `SnowflakeSQLException` when it is caused by a SQL exception.
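
[Editorial aside, not part of the patch series: the changelog entry above introduces the new `use_logical_type` option of `Session.write_pandas`. A minimal usage sketch follows. It assumes an already-created `Session` object and a hypothetical table name "MY_TABLE"; only the `use_logical_type=True` keyword itself comes from this patch.]

# Minimal usage sketch. Assumes `session` is an existing snowflake.snowpark.Session.
import pandas as pd

# from snowflake.snowpark import Session
# session = Session.builder.configs(connection_parameters).create()

pdf = pd.DataFrame(
    {
        "id": [1, 2],
        "event_time": pd.to_datetime(
            ["2021-09-30 12:00:00", "2021-09-30 13:00:00"]
        ).tz_localize("US/Pacific"),
    }
)

# With use_logical_type=True, Snowflake interprets Parquet logical types during
# the load, so the timezone on "event_time" is preserved.
table = session.write_pandas(
    pdf,
    table_name="MY_TABLE",  # hypothetical table name
    auto_create_table=True,
    use_logical_type=True,
)
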
diff --git a/setup.py b/setup.py
index 9dcead7dd22..54b75bb0783 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 THIS_DIR = os.path.dirname(os.path.realpath(__file__))
 SRC_DIR = os.path.join(THIS_DIR, "src")
 SNOWPARK_SRC_DIR = os.path.join(SRC_DIR, "snowflake", "snowpark")
-CONNECTOR_DEPENDENCY_VERSION = ">=3.2.0, <4.0.0"
+CONNECTOR_DEPENDENCY_VERSION = ">=3.4.0, <4.0.0"
 INSTALL_REQ_LIST = [
     "setuptools>=40.6.0",
     "wheel",

From 182537f9f6d7b1f57d3948a12c2aa56ee8b0caff Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Thu, 9 Nov 2023 15:31:04 -0800
Subject: [PATCH 3/8] fix release number

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b98a084e164..efc91ad4706 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Release History

-## 1.10.0 (2023-11-03)
+## 1.11.0 (TBD)

 ### New Features


From 0ea1145a02c54019e8eb9c6eb7479beea155f2bf Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Fri, 10 Nov 2023 10:45:36 -0800
Subject: [PATCH 4/8] provide additional details about the correct behavior

---
 CHANGELOG.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index efc91ad4706..a1e92f1487e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,11 @@

 ### Bug Fixes

-- Fixed a bug in `Session.create_dataframe` where the snowpark dataframes created using pandas dataframes were not inferring the type for timestamp columns correctly.
+- Fixed a bug in `Session.create_dataframe` where the snowpark dataframes created using pandas dataframes were not inferring the type for timestamp columns correctly. The behavior is as follows:
+  - Earlier, timestamp columns without a timezone would be inferred as `LongType()` but will now be correctly inferred as `TimestampType(TimestampTimeZone.NTZ)`.
+  - Earlier, timestamp columns without a timezone would be converted to nanosecond epochs, but will now correctly be maintained as timestamp values.
+  - Earlier, timestamp columns with a timezone would be inferred as `TimestampType(TimestampTimeZone.NTZ)` but will now be correctly inferred as `TimestampType(TimestampTimeZone.LTZ)`.
+  - Earlier, timestamp columns with a timezone would lose timezone information and read incorrect times, but now the timezone information will be retained and the time will be stored correctly.

 ## 1.10.0 (2023-11-03)

From ee7e7568838976d0144eed5233cf135e830d463a Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Mon, 13 Nov 2023 15:04:11 -0800
Subject: [PATCH 5/8] use session param to control behavior

---
 src/snowflake/snowpark/session.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py
index 7a881e3ebf6..72545ca375d 100644
--- a/src/snowflake/snowpark/session.py
+++ b/src/snowflake/snowpark/session.py
@@ -167,6 +167,9 @@
     "PYTHON_SNOWPARK_USE_SCOPED_TEMP_OBJECTS"
 )
 _PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER_STRING = "PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER"
+_PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME_STRING = (
+    "PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME"
+)

 WRITE_PANDAS_CHUNK_SIZE: int = 100000 if is_in_stored_procedure() else None

@@ -397,6 +400,11 @@ def __init__(
                 _PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER_STRING, True
             )
         )
+        self._use_logical_type_for_create_df: bool = (
+            self._conn._get_client_side_session_parameter(
+                _PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME_STRING, True
+            )
+        )
         self._custom_package_usage_config: Dict = {}
         self._conf = self.RuntimeConfig(self, options or {})
         self._tmpdir_handler: Optional[tempfile.TemporaryDirectory] = None
@@ -2014,7 +2022,7 @@ def create_dataframe(
                 quote_identifiers=True,
                 auto_create_table=True,
                 table_type="temporary",
-                use_logical_type=True,
+                use_logical_type=self._use_logical_type_for_create_df,
             )
             set_api_call_source(t, "Session.create_dataframe[pandas]")
             return t

From 99bd36f0c5d1828c17e3d65c162bd50b7fdb30b6 Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Mon, 13 Nov 2023 15:05:41 -0800
Subject: [PATCH 6/8] changelog updates

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1e92f1487e..bec5ca22827 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
   - Earlier, timestamp columns without a timezone would be converted to nanosecond epochs, but will now correctly be maintained as timestamp values.
   - Earlier, timestamp columns with a timezone would be inferred as `TimestampType(TimestampTimeZone.NTZ)` but will now be correctly inferred as `TimestampType(TimestampTimeZone.LTZ)`.
   - Earlier, timestamp columns with a timezone would lose timezone information and read incorrect times, but now the timezone information will be retained and the time will be stored correctly.
+  - Set session parameter `PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME` to revert to the old behavior. It is recommended that you update your code soon to align with the correct behavior, as the parameter will be removed in the future.

 ## 1.10.0 (2023-11-03)

From ae9fc0f3a28045723808dd68ba579cf66ac5699a Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Fri, 29 Dec 2023 10:31:41 -0800
Subject: [PATCH 7/8] fix merge

---
 CHANGELOG.md                      | 7 +------
 src/snowflake/snowpark/session.py | 1 +
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d440a8c66ef..ad2ce36e67a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,7 +34,6 @@

 ### New Features

-- Added parameter for `use_logical_type` in `Session.write_pandas` to allow correct inference of pandas timestamp types from parquet files.
 - Add the `conn_error` attribute to `SnowflakeSQLException` that stores the whole underlying exception from `snowflake-connector-python`.
 - Added support for `RelationalGroupedDataframe.pivot()` to access `pivot` in the following pattern `Dataframe.group_by(...).pivot(...)`.
 - Added experimental feature: Local Testing Mode, which allows you to create and operate on Snowpark Python DataFrames locally without connecting to a Snowflake account. You can use the local testing framework to test your DataFrame operations locally, on your development machine or in a CI (continuous integration) pipeline, before deploying code changes to your account.
@@ -42,15 +41,11 @@
 - Added support for `arrays_to_object` new functions in `snowflake.snowpark.functions`.
 - Added support for the vector data type.

-## Dependency Updates
+### Dependency Updates

 - Bumped cloudpickle dependency to work with `cloudpickle==2.2.1`
 - Updated ``snowflake-connector-python`` to `3.4.0`.

-### Dependency Updates
-
-- Updated ``snowflake-connector-python`` to 3.4.0.
-
 ### Bug Fixes

 - DataFrame column names quoting check now supports newline characters.
diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py
index 4f692ffbd2f..b3cd66b2d9a 100644
--- a/src/snowflake/snowpark/session.py
+++ b/src/snowflake/snowpark/session.py
@@ -1865,6 +1865,7 @@ def write_pandas(
         create_temp_table: bool = False,
         overwrite: bool = False,
         table_type: Literal["", "temp", "temporary", "transient"] = "",
+        **kwargs: Dict[str, Any],
     ) -> Table:
         """Writes a pandas DataFrame to a table in Snowflake and returns a
         Snowpark :class:`DataFrame` object referring to the table where the

From c816451a8484902e417f7076b49049e56aa7185d Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Tue, 2 Jan 2024 15:39:05 -0800
Subject: [PATCH 8/8] simplify changelog

---
 CHANGELOG.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad2ce36e67a..7212193c5e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,10 +15,8 @@
 - Fixed a bug in `DataFrame.na.fill` that caused Boolean values to erroneously override integer values.
 - Fixed sql simplifier for filter with window function columns in select.
 - Fixed a bug in `Session.create_dataframe` where the snowpark dataframes created using pandas dataframes were not inferring the type for timestamp columns correctly. The behavior is as follows:
-  - Earlier, timestamp columns without a timezone would be inferred as `LongType()` but will now be correctly inferred as `TimestampType(TimestampTimeZone.NTZ)`.
-  - Earlier, timestamp columns without a timezone would be converted to nanosecond epochs, but will now correctly be maintained as timestamp values.
-  - Earlier, timestamp columns with a timezone would be inferred as `TimestampType(TimestampTimeZone.NTZ)` but will now be correctly inferred as `TimestampType(TimestampTimeZone.LTZ)`.
-  - Earlier, timestamp columns with a timezone would lose timezone information and read incorrect times, but now the timezone information will be retained and the time will be stored correctly.
+  - Earlier, timestamp columns without a timezone would be converted to nanosecond epochs and inferred as `LongType()`, but will now correctly be maintained as timestamp values and inferred as `TimestampType(TimestampTimeZone.NTZ)`.
+  - Earlier, timestamp columns with a timezone would be inferred as `TimestampType(TimestampTimeZone.NTZ)` and lose timezone information, but will now be correctly inferred as `TimestampType(TimestampTimeZone.LTZ)` and the timezone information is retained correctly.
   - Set session parameter `PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME` to revert to the old behavior. It is recommended that you update your code soon to align with the correct behavior, as the parameter will be removed in the future.

 ## 1.11.1 (2023-12-07)
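
[Editorial aside, not part of the patch series: to make the final changelog entry concrete, the sketch below mirrors the assertions of `test_create_dataframe_with_pandas_df` from patch 1. It assumes an already-created `Session` object; the expected types come directly from that test.]

import datetime

import pandas as pd

from snowflake.snowpark.types import TimestampTimeZone, TimestampType

# Assumes `session` is an existing snowflake.snowpark.Session.
pdf = pd.DataFrame(
    {"ts": [datetime.datetime(2010, 1, 1), datetime.datetime(2011, 1, 1)]}
)

# Timezone-naive pandas timestamps are kept as timestamp values (not nanosecond
# epochs) and are now inferred as TIMESTAMP_NTZ.
df = session.create_dataframe(pdf)
assert df.schema[0].datatype == TimestampType(TimestampTimeZone.NTZ)

# Timezone-aware pandas timestamps keep their timezone and are inferred as TIMESTAMP_LTZ.
pdf["ts"] = pdf["ts"].dt.tz_localize("US/Pacific")
df = session.create_dataframe(pdf)
assert df.schema[0].datatype == TimestampType(TimestampTimeZone.LTZ)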