
SNOW-870432: use_logical_type for inferring timezone in pandas df #1134

Merged
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -14,6 +14,10 @@

- Fixed a bug in `DataFrame.na.fill` that caused Boolean values to erroneously override integer values.
- Fixed the SQL simplifier for filters with window function columns in select.
- Fixed a bug in `Session.create_dataframe` where Snowpark DataFrames created from pandas DataFrames did not infer the type of timestamp columns correctly. The behavior is as follows (see the sketch after this list):
  - Earlier, timestamp columns without a timezone were converted to nanosecond epochs and inferred as `LongType()`; they are now correctly maintained as timestamp values and inferred as `TimestampType(TimestampTimeZone.NTZ)`.
  - Earlier, timestamp columns with a timezone were inferred as `TimestampType(TimestampTimeZone.NTZ)` and lost their timezone information; they are now correctly inferred as `TimestampType(TimestampTimeZone.LTZ)` and the timezone information is retained.
  - Set the session parameter `PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME` to revert to the old behavior. It is recommended that you update your code soon to align with the correct behavior, as the parameter will be removed in the future.
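A minimal sketch of the corrected inference, assuming an existing Snowpark `session` (the column name and values are illustrative only):

```python
import datetime

import pandas as pd

from snowflake.snowpark.types import TimestampTimeZone, TimestampType

# tz-naive timestamps: previously converted to nanosecond epochs (LongType()),
# now kept as timestamp values and inferred as TIMESTAMP_NTZ.
pdf = pd.DataFrame({"ts": [datetime.datetime(2021, 9, 30, 12, 0)]})
df = session.create_dataframe(pdf)
assert df.schema[0].datatype == TimestampType(TimestampTimeZone.NTZ)

# tz-aware timestamps: previously inferred as NTZ (timezone dropped), now
# inferred as TIMESTAMP_LTZ with the timezone retained.
pdf["ts"] = pdf["ts"].dt.tz_localize("US/Pacific")
df = session.create_dataframe(pdf)
assert df.schema[0].datatype == TimestampType(TimestampTimeZone.LTZ)
```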

## 1.11.1 (2023-12-07)

@@ -35,7 +39,7 @@
- Added support for the new function `arrays_to_object` in `snowflake.snowpark.functions`.
- Added support for the vector data type.

## Dependency Updates
### Dependency Updates

- Bumped the cloudpickle dependency to work with `cloudpickle==2.2.1`.
- Updated `snowflake-connector-python` to `3.4.0`.
9 changes: 9 additions & 0 deletions src/snowflake/snowpark/session.py
@@ -176,6 +176,9 @@
"PYTHON_SNOWPARK_USE_SCOPED_TEMP_OBJECTS"
)
_PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER_STRING = "PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER"
_PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME_STRING = (
"PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME"
)
WRITE_PANDAS_CHUNK_SIZE: int = 100000 if is_in_stored_procedure() else None


@@ -419,6 +422,11 @@ def __init__(
                _PYTHON_SNOWPARK_USE_SQL_SIMPLIFIER_STRING, True
            )
        )
        self._use_logical_type_for_create_df: bool = (
            self._conn._get_client_side_session_parameter(
                _PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME_STRING, True
            )
        )
        self._custom_package_usage_config: Dict = {}
        self._conf = self.RuntimeConfig(self, options or {})
        self._tmpdir_handler: Optional[tempfile.TemporaryDirectory] = None
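For context, here is a plausible sketch (not the repository's actual implementation) of what a client-side parameter lookup like `_get_client_side_session_parameter` might do, assuming the connector exposes the login-time session parameters as a dict:

```python
from typing import Any, Dict, Optional


def get_client_side_session_parameter(
    session_parameters: Optional[Dict[str, Any]], name: str, default: Any
) -> Any:
    """Return the server-provided value for `name`, else the client-side default."""
    if not session_parameters:
        return default
    return session_parameters.get(name, default)


# Hypothetical usage: falls back to True when the server sent no value.
use_logical_type = get_client_side_session_parameter(
    None, "PYTHON_SNOWPARK_USE_LOGICAL_TYPE_FOR_CREATE_DATAFRAME", True
)
```

Defaulting to `True` makes the corrected timestamp handling opt-out rather than opt-in.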
@@ -2098,6 +2106,7 @@ def create_dataframe(
                quote_identifiers=True,
                auto_create_table=True,
                table_type="temporary",
                use_logical_type=self._use_logical_type_for_create_df,
            )
            set_api_call_source(t, "Session.create_dataframe[pandas]")
            return t
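The new keyword is forwarded to the connector's `write_pandas`. Below is a hedged usage sketch of passing `use_logical_type` directly to `snowflake.connector.pandas_tools.write_pandas` (this requires a sufficiently recent `snowflake-connector-python`; the connection arguments and table name are placeholders):

```python
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

# Placeholder credentials for illustration only.
conn = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password"
)

pdf = pd.DataFrame({"ts": pd.to_datetime(["2021-09-30 12:00:00"])})
pdf["ts"] = pdf["ts"].dt.tz_localize("US/Pacific")

# use_logical_type=True keeps timestamp logical types (e.g. TIMESTAMP_LTZ for
# tz-aware columns) instead of writing raw epoch values.
success, nchunks, nrows, _ = write_pandas(
    conn,
    pdf,
    table_name="MY_TABLE",
    auto_create_table=True,
    use_logical_type=True,
)
```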
36 changes: 36 additions & 0 deletions tests/integ/test_dataframe.py
@@ -68,6 +68,7 @@
    StringType,
    StructField,
    StructType,
    TimestampTimeZone,
    TimestampType,
    TimeType,
    VariantType,
@@ -1511,6 +1512,41 @@ def test_create_dataframe_with_semi_structured_data_types(session):
)


@pytest.mark.skipif(not is_pandas_available, reason="pandas is required")
def test_create_dataframe_with_pandas_df(session):
Review comment (Collaborator): Do we need to port some changes to the sp connector too to support use_logical_type in stored procs?

Reply (Contributor, author): That's right. We need to port this to sproc too.
    data = {
        "pandas_datetime": ["2021-09-30 12:00:00", "2021-09-30 13:00:00"],
        "date": [pd.to_datetime("2010-1-1"), pd.to_datetime("2011-1-1")],
        "datetime.datetime": [
            datetime.datetime(2010, 1, 1),
            datetime.datetime(2010, 1, 1),
        ],
    }
    pdf = pd.DataFrame(data)
    pdf["pandas_datetime"] = pd.to_datetime(pdf["pandas_datetime"])
    df = session.create_dataframe(pdf)

    assert df.schema[0].name == '"pandas_datetime"'
    assert df.schema[1].name == '"date"'
    assert df.schema[2].name == '"datetime.datetime"'
    assert df.schema[0].datatype == TimestampType(TimestampTimeZone.NTZ)
    assert df.schema[1].datatype == TimestampType(TimestampTimeZone.NTZ)
    assert df.schema[2].datatype == TimestampType(TimestampTimeZone.NTZ)

    # test with timezone added to timestamp
    pdf["pandas_datetime"] = pdf["pandas_datetime"].dt.tz_localize("US/Pacific")
    pdf["date"] = pdf["date"].dt.tz_localize("US/Pacific")
    pdf["datetime.datetime"] = pdf["datetime.datetime"].dt.tz_localize("US/Pacific")
    df = session.create_dataframe(pdf)

    assert df.schema[0].name == '"pandas_datetime"'
    assert df.schema[1].name == '"date"'
    assert df.schema[2].name == '"datetime.datetime"'
    assert df.schema[0].datatype == TimestampType(TimestampTimeZone.LTZ)
    assert df.schema[1].datatype == TimestampType(TimestampTimeZone.LTZ)
    assert df.schema[2].datatype == TimestampType(TimestampTimeZone.LTZ)


def test_create_dataframe_with_dict(session):
    data = {f"snow_{idx + 1}": idx**3 for idx in range(5)}
    expected_names = [name.upper() for name in data.keys()]