SNOW-1818205: Add support for pd.json_normalize (#2657)
1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR.

   Fixes SNOW-1818205

2. Fill out the following pre-review checklist:

   - [ ] I am adding a new automated test(s) to verify correctness of my new code
   - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
   - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes.
   - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://docs.google.com/document/d/162d_i4zZ2AfcGRXojj0jByt8EUq-DrSHPPnTa4QvwbA/edit#bookmark=id.e82u4nekq80k)

3. Please describe how your code solves the related issue.

   Add support for pd.json_normalize.
1 parent c8161c4 · commit bbd7a62
Showing 6 changed files with 235 additions and 2 deletions.
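For context, json_normalize flattens nested JSON-like records into a flat DataFrame, joining nested keys into dotted column names. The sketch below is a minimal illustration of that behavior through the same modin.pandas entry point exercised by the tests in this commit; the snowflake.snowpark.modin.plugin import and an already-configured Snowpark session are assumptions about the surrounding environment, not something specified in this diff.

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  (assumption: enables the Snowpark pandas backend; an active Snowpark session is also assumed)

# Same nested records used in test_json_normalize_basic below.
data = [
    {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
    {"name": {"given": "Mark", "family": "Regner"}},
    {"id": 2, "name": "Faye Raker"},
]

# Nested dicts are flattened into dotted column names, roughly:
#     id name.first name.last name.given name.family        name
# 0  1.0     Coleen      Volk        NaN         NaN         NaN
# 1  NaN        NaN       NaN       Mark      Regner         NaN
# 2  2.0        NaN       NaN        NaN         NaN  Faye Raker
df = pd.json_normalize(data)
print(df)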
@@ -0,0 +1,98 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import modin.pandas as pd
import pandas as native_pd
import pytest

from tests.integ.modin.utils import assert_frame_equal
from tests.integ.utils.sql_counter import SqlCounter


def test_json_normalize_basic():
    data = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    with SqlCounter(query_count=1):
        assert_frame_equal(
            pd.json_normalize(data),
            native_pd.json_normalize(data),
            check_dtype=False,
        )


@pytest.mark.parametrize("max_level", [0, 1])
def test_json_normalize_max_level(max_level):
    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]

    with SqlCounter(query_count=1):
        assert_frame_equal(
            pd.json_normalize(data=data, max_level=max_level),
            native_pd.json_normalize(data=data, max_level=max_level),
            check_dtype=False,
        )


def test_json_normalize_record_path_meta():
    data = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]

    with SqlCounter(query_count=1):
        assert_frame_equal(
            pd.json_normalize(
                data=data,
                record_path="counties",
                meta=["state", "shortname", ["info", "governor"]],
            ),
            native_pd.json_normalize(
                data=data,
                record_path="counties",
                meta=["state", "shortname", ["info", "governor"]],
            ),
            check_dtype=False,
        )


def test_json_normalize_record_prefix():
    data = {"A": [1, 2]}

    with SqlCounter(query_count=1):
        assert_frame_equal(
            pd.json_normalize(data=data, record_prefix="Prefix."),
            native_pd.json_normalize(data=data, record_prefix="Prefix."),
            check_dtype=False,
        )