From a655252d2904e05365a8e6dc3a9396b7f3626641 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sat, 29 Jun 2024 10:20:03 -0800 Subject: [PATCH] feat: support empty arrays, improve ibis.array() API Picking out the array stuff from https://github.com/ibis-project/ibis/pull/8666 Instead of trying to fit the two cases of 0-length and 1+ length arrays into the same op, I split them up into separate ones. By doing this, if we guarantee that all the elements of ops.Array() have the right type before construction, we don't have to do any fancy casting during compilation: all the elements will already have been cast as needed. This allows for the compilation of arrays on some SQL backends like Postgres. If we tried to cast the entire array, we would end up with SQL like `cast [..] as STRUCT<...>[]`, which Postgres rejects. Instead, if we cast each individual element, such as `[cast({...} as ROW...), cast({...} as ROW...)]`, then this is valid SQL. I added a Length annotation to ops.Array to verify the length is 1+. This isn't strictly needed, since if you ever did construct an empty one, then rlz.highest_precedence_dtype([]) would fail. But that might fail at a later time, and I wanted to raise an error at construction time. End users should never be constructing ops.Arrays directly, so this is a guardrail just for us ibis devs. We could remove it, but I think it is a nice hint for future us. 
--- ibis/backends/pandas/executor.py | 9 +++- ibis/backends/polars/compiler.py | 15 ++++--- ibis/backends/sql/compiler.py | 3 ++ ibis/backends/tests/test_array.py | 68 ++++++++++++++++++++++++++++- ibis/backends/tests/test_generic.py | 7 ++- ibis/backends/tests/test_sql.py | 16 +++++-- ibis/backends/tests/test_string.py | 10 ++--- ibis/expr/operations/arrays.py | 19 +++++--- ibis/expr/types/arrays.py | 62 +++++++++++++++++++++----- 9 files changed, 172 insertions(+), 37 deletions(-) diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py index a3153d17b8b4..868a08a98dfc 100644 --- a/ibis/backends/pandas/executor.py +++ b/ibis/backends/pandas/executor.py @@ -49,7 +49,9 @@ def visit(cls, op: ops.Node, **kwargs): @classmethod def visit(cls, op: ops.Literal, value, dtype): - if dtype.is_interval(): + if value is None: + value = None + elif dtype.is_interval(): value = pd.Timedelta(value, dtype.unit.short) elif dtype.is_array(): value = np.array(value) @@ -219,6 +221,11 @@ def visit(cls, op: ops.FindInSet, needle, values): result = np.select(condlist, choicelist, default=-1) return pd.Series(result, name=op.name) + @classmethod + def visit(cls, op: ops.EmptyArray, dtype): + pdt = PandasType.from_ibis(dtype) + return np.array([], dtype=pdt) + @classmethod def visit(cls, op: ops.Array, exprs): return cls.rowwise(lambda row: np.array(row, dtype=object), exprs) diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 4d9a497191b4..08f4b46ffb3e 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -87,12 +87,7 @@ def literal(op, **_): value = op.value dtype = op.dtype - if dtype.is_array(): - value = pl.Series("", value) - typ = PolarsType.from_ibis(dtype) - val = pl.lit(value, dtype=typ) - return val.implode() - elif dtype.is_struct(): + if dtype.is_struct(): values = [ pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k) for k, v in value.items() @@ -106,7 +101,7 @@ def literal(op, 
**_): return pl.lit(value) else: typ = PolarsType.from_ibis(dtype) - return pl.lit(op.value, dtype=typ) + return pl.lit(value, dtype=typ) _TIMESTAMP_SCALE_TO_UNITS = { @@ -973,6 +968,12 @@ def array_concat(op, **kw): return result +@translate.register(ops.EmptyArray) +def empty_array(op, **kw): + pdt = PolarsType.from_ibis(op.dtype) + return pl.lit([], dtype=pdt) + + @translate.register(ops.Array) def array_column(op, **kw): cols = [translate(col, **kw) for col in op.exprs] diff --git a/ibis/backends/sql/compiler.py b/ibis/backends/sql/compiler.py index 09f26b80d62f..18699aa560e5 100644 --- a/ibis/backends/sql/compiler.py +++ b/ibis/backends/sql/compiler.py @@ -1019,6 +1019,9 @@ def visit_InSubquery(self, op, *, rel, needle): query = sg.select(STAR).from_(query) return needle.isin(query=query) + def visit_EmptyArray(self, op, *, dtype): + return self.cast(self.f.array(), dtype) + def visit_Array(self, op, *, exprs): return self.f.array(*exprs) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 8b55c189e9d6..fb46d5357d24 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -31,6 +31,7 @@ PySparkAnalysisException, TrinoUserError, ) +from ibis.common.annotations import ValidationError from ibis.common.collections import frozendict pytestmark = [ @@ -72,6 +73,71 @@ # list. 
+def test_array_factory(con): + a = ibis.array([1, 2, 3]) + assert a.type() == dt.Array(value_type=dt.Int8) + assert con.execute(a) == [1, 2, 3] + + a2 = ibis.array(a) + assert a.type() == dt.Array(value_type=dt.Int8) + assert con.execute(a2) == [1, 2, 3] + + +def test_array_factory_typed(con): + typed = ibis.array([1, 2, 3], type="array") + assert con.execute(typed) == ["1", "2", "3"] + + typed2 = ibis.array(ibis.array([1, 2, 3]), type="array") + assert con.execute(typed2) == ["1", "2", "3"] + + +@pytest.mark.notimpl("flink", raises=Py4JJavaError) +def test_array_factory_empty(con): + with pytest.raises(ValidationError): + ibis.array([]) + + empty_typed = ibis.array([], type="array") + assert empty_typed.type() == dt.Array(value_type=dt.string) + assert con.execute(empty_typed) == [] + + +@pytest.mark.notyet( + "clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL" +) +def test_array_factory_null(con): + with pytest.raises(ValidationError): + ibis.array(None) + with pytest.raises(ValidationError): + ibis.array(None, type="int64") + none_typed = ibis.array(None, type="array") + assert none_typed.type() == dt.Array(value_type=dt.string) + assert con.execute(none_typed) is None + + nones = ibis.array([None, None], type="array") + assert nones.type() == dt.Array(value_type=dt.string) + assert con.execute(nones) == [None, None] + + # Execute a real value here, so the backends that don't support arrays + # actually xfail as we expect them to. + # Otherwise would have to @mark.xfail every test in this file besides this one. 
+ assert con.execute(ibis.array([1, 2])) == [1, 2] + + +@pytest.mark.broken( + ["datafusion", "flink", "polars"], + raises=AssertionError, + reason="[None, 1] executes to [np.nan, 1.0]", +) +def test_array_factory_null_mixed(con): + none_and_val = ibis.array([None, 1]) + assert none_and_val.type() == dt.Array(value_type=dt.Int8) + assert con.execute(none_and_val) == [None, 1] + + none_and_val_typed = ibis.array([None, 1], type="array") + assert none_and_val_typed.type() == dt.Array(value_type=dt.String) + assert con.execute(none_and_val_typed) == [None, "1"] + + def test_array_column(backend, alltypes, df): expr = ibis.array( [alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)] @@ -1356,7 +1422,7 @@ def test_unnest_range(con): pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest), pytest.mark.broken( ["polars"], - reason="expression input not supported with nested arrays", + reason="upstream polars bug: https://github.com/pola-rs/polars/issues/17294", raises=TypeError, ), ], diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 3ed4a9db8cc5..9f9a5ce5e4c9 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1431,13 +1431,12 @@ def query(t, group_cols): snapshot.assert_match(str(ibis.to_sql(t3, dialect=con.name)), "out.sql") -@pytest.mark.notimpl(["oracle", "exasol"], raises=com.OperationNotDefinedError) -@pytest.mark.notimpl(["druid"], raises=AssertionError) @pytest.mark.notyet( - ["datafusion", "impala", "mssql", "mysql", "sqlite"], + ["datafusion", "exasol", "impala", "mssql", "mysql", "oracle", "sqlite"], reason="backend doesn't support arrays and we don't implement pivot_longer with unions yet", - raises=com.OperationNotDefinedError, + raises=(com.OperationNotDefinedError, com.UnsupportedBackendType), ) +@pytest.mark.notimpl(["druid"], raises=AssertionError) @pytest.mark.broken( ["trino"], reason="invalid code generated for unnesting a struct", diff --git 
a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 777cfa3db8bb..1f3f8cb69ad5 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -30,8 +30,18 @@ ibis.struct(dict(abc=432)), marks=[ pytest.mark.never( - ["impala", "mysql", "sqlite", "mssql", "exasol"], - raises=(NotImplementedError, exc.UnsupportedBackendType), + [ + "exasol", + "impala", + "mysql", + "sqlite", + "mssql", + ], + raises=( + exc.OperationNotDefinedError, + NotImplementedError, + exc.UnsupportedBackendType, + ), reason="structs not supported in the backend", ), pytest.mark.notimpl( @@ -104,7 +114,7 @@ def test_isin_bug(con, snapshot): @pytest.mark.notyet( ["datafusion", "exasol", "oracle", "flink", "risingwave"], reason="no unnest support", - raises=exc.OperationNotDefinedError, + raises=(exc.OperationNotDefinedError, exc.UnsupportedBackendType), ) @pytest.mark.notyet( ["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream" diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index ceb9fdc77711..d56f5f934024 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -835,6 +835,11 @@ def test_capitalize(con, inp, expected): assert pd.isnull(result) +@pytest.mark.never( + ["exasol", "impala", "mssql", "mysql", "sqlite"], + reason="Backend doesn't support arrays", + raises=(com.OperationNotDefinedError, com.UnsupportedBackendType), +) @pytest.mark.notimpl( [ "dask", @@ -842,11 +847,6 @@ def test_capitalize(con, inp, expected): "polars", "oracle", "flink", - "sqlite", - "mssql", - "mysql", - "exasol", - "impala", ], raises=com.OperationNotDefinedError, ) diff --git a/ibis/expr/operations/arrays.py b/ibis/expr/operations/arrays.py index 6d68baab94c3..584cdf3f226b 100644 --- a/ibis/expr/operations/arrays.py +++ b/ibis/expr/operations/arrays.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional +from typing import Annotated, 
Optional from public import public @@ -10,19 +10,26 @@ import ibis.expr.datatypes as dt import ibis.expr.rules as rlz from ibis.common.annotations import attribute +from ibis.common.patterns import Length # noqa: TCH001 from ibis.common.typing import VarTuple # noqa: TCH001 from ibis.expr.operations.core import Unary, Value +@public +class EmptyArray(Value): + """Construct an array with 0 elements.""" + + dtype: dt.Array + shape = ds.scalar + + @public class Array(Value): - """Construct an array.""" + """Construct an array with 1+ elements. Use `EmptyArray` for empty arrays.""" - exprs: VarTuple[Value] + exprs: Annotated[VarTuple[Value], Length(at_least=1)] - @attribute - def shape(self): - return rlz.highest_precedence_shape(self.exprs) + shape = rlz.shape_like("exprs") @attribute def dtype(self): diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 2d9e5a8f5b3a..fed00ad66926 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -5,16 +5,17 @@ from public import public +import ibis +import ibis.expr.datatypes as dt import ibis.expr.operations as ops +import ibis.expr.types as ir +from ibis.common.annotations import ValidationError from ibis.common.deferred import Deferred, deferrable from ibis.expr.types.generic import Column, Scalar, Value if TYPE_CHECKING: from collections.abc import Callable, Iterable - import ibis.expr.types as ir - from ibis.expr.types.typing import V - import ibis.common.exceptions as com @@ -1067,7 +1068,11 @@ def __getitem__(self, index: int | ir.IntegerValue | slice) -> ir.Column: @public @deferrable -def array(values: Iterable[V]) -> ArrayValue: +def array( + values: ArrayValue | Iterable | ir.NullValue | None, + *, + type: str | dt.DataType | None = None, +) -> ArrayValue: """Create an array expression. 
If any values are [column expressions](../concepts/datatypes.qmd) the @@ -1078,6 +1083,9 @@ def array(values: Iterable[V]) -> ArrayValue: ---------- values An iterable of Ibis expressions or Python literals + type + An instance of `ibis.expr.datatypes.DataType` or a string indicating + the Ibis type of `value`. eg `array`. Returns ------- @@ -1108,15 +1116,49 @@ def array(values: Iterable[V]) -> ArrayValue: │ [3, 42, ... +1] │ └──────────────────────┘ - >>> ibis.array([t.a, 42 + ibis.literal(5)]) + >>> ibis.array([t.a, 42 + ibis.literal(5)], type="array") ┏━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Array() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ - │ array │ + │ array │ ├──────────────────────┤ - │ [1, 47] │ - │ [2, 47] │ - │ [3, 47] │ + │ [1.0, 47.0] │ + │ [2.0, 47.0] │ + │ [3.0, 47.0] │ └──────────────────────┘ """ - return ops.Array(tuple(values)).to_expr() + type = dt.dtype(type) if type is not None else None + if type is not None and not isinstance(type, dt.Array): + raise ValidationError(f"type must be an array, got {type}") + + if isinstance(values, ir.Value): + if type is not None: + return values.cast(type) + elif isinstance(values, ArrayValue): + return values + else: + raise ValidationError( + f"If no type passed, values must be an array, got {values.type()}" + ) + + if values is None: + if type is None: + raise ValidationError("If values is None/NULL, type must be provided") + return ir.null(type) + + values = tuple(values) + if len(values) == 0: + if type is None: + raise ValidationError("If values is empty, type must be provided") + return ops.EmptyArray(type).to_expr() + else: + value_type = type.value_type if type is not None else None + values = [_value(v, value_type) for v in values] + return ops.Array(values).to_expr() + + +def _value(x, type) -> ir.Value: + if isinstance(x, (ir.Value, Deferred)): + return x.cast(type) if type is not None else x + else: + return ibis.literal(x, type=type)