diff --git a/Project.toml b/Project.toml index f7f7b90..be404c1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Kezdi" uuid = "48308a23-c29e-446c-b4c0-d9446a767439" authors = ["Miklos Koren ", "Gergely Attila Kiss "] -version = "0.4.7" +version = "0.4.8" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/docs/src/index.md b/docs/src/index.md index 46bd185..6f9dd7b 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -86,20 +86,17 @@ See the benchmarking code for [Stata](https://github.com/codedthinking/Kezdi.jl/ The function can operate on individual elements, ```julia get_make(text) = split(text, " ")[1] -@generate Make = Main.get_make(Model) +@generate Make = get_make(Model) ``` or on the entire column: ```julia -function geometric_mean(x::AbstractVector) +function geometric_mean(x::Vector) n = length(x) return exp(sum(log.(x)) / n) end -@collapse geom_NPG = Main.geometric_mean(MPG), by(Cylinders) +@collapse geom_NPG = geometric_mean(MPG), by(Cylinders) ``` -!!! tip "Note: `Main.` prefix" - If you define a function in your own code, you need to prefix the function name with `Main.` to use it in other commands. To make use of [Automatic vectorization](@ref), make sure to give the function a vector argument type. - ## Commands ### Setting and inspecting the global DataFrame diff --git a/goals.md b/goals.md index 1335b1c..c736aa0 100644 --- a/goals.md +++ b/goals.md @@ -266,3 +266,67 @@ An in-place version of `@with!` should do everything in place. This can mean all - non-standard evaluation makes it hard to wrap Kezdi.jl code in functions 6. 
For loops - implement `scalars()` and automatic expansion of locals in context + +# 2024-07-12 In-flight debugging session +```julia +julia> using Kezdi + +julia> module MyModule + myfunc(x) = 2x + end +Main.MyModule + +julia> df = DataFrame(x = 1:10) +10×1 DataFrame + Row │ x + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 2 + 3 │ 3 + 4 │ 4 + 5 │ 5 + 6 │ 6 + 7 │ 7 + 8 │ 8 + 9 │ 9 + 10 │ 10 + +julia> @with df @generate y = MyModule.myfunc(x) +10×2 DataFrame + Row │ x y + │ Int64 Int64 +─────┼────────────── + 1 │ 1 2 + 2 │ 2 4 + 3 │ 3 6 + 4 │ 4 8 + 5 │ 5 10 + 6 │ 6 12 + 7 │ 7 14 + 8 │ 8 16 + 9 │ 9 18 + 10 │ 10 20 +``` + +How about an aggregator function? + +```julia +julia> module MyModule + myfunc(x) = 2x + myaggreg(v::Vector) = sum(x.^2) + end +WARNING: replacing module MyModule. +Main.MyModule + +julia> @with df @egen y = MyModule.myaggreg(x) +┌ Warning: transform!(var"##237", [:x] => (((x,)->(passmissing(MyModule.myaggreg)).(x)) => $(QuoteNode("y")))) +└ @ Kezdi ~/Tresorit/Mac/code/julia/Kezdi.jl/src/commands.jl:100 +ERROR: MethodError: no method matching myaggreg(::Int64) + +Closest candidates are: + myaggreg(::Vector) + @ Main.MyModule REPL[8]:3 +``` + +This means the call was vectorized at compile time, even though the function itself is found at runtime. \ No newline at end of file diff --git a/src/codegen.jl b/src/codegen.jl index 8342d88..f09ba83 100644 --- a/src/codegen.jl +++ b/src/codegen.jl @@ -6,6 +6,7 @@ function generate_command(command::Command; options=[], allowed=[]) teardown = Expr[] process = (x -> x) tdfunction = gensym() + # this points to the DataFrame that the command will return to the user target_df = df2 given_options = get_top_symbol.(command.options) @@ -31,18 +32,26 @@ function generate_command(command::Command; options=[], allowed=[]) push!(setup, :(local $df2 = copy(getdf()))) variables_condition = (:ifable in options) ? vcat(extract_variable_references(command.condition)...) : Symbol[] variables_RHS = (:variables in options) ? 
vcat(extract_variable_references.(command.arguments)...) : Symbol[] + variables = vcat(variables_condition, variables_RHS) if :replace_variables in options process(x) = replace_variable_references(sdf, x) end if :vectorize in options process = vectorize_function_calls ∘ process end - if :_n in variables_condition - push!(setup, :(transform!($df2, eachindex => :_n))) + # where should special variables be created? + # when grouped by, then counting rows should be done on the grouped data + _n_goes_to = df2 + if :by in given_options && (:_n in variables || :_N in variables) + by_cols = get_by(command) + _n_goes_to = :(groupby($df2, $by_cols)) + end + if :_n in variables + push!(setup, :(transform!($_n_goes_to, eachindex => :_n))) push!(teardown, :(select!($df2, Not(:_n)))) end - if :_N in variables_condition - push!(setup, :(transform!($df2, nrow => :_N))) + if :_N in variables + push!(setup, :(transform!($_n_goes_to, nrow => :_N))) push!(teardown, :(select!($df2, Not(:_N)))) end if :ifable in options @@ -60,14 +69,6 @@ function generate_command(command::Command; options=[], allowed=[]) by_cols = get_by(command) push!(setup, :(local $gdf = groupby($sdf, $by_cols))) end - if :_n in variables_RHS - push!(setup, :(transform!($target_df, eachindex => :_n))) - push!(teardown, :(select!($target_df, Not(:_n)))) - end - if :_N in variables_RHS - push!(setup, :(transform!($target_df, nrow => :_N))) - push!(teardown, :(select!($target_df, Not(:_N)))) - end push!(setup, quote function $tdfunction(x) $(Expr(:block, teardown...)) @@ -244,9 +245,10 @@ function vectorize_function_calls(expr::Any) end end +get_dot_parts(ex::Symbol) = [ex] function get_dot_parts(ex::Expr) is_dot_reference(ex) || error("Expected a dot reference, got $ex") - parts = [] + parts = Symbol[] while is_dot_reference(ex) push!(parts, ex.args[2].value) ex = ex.args[1] @@ -274,30 +276,14 @@ isalphanumeric(c::AbstractChar) = isletter(c) || isdigit(c) || c == '_' isalphanumeric(str::AbstractString) = 
all(isalphanumeric, str) isassignment(expr::Any) = expr isa Expr && expr.head == :(=) && length(expr.args) == 2 -function operates_on_vector(expr::Any) - try - length(methodswith(Vector, eval(expr); supertypes=true)) > 0 - catch e - if isa(e, UndefVarError) - return false - else - rethrow(e) - end - end -end +operates_on_missing(expr::Any) = (expr isa Symbol && expr == :ismissing) || operates_on_type(expr, Missing) +operates_on_vector(expr::Any) = operates_on_type(expr, Vector) -function operates_on_missing(expr::Any) - expr isa Symbol && expr == :ismissing && return true +function operates_on_type(expr::Any, T::Type) try - length(methodswith(Missing, eval(expr); supertypes=true)) > 0 - catch e - if isa(e, UndefVarError) - return false - else - rethrow(e) - end + return length(methodswith(T, Main.eval(expr); supertypes=true)) > 0 + catch ee + !isa(ee, UndefVarError) && rethrow(ee) + return false end end - -# only broadcast first argument. For example, [1, 2, 3] in [2, 3] should evaluate to [false, true, true] -BFA(f::Function, xs, args...; kwargs...) = broadcast(x -> f(x, args...; kwargs...), xs) diff --git a/src/commands.jl b/src/commands.jl index 1a9e6a2..6eef5a2 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -44,10 +44,11 @@ function rewrite(::Val{:replace}, command::Command) else $setup eltype_RHS = $RHS isa AbstractVector ? 
eltype($RHS) : typeof($RHS) - if eltype_RHS != eltype($target_df[!, $target_column]) - local $third_vector = Vector{eltype_RHS}(undef, nrow($local_copy)) + eltype_LHS = eltype($local_copy[.!$bitmask, $target_column]) + if eltype_RHS != eltype_LHS + local $third_vector = Vector{promote_type(eltype_LHS, eltype_RHS)}(undef, nrow($local_copy)) $third_vector[$bitmask] .= $RHS - $third_vector[.!$bitmask] .= $local_copy[!, $target_column][.!$bitmask] + $third_vector[.!$bitmask] .= $local_copy[.!$bitmask, $target_column] $local_copy[!, $target_column] = $third_vector else $target_df[!, $target_column] .= $RHS diff --git a/test/codegen.jl b/test/codegen.jl index 7768838..96b53ae 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -1,3 +1,10 @@ +module MyModule +myfunc(x) = 2x +myaggreg(v::Vector) = sum(x.^2) +mymiss(::Missing) = missing +mymiss(x) = 3x +end + @testset "Replace variable references" begin @test_expr replace_variable_references(:(x + y + f(z) - g.(x))) == :(:x + :y + f(:z) - g.(:x)) @test_expr replace_variable_references(:(f(x, <=))) == :(f(:x, <=)) @@ -39,4 +46,75 @@ end @testset "Unknown functions are passed through `passmissing`" begin @test_expr vectorize_function_calls(:(y = Dates.year(x))) == :(y = (passmissing(Dates.year)).(x)) end + @testset "Functions in other modules" begin + using .MyModule + @test vectorize_function_calls(:(MyModule.myfunc(x))) == :((passmissing(MyModule.myfunc)).(x)) + @test vectorize_function_calls(:(MyModule.myaggreg(x))) == :(MyModule.myaggreg(keep_only_values(x))) + @test vectorize_function_calls(:(MyModule.mymiss(x))) == :(MyModule.mymiss.(x)) + end + + @testset "Functions in other modules with DNV" begin + using .MyModule + @test vectorize_function_calls(:(DNV(MyModule.myfunc(x)))) == :(MyModule.myfunc(x)) + @test vectorize_function_calls(:(DNV(MyModule.myaggreg(x)))) == :(MyModule.myaggreg(x)) + @test vectorize_function_calls(:(DNV(MyModule.mymiss(x)))) == :(MyModule.mymiss(x)) + end +end + +@testset "Helper functions" 
begin + @testset "operates_on_type" begin + @test Kezdi.operates_on_type(:log, Number) + @test !Kezdi.operates_on_type(:log, String) + @test Kezdi.operates_on_type(:log, Missing) + @test !Kezdi.operates_on_type(:sum, Missing) + @test !Kezdi.operates_on_type(:sum, Missing) + @test !Kezdi.operates_on_type(:log, AbstractVector) + + @test_throws Exception Kezdi.operates_on_type(4, Missing) + + @test Kezdi.operates_on_missing(:log) + @test !Kezdi.operates_on_missing(:sum) + @test Kezdi.operates_on_vector(:mean) + @test !Kezdi.operates_on_vector(:log) + end + + @testset "split_assignment" begin + @test Kezdi.isassignment(:(x = 2)) + @test !Kezdi.isassignment(:(x == 2)) + @test Kezdi.split_assignment(:(x = 2)) == (:x, 2) + @test Kezdi.split_assignment(:(x = 2 + 3)) == (:x, :(2 + 3)) + @test Kezdi.split_assignment(:(x = f(y) + 1)) == (:x, :(f(y) + 1)) + end + + @testset "get_LHS" begin + @test Kezdi.get_LHS(:(x = 2)) == "x" + @test Kezdi.get_LHS(:(x = 2 + 3)) == "x" + @test Kezdi.get_LHS(:(x = f(y) + 1)) == "x" + end + + @testset "Operators" begin + @test Kezdi.is_operator(:+) + @test !Kezdi.is_operator(:x) + @test !Kezdi.is_operator(:log) + @test Kezdi.is_operator(:&&) + @test Kezdi.is_operator(:<=) + @test Kezdi.is_dotted_operator(:.+) + end + + @testset "Variable reference and function call" begin + @test Kezdi.is_variable_reference(:x) + @test !Kezdi.is_variable_reference(:(x.y)) + @test !Kezdi.is_variable_reference(:(log(x))) + @test Kezdi.is_function_call(:(log(x))) + @test Kezdi.is_function_call(:(log.(x))) + @test Kezdi.is_function_call(:(log.(x, y))) + @test Kezdi.is_function_call(:(Main.log(x))) + @test !Kezdi.is_function_call(:x) + end + + @testset "get_dot_parts" begin + @test Kezdi.get_dot_parts(:x) == [:x] + @test Kezdi.get_dot_parts(:(x.y)) == [:x, :y] + @test Kezdi.get_dot_parts(:(x.y.z)) == [:x, :y, :z] + end end diff --git a/test/commands.jl b/test/commands.jl index c15bc50..991c3a8 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -46,14 +46,23 @@ 
@test all(df2.y .=== [-1, -2, -3, missing]) end + @testset "_n and _N with @if" begin + df = DataFrame(x=1:4) + df2 = @with df @generate z = 9 @if _n >= 2 + @test all(df2.z .=== [missing, 9, 9, 9]) + df2 = @with df @generate z = _n + @test df2.z == [1, 2, 3, 4] + df2 = @with df @generate z = _n @if _n >= 2 + @test all(df2.z .=== [missing, 2, 3, 4]) + end + @testset "Lists-valued variables" begin df = DataFrame(x=[[1, 2], [3, 4], [5, 6], [7, 8]]) @test (@with df @generate x1 = getindex(x, 1)).x1 == [1, 3, 5, 7] @test (@with df @generate x2 = getindex(x, 2)).x2 == [2, 4, 6, 8] - end - - @testset "Error handling" begin - @test_throws Exception @with df @generate x = 1 + df = DataFrame(text = ["a,b", "c,d,e", "f"]) + df2 = @with df @generate n_terms = length.(split.(text, ",")) + @test df2.n_terms == [2, 3, 1] end end @@ -87,6 +96,19 @@ end @test eltype(df.x) == eltype(df3.x) end + @testset "Mixed types" begin + df = DataFrame(x=[1, 2, 3]) + @test eltype((@with df @replace x = 1.1 @if _n == 1).x) <: AbstractFloat + @test eltype((@with df @replace x = missing @if _n == 1).x) == Union{Missing, Int} + @test eltype((@with df @replace x = "a" @if _n == 1).x) == Any + df = DataFrame(x=[missing, 2, 3]) + @test eltype((@with df @replace x = 1 @if _n == 1).x) == Union{Int, Missing} + df = DataFrame(x=[1.1, 2, 3]) + @test eltype((@with df @replace x = 1 @if _n == 1).x) <: AbstractFloat + df = DataFrame(x=[1, 2, missing]) + @test eltype((@with df @replace x = 1.1 @if _n == 1).x) <: Union{T, Missing} where T <: AbstractFloat + end + @testset "Error handling" begin @test_throws Exception @with df @replace y = 1 end @@ -220,6 +242,18 @@ end df2 = @with df @egen y = maximum(x), by(group, s) @test df2.y == [3, 2, 3, 4, 6, 6] end + + @testset "_n and _N with @if" begin + df = DataFrame(x=1:6, g=[:a, :a, :a, :a, :b, :b]) + df2 = @with df @egen z = 9 @if _n >= 2, by(g) + @test all(df2.z .=== [missing, 9, 9, 9, missing, 9]) + df2 = @with df @egen z = _n, by(g) + @test df2.z == [1, 2, 3, 
4, 1, 2] + df2 = @with df @egen z = _N, by(g) + @test df2.z == [4, 4, 4, 4, 2, 2] + df2 = @with df @egen z = _n @if _n >= 2, by(g) + @test all(df2.z .=== [missing, 2, 3, 4, missing, 2]) + end end @testset "Keep if" begin @@ -313,6 +347,13 @@ end df2 = @with df @generate y = sum(x) @if x < 3 @test all(df2.y .=== [3, 3, missing, missing]) end + + @testset "Errors" begin + @test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y)) + @test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y x)) + @test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y = x z = w)) + @test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y, by(z))) + end end @testset "x in list" begin