Skip to content

Commit

Permalink
Merge pull request #159 from codedthinking/0.4-bugfixes
Browse files Browse the repository at this point in the history
0.4 bugfixes
  • Loading branch information
gergelyattilakiss authored Jul 15, 2024
2 parents 88bb004 + 21e3303 commit 26aae12
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 50 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Kezdi"
uuid = "48308a23-c29e-446c-b4c0-d9446a767439"
authors = ["Miklos Koren <[email protected]>", "Gergely Attila Kiss <[email protected]>"]
version = "0.4.7"
version = "0.4.8"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
9 changes: 3 additions & 6 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,20 +86,17 @@ See the benchmarking code for [Stata](https://github.com/codedthinking/Kezdi.jl/
The function can operate on individual elements,
```julia
get_make(text) = split(text, " ")[1]
@generate Make = Main.get_make(Model)
@generate Make = get_make(Model)
```
or on the entire column:
```julia
function geometric_mean(x::AbstractVector)
function geometric_mean(x::Vector)
n = length(x)
return exp(sum(log.(x)) / n)
end
@collapse geom_NPG = Main.geometric_mean(MPG), by(Cylinders)
@collapse geom_NPG = geometric_mean(MPG), by(Cylinders)
```

!!! tip "Note: `Main.` prefix"
If you define a function in your own code, you need to prefix the function name with `Main.` to use it in other commands. To make use of [Automatic vectorization](@ref), make sure to give the function a vector argument type.

## Commands

### Setting and inspecting the global DataFrame
Expand Down
64 changes: 64 additions & 0 deletions goals.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,67 @@ An in-place version of `@with!` should do everything in place. This can mean all
- non-standard evaluation makes it hard to wrap Kezdi.jl code in functions
6. For loops
- implement `scalars()` and automatic expansion of locals in context

# 2024-07-12 In-flight debugging session
```julia
julia> using Kezdi

julia> module MyModule
myfunc(x) = 2x
end
Main.MyModule

julia> df = DataFrame(x = 1:10)
10×1 DataFrame
Row │ x
│ Int64
─────┼───────
11
22
33
44
55
66
77
88
99
1010

julia> @with df @generate y = MyModule.myfunc(x)
10×2 DataFrame
Row │ x y
│ Int64 Int64
─────┼──────────────
11 2
22 4
33 6
44 8
55 10
66 12
77 14
88 16
99 18
1010 20
```

How about an aggregator function?

```julia
julia> module MyModule
myfunc(x) = 2x
myaggreg(v::Vector) = sum(x.^2)
end
WARNING: replacing module MyModule.
Main.MyModule

julia> @with df @egen y = MyModule.myaggreg(x)
┌ Warning: transform!(var"##237", [:x] => (((x,)->(passmissing(MyModule.myaggreg)).(x)) => $(QuoteNode("y"))))
└ @ Kezdi ~/Tresorit/Mac/code/julia/Kezdi.jl/src/commands.jl:100
ERROR: MethodError: no method matching myaggreg(::Int64)

Closest candidates are:
myaggreg(::Vector)
@ Main.MyModule REPL[8]:3
```

This means it was vectorized at compile time, but it is found at runtime.
58 changes: 22 additions & 36 deletions src/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ function generate_command(command::Command; options=[], allowed=[])
teardown = Expr[]
process = (x -> x)
tdfunction = gensym()
# this points to the DataFrame that the command will return to the user
target_df = df2

given_options = get_top_symbol.(command.options)
Expand All @@ -31,18 +32,26 @@ function generate_command(command::Command; options=[], allowed=[])
push!(setup, :(local $df2 = copy(getdf())))
variables_condition = (:ifable in options) ? vcat(extract_variable_references(command.condition)...) : Symbol[]
variables_RHS = (:variables in options) ? vcat(extract_variable_references.(command.arguments)...) : Symbol[]
variables = vcat(variables_condition, variables_RHS)
if :replace_variables in options
process(x) = replace_variable_references(sdf, x)
end
if :vectorize in options
process = vectorize_function_calls process
end
if :_n in variables_condition
push!(setup, :(transform!($df2, eachindex => :_n)))
# where should special variables be created?
# when grouped by, then counting rows should be done on the grouped data
_n_goes_to = df2
if :by in given_options && (:_n in variables || :_N in variables)
by_cols = get_by(command)
_n_goes_to = :(groupby($df2, $by_cols))
end
if :_n in variables
push!(setup, :(transform!($_n_goes_to, eachindex => :_n)))
push!(teardown, :(select!($df2, Not(:_n))))
end
if :_N in variables_condition
push!(setup, :(transform!($df2, nrow => :_N)))
if :_N in variables
push!(setup, :(transform!($_n_goes_to, nrow => :_N)))
push!(teardown, :(select!($df2, Not(:_N))))
end
if :ifable in options
Expand All @@ -60,14 +69,6 @@ function generate_command(command::Command; options=[], allowed=[])
by_cols = get_by(command)
push!(setup, :(local $gdf = groupby($sdf, $by_cols)))
end
if :_n in variables_RHS
push!(setup, :(transform!($target_df, eachindex => :_n)))
push!(teardown, :(select!($target_df, Not(:_n))))
end
if :_N in variables_RHS
push!(setup, :(transform!($target_df, nrow => :_N)))
push!(teardown, :(select!($target_df, Not(:_N))))
end
push!(setup, quote
function $tdfunction(x)
$(Expr(:block, teardown...))
Expand Down Expand Up @@ -244,9 +245,10 @@ function vectorize_function_calls(expr::Any)
end
end

# A bare symbol has no dots in it, so it is its own single part.
function get_dot_parts(ex::Symbol)
    return Symbol[ex]
end
function get_dot_parts(ex::Expr)
is_dot_reference(ex) || error("Expected a dot reference, got $ex")
parts = []
parts = Symbol[]
while is_dot_reference(ex)
push!(parts, ex.args[2].value)
ex = ex.args[1]
Expand Down Expand Up @@ -274,30 +276,14 @@ isalphanumeric(c::AbstractChar) = isletter(c) || isdigit(c) || c == '_'
# A string qualifies when every one of its characters passes the character-level check.
function isalphanumeric(str::AbstractString)
    for ch in str
        isalphanumeric(ch) || return false
    end
    return true
end

# True only for an `Expr` with head `=` and exactly two arguments (left and right side).
function isassignment(expr::Any)
    expr isa Expr || return false
    return expr.head == :(=) && length(expr.args) == 2
end
# Report whether the function that `expr` evaluates to has at least one method
# found by `methodswith` for `Vector` (the lookup is made with `supertypes=true`).
function operates_on_vector(expr::Any)
try
length(methodswith(Vector, eval(expr); supertypes=true)) > 0
catch e
# A name that cannot be resolved is reported as "does not operate on vectors".
if isa(e, UndefVarError)
return false
else
# Any other failure is passed on to the caller.
rethrow(e)
end
end
end
# `ismissing` is accepted directly by name; everything else is decided by the
# method lookup for `Missing`.
function operates_on_missing(expr::Any)
    expr === :ismissing && return true
    return operates_on_type(expr, Missing)
end
# Delegates to the generic method lookup, asking about `Vector`.
function operates_on_vector(expr::Any)
    return operates_on_type(expr, Vector)
end

function operates_on_missing(expr::Any)
expr isa Symbol && expr == :ismissing && return true
function operates_on_type(expr::Any, T::Type)
try
length(methodswith(Missing, eval(expr); supertypes=true)) > 0
catch e
if isa(e, UndefVarError)
return false
else
rethrow(e)
end
return length(methodswith(T, Main.eval(expr); supertypes=true)) > 0
catch ee
!isa(ee, UndefVarError) && rethrow(ee)
return false
end
end

# Broadcast over the first argument only; the remaining positional and keyword
# arguments are passed unchanged to every call.
# For example, [1, 2, 3] in [2, 3] should evaluate to [false, true, true]
function BFA(f::Function, xs, args...; kwargs...)
    apply_to_first(x) = f(x, args...; kwargs...)
    return broadcast(apply_to_first, xs)
end
7 changes: 4 additions & 3 deletions src/commands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ function rewrite(::Val{:replace}, command::Command)
else
$setup
eltype_RHS = $RHS isa AbstractVector ? eltype($RHS) : typeof($RHS)
if eltype_RHS != eltype($target_df[!, $target_column])
local $third_vector = Vector{eltype_RHS}(undef, nrow($local_copy))
eltype_LHS = eltype($local_copy[.!$bitmask, $target_column])
if eltype_RHS != eltype_LHS
local $third_vector = Vector{promote_type(eltype_LHS, eltype_RHS)}(undef, nrow($local_copy))
$third_vector[$bitmask] .= $RHS
$third_vector[.!$bitmask] .= $local_copy[!, $target_column][.!$bitmask]
$third_vector[.!$bitmask] .= $local_copy[.!$bitmask, $target_column]
$local_copy[!, $target_column] = $third_vector
else
$target_df[!, $target_column] .= $RHS
Expand Down
78 changes: 78 additions & 0 deletions test/codegen.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Fixture module for the tests that call functions through a module-qualified name.
module MyModule
# Element-wise function with no dedicated `Missing` method.
myfunc(x) = 2x
# Aggregator: receives a whole vector and returns the sum of its squared elements.
# It must use its own argument `v`; the earlier body referenced an undefined `x`.
myaggreg(v::Vector) = sum(v .^ 2)
# Function with an explicit method for `missing`.
mymiss(::Missing) = missing
mymiss(x) = 3x
end

@testset "Replace variable references" begin
@test_expr replace_variable_references(:(x + y + f(z) - g.(x))) == :(:x + :y + f(:z) - g.(:x))
@test_expr replace_variable_references(:(f(x, <=))) == :(f(:x, <=))
Expand Down Expand Up @@ -39,4 +46,75 @@ end
@testset "Unknown functions are passed through `passmissing`" begin
@test_expr vectorize_function_calls(:(y = Dates.year(x))) == :(y = (passmissing(Dates.year)).(x))
end
@testset "Functions in other modules" begin
using .MyModule
@test vectorize_function_calls(:(MyModule.myfunc(x))) == :((passmissing(MyModule.myfunc)).(x))
@test vectorize_function_calls(:(MyModule.myaggreg(x))) == :(MyModule.myaggreg(keep_only_values(x)))
@test vectorize_function_calls(:(MyModule.mymiss(x))) == :(MyModule.mymiss.(x))
end

@testset "Functions in other modules with DNV" begin
using .MyModule
@test vectorize_function_calls(:(DNV(MyModule.myfunc(x)))) == :(MyModule.myfunc(x))
@test vectorize_function_calls(:(DNV(MyModule.myaggreg(x)))) == :(MyModule.myaggreg(x))
@test vectorize_function_calls(:(DNV(MyModule.mymiss(x)))) == :(MyModule.mymiss(x))
end
end

@testset "Helper functions" begin
@testset "operates_on_type" begin
@test Kezdi.operates_on_type(:log, Number)
@test !Kezdi.operates_on_type(:log, String)
@test Kezdi.operates_on_type(:log, Missing)
@test !Kezdi.operates_on_type(:sum, Missing)
@test !Kezdi.operates_on_type(:sum, Missing)
@test !Kezdi.operates_on_type(:log, AbstractVector)

@test_throws Exception Kezdi.operates_on_type(4, Missing)

@test Kezdi.operates_on_missing(:log)
@test !Kezdi.operates_on_missing(:sum)
@test Kezdi.operates_on_vector(:mean)
@test !Kezdi.operates_on_vector(:log)
end

@testset "split_assignment" begin
@test Kezdi.isassignment(:(x = 2))
@test !Kezdi.isassignment(:(x == 2))
@test Kezdi.split_assignment(:(x = 2)) == (:x, 2)
@test Kezdi.split_assignment(:(x = 2 + 3)) == (:x, :(2 + 3))
@test Kezdi.split_assignment(:(x = f(y) + 1)) == (:x, :(f(y) + 1))
end

@testset "get_LHS" begin
@test Kezdi.get_LHS(:(x = 2)) == "x"
@test Kezdi.get_LHS(:(x = 2 + 3)) == "x"
@test Kezdi.get_LHS(:(x = f(y) + 1)) == "x"
end

@testset "Operators" begin
@test Kezdi.is_operator(:+)
@test !Kezdi.is_operator(:x)
@test !Kezdi.is_operator(:log)
@test Kezdi.is_operator(:&&)
@test Kezdi.is_operator(:<=)
@test Kezdi.is_dotted_operator(:.+)
end

@testset "Variable reference and function call" begin
@test Kezdi.is_variable_reference(:x)
@test !Kezdi.is_variable_reference(:(x.y))
@test !Kezdi.is_variable_reference(:(log(x)))
@test Kezdi.is_function_call(:(log(x)))
@test Kezdi.is_function_call(:(log.(x)))
@test Kezdi.is_function_call(:(log.(x, y)))
@test Kezdi.is_function_call(:(Main.log(x)))
@test !Kezdi.is_function_call(:x)
end

@testset "get_dot_parts" begin
@test Kezdi.get_dot_parts(:x) == [:x]
@test Kezdi.get_dot_parts(:(x.y)) == [:x, :y]
@test Kezdi.get_dot_parts(:(x.y.z)) == [:x, :y, :z]
end
end
49 changes: 45 additions & 4 deletions test/commands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,23 @@
@test all(df2.y .=== [-1, -2, -3, missing])
end

@testset "_n and _N with @if" begin
df = DataFrame(x=1:4)
df2 = @with df @generate z = 9 @if _n >= 2
@test all(df2.z .=== [missing, 9, 9, 9])
df2 = @with df @generate z = _n
@test df2.z == [1, 2, 3, 4]
df2 = @with df @generate z = _n @if _n >= 2
@test all(df2.z .=== [missing, 2, 3, 4])
end

@testset "Lists-valued variables" begin
df = DataFrame(x=[[1, 2], [3, 4], [5, 6], [7, 8]])
@test (@with df @generate x1 = getindex(x, 1)).x1 == [1, 3, 5, 7]
@test (@with df @generate x2 = getindex(x, 2)).x2 == [2, 4, 6, 8]
end

@testset "Error handling" begin
@test_throws Exception @with df @generate x = 1
df = DataFrame(text = ["a,b", "c,d,e", "f"])
df2 = @with df @generate n_terms = length.(split.(text, ","))
@test df2.n_terms == [2, 3, 1]
end
end

Expand Down Expand Up @@ -87,6 +96,19 @@ end
@test eltype(df.x) == eltype(df3.x)
end

@testset "Mixed types" begin
df = DataFrame(x=[1, 2, 3])
@test eltype((@with df @replace x = 1.1 @if _n == 1).x) <: AbstractFloat
@test eltype((@with df @replace x = missing @if _n == 1).x) == Union{Missing, Int}
@test eltype((@with df @replace x = "a" @if _n == 1).x) == Any
df = DataFrame(x=[missing, 2, 3])
@test eltype((@with df @replace x = 1 @if _n == 1).x) == Union{Int, Missing}
df = DataFrame(x=[1.1, 2, 3])
@test eltype((@with df @replace x = 1 @if _n == 1).x) <: AbstractFloat
df = DataFrame(x=[1, 2, missing])
@test eltype((@with df @replace x = 1.1 @if _n == 1).x) <: Union{T, Missing} where T <: AbstractFloat
end

@testset "Error handling" begin
@test_throws Exception @with df @replace y = 1
end
Expand Down Expand Up @@ -220,6 +242,18 @@ end
df2 = @with df @egen y = maximum(x), by(group, s)
@test df2.y == [3, 2, 3, 4, 6, 6]
end

@testset "_n and _N with @if" begin
df = DataFrame(x=1:6, g=[:a, :a, :a, :a, :b, :b])
df2 = @with df @egen z = 9 @if _n >= 2, by(g)
@test all(df2.z .=== [missing, 9, 9, 9, missing, 9])
df2 = @with df @egen z = _n, by(g)
@test df2.z == [1, 2, 3, 4, 1, 2]
df2 = @with df @egen z = _N, by(g)
@test df2.z == [4, 4, 4, 4, 2, 2]
df2 = @with df @egen z = _n @if _n >= 2, by(g)
@test all(df2.z .=== [missing, 2, 3, 4, missing, 2])
end
end

@testset "Keep if" begin
Expand Down Expand Up @@ -313,6 +347,13 @@ end
df2 = @with df @generate y = sum(x) @if x < 3
@test all(df2.y .=== [3, 3, missing, missing])
end

@testset "Errors" begin
@test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y))
@test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y x))
@test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y = x z = w))
@test_throws Exception Main.eval(:(@with DataFrame(a=1:10) @generate y, by(z)))
end
end

@testset "x in list" begin
Expand Down

0 comments on commit 26aae12

Please sign in to comment.