codedthinking · korenmiklos · Nov 15, 2024 · Jul 21, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Kezdi"
 uuid = "48308a23-c29e-446c-b4c0-d9446a767439"
 authors = ["Miklos Koren <[email protected]>", "Gergely Attila Kiss <[email protected]>"]
-version = "0.5.2"
+version = "0.5.3"
 
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ It imports and reexports [CSV](https://csv.juliadata.org/stable/), [DataFrames](
 
 ## Getting started
 
-> `Kezdi.jl` is currently in beta. We have more than 400 unit tests and a large code coverage. [![Coverage](https://codecov.io/gh/codedthinking/Kezdi.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/codedthinking/Kezdi.jl) The package, however, is not guaranteed to be bug-free. If you encounter any issues, please report them as a [GitHub issue](https://github.com/codedthinking/Kezdi.jl/issues/new).
+> `Kezdi.jl` is currently in beta. We have more than 500 unit tests and high code coverage. [![Coverage](https://codecov.io/gh/codedthinking/Kezdi.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/codedthinking/Kezdi.jl) The package, however, is not guaranteed to be bug-free. If you encounter any issues, please report them as a [GitHub issue](https://github.com/codedthinking/Kezdi.jl/issues/new).
 >
 > If you would like to receive updates on the package, please star the repository on GitHub and sign up for [email notifications here](https://relentless-producer-1210.ck.page/62d7ebb237).
 
@@ -161,7 +161,7 @@ If you need to apply a function to individual elements of a column, you need to
 @generate n_words = length.(words)
 ```
 
-> Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `legth.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame.
+> Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `length.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame.
 
 ### The `@if` condition
 Almost every command can be followed by an `@if` condition that filters the data frame. The command will only be executed on the subset of rows for which the condition evaluates to `true`. The condition can use any combination of column names and functions.
@@ -221,4 +221,4 @@ Inspiration for the package came from [Tidier.jl](https://tidierorg.github.io/Ti
 
 The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/), [FreqTables.jl](https://github.com/nalimilan/FreqTables.jl) and [FixedEffectModels.jl](https://github.com/FixedEffects/FixedEffectModels.jl). The `@with` function relies on [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) by Julius Krumbiegel.
 
-The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/).
+The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/).
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -108,6 +108,10 @@ setdf
 @use
 ```
 
+```@docs
+@save
+```
+
 ```@docs
 getdf
 ```
@@ -132,6 +136,9 @@ getdf
 @clear
 ```
 
+```@docs
+@describe
+```
 ### Filtering columns and rows
 ```@docs
 @keep
@@ -154,6 +161,10 @@ getdf
 @replace
 ```
 
+```@docs
+@mvencode
+```
+
 ```@docs
 @egen
 ```
@@ -166,6 +177,17 @@ getdf
 @sort
 ```
 
+```@docs
+@order
+```
+
+```@docs
+@reshape
+```
+
+```@docs
+@append
+```
 
 ### Summarizing and analyzing data
 ```@docs
@@ -278,7 +300,7 @@ If you need to apply a function to individual elements of a column, you need to
 ```
 
 !!! tip "Note: `length(words)` vs `length.(words)`" 
-    Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `legth.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame.
+    Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `length.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame.
 
 ### The `@if` condition
 Almost every command can be followed by an `@if` condition that filters the data frame. The command will only be executed on the subset of rows for which the condition evaluates to `true`. The condition can use any combination of column names and functions.
@@ -470,4 +492,4 @@ Inspiration for the package came from [Tidier.jl](https://tidierorg.github.io/Ti
 
 The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/), [FreqTables.jl](https://github.com/nalimilan/FreqTables.jl) and [FixedEffectModels.jl](https://github.com/FixedEffects/FixedEffectModels.jl). The `@with` function relies on [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) by Julius Krumbiegel.
 
-The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/).
+The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/).
diff --git a/src/Kezdi.jl b/src/Kezdi.jl
@@ -2,7 +2,8 @@
 Kezdi.jl is a Julia package for data manipulation and analysis. It is inspired by Stata, but it is written in Julia, which makes it faster and more flexible. It is designed to be used in the Julia REPL, but it can also be used in Jupyter notebooks or in scripts.
 """
 module Kezdi
-export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe, @mvencode, @save, @append
+
+export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe, @mvencode, @save, @append, @reshape
 
 export getdf, setdf, display_and_return, keep_only_values, rowcount, distinct, cond, mvreplace, append
 

diff --git a/src/commands.jl b/src/commands.jl
@@ -1,6 +1,63 @@
 # use multiple dispatch to generate code 
 rewrite(command::Command) = rewrite(Val(command.command), command)
 
+function rewrite(::Val{:reshape_long}, command::Command)  # builds the expression for `@reshape long`; returns an escaped quote block
+    gc = generate_command(command; options=[:variables], allowed=[:i, :j])  # i and j are the only options passed as allowed
+    (; local_copy, target_df, setup, teardown, arguments, options) = gc  # NOTE(review): local_copy and options are unpacked but not used below
+    get_option(command, :i) isa Nothing && ArgumentError("i() is mandatory. Syntax is @reshape long y1 y2 ... i(var) j(var)") |> throw  # i() must be present
+    get_option(command, :j) isa Nothing && ArgumentError("j() is mandatory. Syntax is @reshape long y1 y2 ... i(var) j(var)") |> throw  # j() must be present
+    length(get_option(command, :j)) > 1 && ArgumentError("Only one variable can be specified for j() in @reshape long") |> throw  # j() takes exactly one variable
+    i = get_option(command, :i) |> replace_column_references  # splatted into the join keys below, so it may hold several entries
+    j = get_option(command, :j)[1] |> replace_column_references  # the single j() entry; used as the name of the new index column
+    vars = collect(arguments) |> replace_column_references  # the command arguments (the `y1 y2 ...` of the syntax message), used as name prefixes
+    var_lists = gensym()  # gensym'd names for temporaries inside the generated code
+    combined_df = gensym()
+    df_list = gensym()
+    quote  # the code below is returned as an expression; it is not executed here
+        $setup
+        $var_lists = [[Symbol(name) for name in names($target_df) if startswith(name, String(var))] for var in $vars]  # per prefix: all columns whose name starts with it. NOTE(review): prefix match only, so a matching column with a non-integer suffix makes the parse below throw
+        $df_list = [stack($target_df, list) for list in $var_lists]  # one stacked frame per prefix; the :variable and :value columns it produces are used below
+        for (n, df) in enumerate($df_list)  # n pairs each stacked frame with its prefix in vars
+            df[!, $j] = df[:, :variable] .|> x -> Base.parse(Int, x[length(String($vars[n]))+1:end])  # j column = integer parsed from the part of the column name after the prefix
+            rename!(df, :value => String($vars[n]))  # the stacked values take the prefix as their column name
+            select!(df, Not(:variable))  # the original column name is no longer needed once j is filled
+        end
+        $combined_df = $df_list[1]
+        for df in $df_list[2:end]  # fold the remaining per-prefix frames into the first one
+            $combined_df = innerjoin($combined_df, df, on=[$i..., $j], makeunique=true)  # join keys are the i columns plus j
+        end
+        $combined_df = select!($combined_df, collect(union(intersect(names.($df_list)...), String.($vars))))  # keep the columns common to every stacked frame plus the prefix-named value columns
+        $combined_df |> $teardown |> setdf  # apply teardown, then store the result via setdf
+    end |> esc
+end
+
+function rewrite(::Val{:reshape_wide}, command::Command)  # builds the expression for `@reshape wide`; returns an escaped quote block
+    gc = generate_command(command; options=[:variables], allowed=[:i, :j])  # i and j are the only options passed as allowed
+    (; local_copy, target_df, setup, teardown, arguments, options) = gc  # NOTE(review): local_copy and options are unpacked but not used below
+    get_option(command, :i) isa Nothing && ArgumentError("i() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw  # i() must be present
+    get_option(command, :j) isa Nothing && ArgumentError("j() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw  # j() must be present
+    length(get_option(command, :j)) > 1 && ArgumentError("Only one variable can be specified for j() in @reshape wide") |> throw  # j() takes exactly one variable
+    i = get_option(command, :i) |> replace_column_references  # passed to unstack and used as the join key below
+    j = get_option(command, :j)[1] |> replace_column_references  # the single j() entry; passed to unstack as the column whose values name the new columns
+    vars = collect(arguments) |> replace_column_references  # the command arguments (the `y1 y2 ...` of the syntax message): the value columns to spread
+    df_list = gensym()  # gensym'd names for temporaries inside the generated code
+    combined_df = gensym()
+    length(vars) > 1 ?
+    quote  # several variables: unstack each one separately, then join the results
+        $setup
+        $df_list = [unstack($target_df, $i, $j, var, renamecols=x -> Symbol(var, x)) for var in $vars]  # new column names are the variable name followed by the j value
+        $combined_df = $df_list[1]
+        for df in $df_list[2:end]  # fold the remaining per-variable frames into the first one
+            $combined_df = innerjoin($combined_df, df, on=$i)  # join key is the i columns only
+        end
+        $combined_df |> $teardown |> setdf  # apply teardown, then store the result via setdf
+    end |> esc :
+    quote  # otherwise: a single unstack on vars[1], no join needed
+        $setup
+        unstack($target_df, $i, $j, $vars[1], renamecols=x -> Symbol($vars[1], x)) |> $teardown |> setdf  # same renaming rule as the multi-variable branch
+    end |> esc
+end
+
 function rewrite(::Val{:rename}, command::Command)
     gc = generate_command(command; options=[:variables], allowed=[])
     (; local_copy, target_df, setup, teardown, arguments, options) = gc
@@ -57,7 +114,7 @@ function rewrite(::Val{:keep}, command::Command)
     cols = isempty(command.arguments) ? :(:) : :(collect($command.arguments))
     quote
         $setup
-        $target_df[!, $cols]  |> $teardown |> setdf
+        $target_df[!, $cols] |> $teardown |> setdf
     end |> esc
 end
 
@@ -69,7 +126,7 @@ function rewrite(::Val{:drop}, command::Command)
             $setup
             select!($local_copy, Not(collect($(command.arguments)))) |> $teardown |> setdf
         end |> esc
-    end 
+    end
     bitmask = build_bitmask(local_copy, command.condition)
     return quote
         $setup
@@ -112,7 +169,7 @@ function rewrite(::Val{:sort}, command::Command)
 end
 
 function rewrite(::Val{:order}, command::Command)
-    gc = generate_command(command; options = [:variables, :nofunction], allowed=[:desc, :last, :after, :before , :alphabetical])
+    gc = generate_command(command; options=[:variables, :nofunction], allowed=[:desc, :last, :after, :before, :alphabetical])
     (; local_copy, target_df, setup, teardown, arguments, options) = gc
     desc = :desc in get_top_symbol.(options)
     last = :last in get_top_symbol.(options)
@@ -129,7 +186,7 @@ function rewrite(::Val{:order}, command::Command)
     if desc && !alphabetical
         ArgumentError("Cannot use `desc` without `alphabetical` option in @order") |> throw
     end
-    
+
     if before
         var = get_option(command, :before)
     elseif after
@@ -150,7 +207,7 @@ function rewrite(::Val{:order}, command::Command)
         $setup
         $cols = [Symbol(col) for col in names($target_df) if Symbol(col) ∉ $target_cols]
         if $alphabetical
-            $cols = sort($cols, rev = $desc)
+            $cols = sort($cols, rev=$desc)
         end
 
         if $after
@@ -173,7 +230,7 @@ function rewrite(::Val{:order}, command::Command)
             $cols = pushfirst!($cols, $target_cols...)
         end
 
-        $target_df[!, $cols]|> $teardown
+        $target_df[!, $cols] |> $teardown
     end |> esc
 end
 
@@ -193,7 +250,7 @@ function rewrite(::Val{:mvencode}, command::Command)
     coltype = gensym()
     quote
         $setup
-        $valtype = typeof($value)   
+        $valtype = typeof($value)
         for col in $cols
             $coltype = eltype($local_copy[.!($bitmask), col])
             if $valtype != $coltype
@@ -204,4 +261,4 @@ function rewrite(::Val{:mvencode}, command::Command)
         $local_copy[$bitmask, $cols] = mvreplace.($local_copy[$bitmask, $cols], $value)
         $local_copy |> $teardown
     end |> esc
-end
+end
diff --git a/src/functions.jl b/src/functions.jl
@@ -4,19 +4,19 @@ save(fname::AbstractString) = writestat(fname, getdf())
 function append(fname::AbstractString)
     ispath(fname) || ArgumentError("File $fname does not exist.") |> throw
     _, ext = splitext(fname)
-    if ext in [".dta", ".sav", ".por", ".sas7bdat", ".xpt"]
+    if ext in [".dta", ".sav", ".por", ".sas7bdat", ".xpt"]  # extensions handed to readstat (".sav", not ".save"); anything else is read as CSV
         df = readstat(fname) |> DataFrame
     else
         df = CSV.read(fname, DataFrame)
     end
     cdf = getdf()
     cdf, df = create_cols(cdf, df)
-    df = vcat(cdf,df)
+    df = vcat(cdf, df)  # rows read from the file are placed after the rows of the current global data frame
     setdf(df)
 end
 
 function append(df::DataFrame)
-    cdf, df  = create_cols(getdf(), df)
+    cdf, df = create_cols(getdf(), df)  # create_cols returns the (current, new) pair that is concatenated below; presumably it aligns their columns so vcat succeeds (TODO confirm)
     setdf(vcat(cdf, df))
 end
 
@@ -49,7 +49,7 @@ getdf() = _global_dataframe
 
 Set the global data frame.
 """
-setdf(df::Union{AbstractDataFrame, Nothing}) = global _global_dataframe = isnothing(df) ? nothing : copy(df)
+setdf(df::Union{AbstractDataFrame,Nothing}) = global _global_dataframe = isnothing(df) ? nothing : copy(df)  # stores a copy of df (or nothing) in the module-level _global_dataframe
 display_and_return(x) = (display(x); x)
 
 """
@@ -80,7 +80,7 @@ function summarize(df::AbstractDataFrame, column::Symbol)::Summarize
     skewness_val = skewness(data)
     # julia reports excess kurtosis, so we add 3 to get the kurtosis
     kurtosis_val = 3.0 + kurtosis(data)
-    
+
     percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
     percentiles_values = quantile(data, percentiles ./ 100; alpha=0.5, beta=0.5)
 
@@ -149,4 +149,9 @@ function _describe(df::AbstractDataFrame, cols::Vector{Symbol}=Symbol[])
     table[!, [:variable, :eltype]]
 end
 
-mvreplace(x, y) = ismissing(x) ? y : x
+"""
+    mvreplace(x, y)
+
+Return `y` if `x` is `missing`, otherwise return `x`. The function is scalar: it tests `x` itself with `ismissing`, so apply it element-wise by broadcasting, `mvreplace.(xs, y)`. It is equivalent to `ismissing(x) ? y : x`; the ternary operator itself cannot be broadcast, which is why this is a named function.
+"""
+mvreplace(x, y) = ismissing(x) ? y : x