diff --git a/Project.toml b/Project.toml index 891b9849..6aa1ad7b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Survey" uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" authors = ["Ayush Patnaik "] -version = "0.3.0" +version = "0.4.0" [deps] AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67" @@ -9,6 +9,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/README.md b/README.md index b64f53ac..e34afe27 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ to compute the standard errors. ```julia julia> bootsrs = bootweights(srs; replicates=1000) -ReplicateDesign: +ReplicateDesign{BootstrapReplicates}: data: 200×1047 DataFrame strata: none cluster: none diff --git a/docs/Project.toml b/docs/Project.toml index f11e8201..d40c66f9 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,6 +2,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Survey = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c" diff --git a/docs/make.jl b/docs/make.jl index 3ffa66f2..6da6931f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -23,6 +23,7 @@ makedocs(; "Replicate weights" => "man/replicate.md", "Plotting" => "man/plotting.md", "Comparison with other survey analysis tools" => "man/comparisons.md", + "Generalised linear models" => "man/glm.md", ], "API reference" => "api.md", ], diff --git a/docs/src/api.md b/docs/src/api.md index 629d118d..01c7451c 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -17,12 +17,15 @@ JackknifeReplicates load_data bootweights jackknifeweights -variance +Survey.standarderror mean total quantile ratio +glm plot boxplot hist +Survey.sturges +Survey.freedman_diaconis ``` diff --git a/docs/src/man/glm.md b/docs/src/man/glm.md new file mode 100644 index 00000000..b5ad35f0 --- /dev/null +++ b/docs/src/man/glm.md @@ -0,0 +1,99 @@ +# Generalized Linear Models in Survey +The `glm()` function in the Julia Survey package is used to fit generalized linear models (GLMs) to survey data. It incorporates survey design information, such as sampling weights, stratification, and clustering, to produce valid estimates and standard errors that account for the type of survey design. + +As of June 2023, the [GLM.jl documentation](https://juliastats.org/GLM.jl/stable/) lists the supported distribution families and their link functions as: +```txt +Bernoulli (LogitLink) +Binomial (LogitLink) +Gamma (InverseLink) +InverseGaussian (InverseSquareLink) +NegativeBinomial (NegativeBinomialLink, often used with LogLink) +Normal (IdentityLink) +Poisson (LogLink) +``` + +Refer to the GLM.jl documentation for more information about the GLM package. + +## Fitting a GLM to a Survey Design object + +You can fit a GLM to a Survey Design object the same way you would fit it to a regular data frame. The only difference is that you need to specify the survey design object as the second argument to the `glm()` function. + +```julia +using Survey +apisrs = load_data("apisrs") + +# Simple random sample survey +srs = SurveyDesign(apisrs, weights = :pw) + +# Survey stratified by stype +dstrat = SurveyDesign(apistrat, strata = :stype, weights = :pw) + +# Survey clustered by dnum +dclus1 = SurveyDesign(apiclus1, clusters = :dnum, weights = :pw) +``` + +Once you have the survey design object, you can fit a GLM using the `glm()` function. Specify the formula for the model and the distribution family. + +The `glm()` function supports all distribution families supported by GLM.jl, i.e. Bernoulli, Binomial, Gamma, Geometric, InverseGaussian, NegativeBinomial, Normal, and Poisson. + +For example, to fit a GLM with a Bernoulli distribution and a Logit link function to the `srs` survey design object we created above: +```julia +formula = @formula(api00 ~ api99) +my_glm = glm(formula, srs, family = Normal()) + +# View the coefficients and standard errors +my_glm.Coefficients +my_glm.SE +``` + +## Examples + +The examples below use the `api` datasets, which contain survey data collected about California schools. The datasets are included in the Survey.jl package and can be loaded by calling `load_data("name_of_dataset")`. + +### Bernoulli with Logit Link + +A school being eligible for the awards program (`awards`) is a binary outcome (0 or 1). Let's assume it follows a Bernoulli distribution. Suppose we want to predict `awards` based on the percentage of students eligible for subsidized meals (`meals`) and the percentage of English Language Learners (`ell`). We can fit this GLM using the code below: + +```julia +using Survey +apisrs = load_data("apisrs") +srs = SurveyDesign(apisrs, weights = :pw) + +# Convert yes/no to 1/0 +apisrs.awards = ifelse.(apisrs.awards .== "Yes", 1, 0) + +# Fit the model +model = glm(@formula(awards ~ meals + ell), apisrs, Bernoulli(), LogitLink()) +``` + +### Poisson with Log Link + +Let us assume that the number of students tested (`api_stu`) follows a Poisson distribution, which models the number of successes out of a fixed number of trials. Suppose we want to predict the number of students tested based on the percentage of students eligible for subsidized meals (`meals`) and the percentage of English Language Learners (`ell`). We can fit this GLM using the code below: + +```julia +using Survey +apisrs = load_data("apisrs") +srs = SurveyDesign(apisrs, weights = :pw) + +# Rename api.stu to api_stu +rename!(apisrs, Symbol("api.stu") => :api_stu) + +# Fit the model +model = glm(@formula(api_stu ~ meals + ell), apisrs, Poisson(), LogLink()) +``` + +### Gamma with Inverse Link + +Let us assume that the average parental education level (`avg_ed`) follows a Gamma distribution, which is suitable for modeling continuous, positive-valued variables with a skewed distribution. Suppose we want to predict the average parental education level based on the percentage of students eligible for subsidized meals (`meals`) and the percentage of English Language Learners (`ell`). We can fit this GLM using the code below: + +```julia +using Survey +apisrs = load_data("apisrs") +srs = SurveyDesign(apisrs, weights = :pw) + +# Rename api.stu to api_stu +rename!(apisrs, Symbol("avg.ed") => :avg_ed) + +# Fit the model +model = glm(@formula(avg_ed ~ meals + ell), apisrs, Gamma(), InverseLink()) +``` \ No newline at end of file diff --git a/docs/src/man/replicate.md b/docs/src/man/replicate.md index bab5b06d..14b56e7e 100644 --- a/docs/src/man/replicate.md +++ b/docs/src/man/replicate.md @@ -4,9 +4,9 @@ Replicate weights are a method for estimating the standard errors of survey stat The basic idea behind replicate weights is to create multiple versions of the original sample weights, each with small, randomly generated perturbations. The multiple versions of the sample weights are then used to calculate the survey statistic of interest, such as the mean or total, on multiple replicate samples. The variance of the survey statistic is then estimated by computing the variance across the replicate samples. -Currently, the Rao-Wu bootstrap[^1] is the only method in the package for generating replicate weights. +Currently, the Rao-Wu bootstrap[^1] and the Jackknife [^2] are the only methods in the package for generating replicate weights. In the future, the package will support additional types of inference methods, which will be passed when creating a `ReplicateDesign` object. -The `bootweights` function of the package can be used to generate a `ReplicateDesign` from a `SurveyDesign` +The `bootweights` function of the package can be used to generate a `ReplicateDesign` using the Rao-Wu bootstrap method from a `SurveyDesign`. For example: ```@repl bootstrap using Survey @@ -15,7 +15,16 @@ dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw) bstrat = bootweights(dstrat; replicates = 10) ``` -For each replicate, the `DataFrame` of `ReplicateDesign` has an additional column. The of the column is `replicate_` followed by the replicate number. +The `jackknifeweights` function of the package can be used to generate a `ReplicateDesign` using the Jackknife method from a `SurveyDesign`. +For example: +```@repl bootstrap +using Survey +apistrat = load_data("apistrat") +dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw) +bstrat = jackknifeweights(dstrat; replicates = 10) +``` + +For each replicate, the `DataFrame` of `ReplicateDesign` has an additional column. The name of the column is `replicate_` followed by the replicate number. ```@repl bootstrap names(bstrat.data) @@ -38,4 +47,5 @@ For each replicate weight, the statistic is calculated using it instead of the w ## References -[^1]: [Rust, Keith F., and J. N. K. Rao. "Variance estimation for complex surveys using replication techniques." Statistical methods in medical research 5.3 (1996): 283-310.](https://journals.sagepub.com/doi/abs/10.1177/096228029600500305?journalCode=smma) \ No newline at end of file +[^1]: [Rust, Keith F., and J. N. K. Rao. "Variance estimation for complex surveys using replication techniques." Statistical methods in medical research 5.3 (1996): 283-310.](https://journals.sagepub.com/doi/abs/10.1177/096228029600500305?journalCode=smma) +[^2]: [Miller, Rupert G. “The Jackknife--A Review.” Biometrika 61, no. 1 (1974): 1–15. https://doi.org/10.2307/2334280.](https://www.jstor.org/stable/2334280) \ No newline at end of file diff --git a/src/Survey.jl b/src/Survey.jl index c0b3db27..90f3edbc 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -1,8 +1,9 @@ module Survey using DataFrames +import DataFrames: rename! using Statistics -import Statistics: quantile +import Statistics: std, quantile using StatsBase import StatsBase: mean, quantile using CSV @@ -12,6 +13,8 @@ using AlgebraOfGraphics using CategoricalArrays using Random using Missings +using GLM +import GLM: @formula, glm include("SurveyDesign.jl") include("bootstrap.jl") @@ -26,17 +29,19 @@ include("boxplot.jl") include("show.jl") include("ratio.jl") include("by.jl") +include("reg.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign export BootstrapReplicates, JackknifeReplicates export dim, colnames, dimnames -export mean, total, quantile +export mean, total, quantile, std export plot export hist, sturges, freedman_diaconis export boxplot export bootweights export ratio export jackknifeweights, variance +export @formula, glm end diff --git a/src/SurveyDesign.jl b/src/SurveyDesign.jl index 5b7df85a..ac699c05 100644 --- a/src/SurveyDesign.jl +++ b/src/SurveyDesign.jl @@ -84,10 +84,8 @@ struct SurveyDesign <: AbstractSurveyDesign else data[!, sampsize_labels] = fill(length(unique(data[!, cluster])), (nrow(data),)) end - if isa(popsize, Symbol) - weights_labels = :_weights - data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] - elseif isa(weights, Symbol) + + if isa(weights, Symbol) if !(typeof(data[!, weights]) <: Vector{<:Real}) throw( ArgumentError( @@ -100,6 +98,9 @@ struct SurveyDesign <: AbstractSurveyDesign popsize = :_popsize data[!, popsize] = data[!, sampsize_labels] .* data[!, weights_labels] end + elseif isa(popsize, Symbol) + weights_labels = :_weights + data[!, weights_labels] = data[!, popsize] ./ data[!, sampsize_labels] else # neither popsize nor weights given weights_labels = :_weights diff --git a/src/bootstrap.jl b/src/bootstrap.jl index ee304aee..81b82bf3 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -1,6 +1,10 @@ """ -Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign{BootstrapReplicates}` which has additional columns for replicate weights. +Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign{BootstrapReplicates}` which has additional columns for replicate weights. +The replicate weight for replicate ``r`` is computed using the formula ``w_{i}(r) = w_i \\times \\frac{n_h}{n_h - 1} m_{hj}(r)`` for observation ``i`` in psu ``j`` of stratum ``h``. + +In the formula above, ``w_i`` is the original weight for observation ``i``, ``n_h`` is the number of psus in stratum ``h``, and ``m_{hj}(r)`` is the number of psus in stratum ``h`` that are selected in replicate ``r``. + ```jldoctest julia> using Random @@ -22,6 +26,9 @@ type: bootstrap replicates: 1000 ``` + +# Reference +pg 385, Section 9.3.3 Bootstrap - Sharon Lohr, Sampling Design and Analysis (2010) """ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234)) stratified = groupby(design.data, design.strata) @@ -54,10 +61,21 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis end """ - variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) +standarderror(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{BootstrapReplicates}, args...; kwargs...) + +Compute the standard error of the estimated mean using the bootstrap method. +# Arguments +- `x::Union{Symbol, Vector{Symbol}}`: Symbol or vector of symbols representing the variable(s) for which the mean is estimated. +- `func::Function`: Function used to calculate the mean. +- `design::ReplicateDesign{BootstrapReplicates}`: Replicate design object. +- `args...`: Additional arguments to be passed to the function. +- `kwargs...`: Additional keyword arguments. -Use replicate weights to compute the standard error of the estimated mean using the bootstrap method. The variance is calculated using the formula +# Returns +- `df`: DataFrame containing the estimated mean and its standard error. + +The standard error is calculated using the formula ```math \\hat{V}(\\hat{\\theta}) = \\dfrac{1}{R}\\sum_{i = 1}^R(\\theta_i - \\hat{\\theta})^2 @@ -65,34 +83,52 @@ Use replicate weights to compute the standard error of the estimated mean using where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimator computed using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimator computed using the original weights. -```jldoctest -julia> using Survey, StatsBase; - -julia> apiclus1 = load_data("apiclus1"); - -julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); - -julia> bclus1 = dclus1 |> bootweights; +# Examples -julia> weightedmean(x, y) = mean(x, weights(y)); +```jldoctest; setup = :(using Survey, StatsBase, DataFrames; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) +julia> my_mean(df::DataFrame, column, weights) = StatsBase.mean(df[!, column], StatsBase.weights(df[!, weights])); -julia> variance(:api00, weightedmean, bclus1) +julia> Survey.standarderror(:api00, my_mean, bclus1) 1×2 DataFrame Row │ estimator SE │ Float64 Float64 ─────┼──────────────────── 1 │ 644.169 23.4107 - ``` """ -function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates}) - θ̂ = func(design.data[!, x], design.data[!, design.weights]) - θ̂t = [ - func(design.data[!, x], design.data[!, "replicate_"*string(i)]) for - i = 1:design.replicates +function standarderror(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{BootstrapReplicates}, args...; kwargs...) + + # Compute the estimators + θs = func(design.data, x, design.weights, args...; kwargs...) + + # Compute the estimators for each replicate + θts = [ + func(design.data, x, "replicate_" * string(i), args...; kwargs...) for i in 1:design.replicates ] - variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates - return DataFrame(estimator = θ̂, SE = sqrt(variance)) + + # Convert θs and θts to a vector if they are not already + θs = (θs isa Vector) ? θs : [θs] + θts2 = Vector[] + if !(θts[1] isa Vector) + for θt in θts + push!(θts2, [θt]) + end + θts = θts2 + end + + # Calculate variances for each estimator + variance = Float64[] + + θts = hcat(θts...) + for i in 1:length(θs) + θ = θs[i] + θt = θts[i, :] + θt = filter(!isnan, θt) + num = sum((θt .- θ) .^ 2) / length(θt) + push!(variance, num) + end + + return DataFrame(estimator = θs, SE = sqrt.(variance)) end function _bootweights_cluster_sorted!(cluster_sorted, diff --git a/src/by.jl b/src/by.jl index f28de3b5..abf76676 100644 --- a/src/by.jl +++ b/src/by.jl @@ -1,28 +1,23 @@ -function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function) - gdf = groupby(design.data, domain) - X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) - return X +function subset(group, design::SurveyDesign) + return SurveyDesign(DataFrame(group);clusters = design.cluster, strata = design.strata, popsize = design.popsize, weights = design.weights) +end + +function subset(group, design::ReplicateDesign) + return ReplicateDesign{typeof(design.inference_method)}(DataFrame(group), design.replicate_weights;clusters = design.cluster, strata = design.strata, popsize = design.popsize, weights = design.weights) end -function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function) +function bydomain(x::Union{Symbol, Vector{Symbol}}, domain,design::Union{SurveyDesign, ReplicateDesign}, func::Function, args...; kwargs...) + domain_names = unique(design.data[!, domain]) gdf = groupby(design.data, domain) - nd = length(gdf) - X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic) - Xt_mat = Array{Float64,2}(undef, (nd, design.replicates)) - for i = 1:design.replicates - Xt_mat[:, i] = - combine( - gdf, - [x, Symbol("replicate_" * string(i))] => - ((a, c) -> func(a, weights(c))) => :statistic, - ).statistic + domain_names = [join(collect(keys(gdf)[i]), "-") for i in 1:length(gdf)] + vars = DataFrame[] + for group in gdf + push!(vars, func(x, subset(group, design), args...; kwargs...)) end - ses = Float64[] - for i = 1:nd - filtered_dx = filter(!isnan, Xt_mat[i, :] .- X.statistic[i]) - push!(ses, sqrt(sum(filtered_dx .^ 2) / length(filtered_dx))) + estimates = vcat(vars...) + if isa(domain, Vector{Symbol}) + domain = join(domain, "_") end - replace!(ses, NaN => 0) - X.SE = ses - return X -end + estimates[!, domain] = domain_names + return estimates +end \ No newline at end of file diff --git a/src/jackknife.jl b/src/jackknife.jl index 008ca4cb..b044e0f3 100644 --- a/src/jackknife.jl +++ b/src/jackknife.jl @@ -83,9 +83,9 @@ function jackknifeweights(design::SurveyDesign) end """ - variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) +standarderror(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) -Compute variance of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. +Compute standard error of column `x` for the given `func` using the Jackknife method. The formula to compute this variance is the following. ```math \\hat{V}_{\\text{JK}}(\\hat{\\theta}) = \\sum_{h = 1}^H \\dfrac{n_h - 1}{n_h}\\sum_{j = 1}^{n_h}(\\hat{\\theta}_{(hj)} - \\hat{\\theta})^2 @@ -94,66 +94,55 @@ Compute variance of column `x` for the given `func` using the Jackknife method. Above, ``\\hat{\\theta}`` represents the estimator computed using the original weights, and ``\\hat{\\theta_{(hj)}}`` represents the estimator computed from the replicate weights obtained when PSU ``j`` from cluster ``h`` is removed. # Examples -```jldoctest -julia> using Survey, StatsBase - -julia> apistrat = load_data("apistrat"); - -julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); - -julia> rstrat = jackknifeweights(dstrat) -ReplicateDesign{JackknifeReplicates}: -data: 200×244 DataFrame -strata: stype - [E, E, E … M] -cluster: none -popsize: [4420.9999, 4420.9999, 4420.9999 … 1018.0] -sampsize: [100, 100, 100 … 50] -weights: [44.21, 44.21, 44.21 … 20.36] -allprobs: [0.0226, 0.0226, 0.0226 … 0.0491] -type: jackknife -replicates: 200 - -julia> weightedmean(x, y) = mean(x, weights(y)); +```jldoctest; setup = :(using Survey, StatsBase, DataFrames; apistrat = load_data("apistrat"); dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw); rstrat = jackknifeweights(dstrat)) +julia> my_mean(df::DataFrame, column, weights) = StatsBase.mean(df[!, column], StatsBase.weights(df[!, weights])); -julia> variance(:api00, weightedmean, rstrat) +julia> Survey.standarderror(:api00, my_mean, rstrat) 1×2 DataFrame Row │ estimator SE │ Float64 Float64 ─────┼──────────────────── 1 │ 662.287 9.53613 - ``` # Reference pg 380-382, Section 9.3.2 Jackknife - Sharon Lohr, Sampling Design and Analysis (2010) """ -function variance(x::Symbol, func::Function, design::ReplicateDesign{JackknifeReplicates}) +function standarderror(x::Union{Symbol, Vector{Symbol}}, func::Function, design::ReplicateDesign{JackknifeReplicates}, args...; kwargs...) + df = design.data - # sort!(df, [design.strata, design.cluster]) stratified_gdf = groupby(df, design.strata) # estimator from original weights - θ = func(df[!, x], df[!, design.weights]) + θs = func(design.data, x, design.weights, args...; kwargs...) - variance = 0 + # ensure that θs is a vector + θs = (θs isa Vector) ? θs : [θs] + + variance = zeros(length(θs)) replicate_index = 1 + for subgroup in stratified_gdf + psus_in_stratum = unique(subgroup[!, design.cluster]) nh = length(psus_in_stratum) - cluster_variance = 0 + cluster_variance = zeros(length(θs)) + for psu in psus_in_stratum - # get replicate weights corresponding to current stratum and psu - rep_weights = df[!, "replicate_"*string(replicate_index)] # estimator from replicate weights - θhj = func(df[!, x], rep_weights) + θhjs = func(design.data, x, "replicate_" * string(replicate_index), args...; kwargs...) + + # update the cluster variance for each estimator + for i in 1:length(θs) + cluster_variance[i] += ((nh - 1)/nh) * (θhjs[i] - θs[i])^2 + end - cluster_variance += ((nh - 1)/nh)*(θhj - θ)^2 replicate_index += 1 end - variance += cluster_variance - end - return DataFrame(estimator = θ, SE = sqrt(variance)) -end + # update the overall variance + variance .+= cluster_variance + end + return DataFrame(estimator = θs, SE = sqrt.(variance)) +end \ No newline at end of file diff --git a/src/mean.jl b/src/mean.jl index 4d867669..f203cd02 100644 --- a/src/mean.jl +++ b/src/mean.jl @@ -33,13 +33,18 @@ end """ mean(x::Symbol, design::ReplicateDesign) -Use replicate weights to compute the standard error of the estimated mean. +Compute the standard error of the estimated mean using replicate weights. -# Examples +# Arguments +- `x::Symbol`: Symbol representing the variable for which the mean is estimated. +- `design::ReplicateDesign`: Replicate design object. + +# Returns +- `df`: DataFrame containing the estimated mean and its standard error. -```jldoctest; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)) -julia> bclus1 = dclus1 |> bootweights; +# Examples +```jldoctest; setup = :(using Survey, StatsBase; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) julia> mean(:api00, bclus1) 1×2 DataFrame Row │ mean SE @@ -49,16 +54,24 @@ julia> mean(:api00, bclus1) ``` """ function mean(x::Symbol, design::ReplicateDesign) - weightedmean(x, y) = mean(x, weights(y)) - df = Survey.variance(x, weightedmean, design) + + # Define an inner function to calculate the mean + function inner_mean(df::DataFrame, column, weights_column) + return StatsBase.mean(df[!, column], StatsBase.weights(df[!, weights_column])) + end + + # Calculate the mean and standard error + df = Survey.standarderror(x, inner_mean, design) + rename!(df, :estimator => :mean) + return df end """ Estimate the mean of a list of variables. -```jldoctest meanlabel; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) +```jldoctest meanlabel; setup = :(using Survey, StatsBase; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) julia> mean([:api00, :enroll], dclus1) 2×2 DataFrame Row │ names mean @@ -94,45 +107,43 @@ Estimate means of domains. ```jldoctest meanlabel; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) julia> mean(:api00, :cname, dclus1) 11×2 DataFrame - Row │ cname mean - │ String15 Float64 + Row │ mean cname + │ Float64 String ─────┼────────────────────── - 1 │ Alameda 669.0 - 2 │ Fresno 472.0 - 3 │ Kern 452.5 - 4 │ Los Angeles 647.267 - 5 │ Mendocino 623.25 - 6 │ Merced 519.25 - 7 │ Orange 710.563 - 8 │ Plumas 709.556 - 9 │ San Diego 659.436 - 10 │ San Joaquin 551.189 - 11 │ Santa Clara 732.077 + 1 │ 669.0 Alameda + 2 │ 472.0 Fresno + 3 │ 452.5 Kern + 4 │ 647.267 Los Angeles + 5 │ 623.25 Mendocino + 6 │ 519.25 Merced + 7 │ 710.563 Orange + 8 │ 709.556 Plumas + 9 │ 659.436 San Diego + 10 │ 551.189 San Joaquin + 11 │ 732.077 Santa Clara ``` Use the replicate design to compute standard errors of the estimated means. ```jldoctest meanlabel julia> mean(:api00, :cname, bclus1) 11×3 DataFrame - Row │ cname mean SE - │ String15 Float64 Float64 + Row │ mean SE cname + │ Float64 Float64 String ─────┼──────────────────────────────────── - 1 │ Santa Clara 732.077 58.2169 - 2 │ San Diego 659.436 2.66703 - 3 │ Merced 519.25 2.28936e-15 - 4 │ Los Angeles 647.267 47.6233 - 5 │ Orange 710.563 2.19826e-13 - 6 │ Fresno 472.0 1.13687e-13 - 7 │ Plumas 709.556 1.26058e-13 - 8 │ Alameda 669.0 1.27527e-13 - 9 │ San Joaquin 551.189 2.1791e-13 - 10 │ Kern 452.5 0.0 - 11 │ Mendocino 623.25 1.09545e-13 + 1 │ 732.077 58.2169 Santa Clara + 2 │ 659.436 2.66703 San Diego + 3 │ 519.25 2.28936e-15 Merced + 4 │ 647.267 47.6233 Los Angeles + 5 │ 710.563 2.19826e-13 Orange + 6 │ 472.0 1.13687e-13 Fresno + 7 │ 709.556 1.26058e-13 Plumas + 8 │ 669.0 1.27527e-13 Alameda + 9 │ 551.189 2.18162e-13 San Joaquin + 10 │ 452.5 0.0 Kern + 11 │ 623.25 1.09545e-13 Mendocino ``` """ function mean(x::Symbol, domain, design::AbstractSurveyDesign) - weighted_mean(x, w) = mean(x, StatsBase.weights(w)) - df = bydomain(x, domain, design, weighted_mean) - rename!(df, :statistic => :mean) + df = bydomain(x, domain, design, mean) return df -end +end \ No newline at end of file diff --git a/src/quantile.jl b/src/quantile.jl index 2ffec5aa..c6574be1 100644 --- a/src/quantile.jl +++ b/src/quantile.jl @@ -33,11 +33,22 @@ function quantile(var::Symbol, design::SurveyDesign, p::Real; kwargs...) end """ -Use replicate weights to compute the standard error of the estimated quantile. + quantile(x::Symbol, design::ReplicateDesign, p; kwargs...) + +Compute the standard error of the estimated quantile using replicate weights. -```jldoctest; setup = :(apisrs = load_data("apisrs");srs = SurveyDesign(apisrs; weights=:pw)) -julia> bsrs = srs |> bootweights; +# Arguments +- `x::Symbol`: Symbol representing the variable for which the quantile is estimated. +- `design::ReplicateDesign`: Replicate design object. +- `p::Real`: Quantile value to estimate, ranging from 0 to 1. +- `kwargs...`: Additional keyword arguments. +# Returns +- `df`: DataFrame containing the estimated quantile and its standard error. + +# Examples + +```jldoctest; setup = :(using Survey, StatsBase; apisrs = load_data("apisrs"); srs = SurveyDesign(apisrs; weights=:pw); bsrs = srs |> bootweights) julia> quantile(:api00, bsrs, 0.5) 1×2 DataFrame Row │ 0.5th percentile SE @@ -46,10 +57,18 @@ julia> quantile(:api00, bsrs, 0.5) 1 │ 659.0 14.9764 ``` """ -function quantile(var::Symbol, design::ReplicateDesign, p::Real; kwargs...) - quantile_func(v, weights) = Statistics.quantile(v, ProbabilityWeights(weights), p) - df = Survey.variance(var, quantile_func, design) +function quantile(x::Symbol, design::ReplicateDesign, p::Real; kwargs...) + + # Define an inner function to calculate the quantile + function inner_quantile(df::DataFrame, column, weights_column) + return Statistics.quantile(df[!, column], ProbabilityWeights(df[!, weights_column]), p) + end + + # Calculate the quantile and standard error + df = standarderror(x, inner_quantile, design) + rename!(df, :estimator => string(p) * "th percentile") + return df end @@ -112,3 +131,32 @@ function quantile( df.percentile = string.(probs) return df[!, [:percentile, :statistic, :SE]] end + +""" +quantile(var, domain, design) + +Estimate a quantile of domains. + +```jldoctest meanlabel; setup = :(using Survey, StatsBase; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) +julia> quantile(:api00, :cname, dclus1, 0.5) +11×2 DataFrame + Row │ 0.5th percentile cname + │ Float64 String +─────┼─────────────────────────────── + 1 │ 669.0 Alameda + 2 │ 474.5 Fresno + 3 │ 452.5 Kern + 4 │ 628.0 Los Angeles + 5 │ 616.5 Mendocino + 6 │ 519.5 Merced + 7 │ 717.5 Orange + 8 │ 699.0 Plumas + 9 │ 657.0 San Diego + 10 │ 542.0 San Joaquin + 11 │ 718.0 Santa Clara +``` +""" +function quantile(x::Symbol, domain, design::AbstractSurveyDesign, p::Real) + df = bydomain(x, domain, design, quantile, p) + return df +end \ No newline at end of file diff --git a/src/ratio.jl b/src/ratio.jl index b97bb93b..b6cac14f 100644 --- a/src/ratio.jl +++ b/src/ratio.jl @@ -8,7 +8,7 @@ julia> apiclus1 = load_data("apiclus1"); julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); -julia> ratio(:api00, :enroll, dclus1) +julia> ratio([:api00, :enroll], dclus1) 1×1 DataFrame Row │ ratio │ Float64 @@ -17,7 +17,10 @@ julia> ratio(:api00, :enroll, dclus1) ``` """ -function ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) +function ratio(x::Vector{Symbol}, design::SurveyDesign) + + variable_num, variable_den = x[1], x[2] + X = wsum(design.data[!, variable_num], design.data[!, design.weights]) / wsum(design.data[!, variable_den], design.data[!, design.weights]) @@ -25,33 +28,85 @@ function ratio(variable_num::Symbol, variable_den::Symbol, design::SurveyDesign) end """ -Use replicate weights to compute the standard error of the ratio. + ratio(x::Vector{Symbol}, design::ReplicateDesign) -```jldoctest; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)) -julia> bclus1 = bootweights(dclus1); +Compute the standard error of the ratio using replicate weights. -julia> ratio(:api00, :enroll, bclus1) -1×2 DataFrame - Row │ ratio SE - │ Float64 Float64 -─────┼─────────────────── - 1 │ 1.17182 0.131518 +# Arguments +- `variable_num::Symbol`: Symbol representing the numerator variable. +- `variable_den::Symbol`: Symbol representing the denominator variable. +- `design::ReplicateDesign`: Replicate design object. + +# Examples +```jldoctest; setup = :(using Survey, StatsBase; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = bootweights(dclus1);) +julia> ratio([:api00, :api99], bclus1) +1×2 DataFrame + Row │ estimator SE + │ Float64 Float64 +─────┼─────────────────────── + 1 │ 1.06127 0.00672259 ``` """ -function ratio(variable_num::Symbol, variable_den::Symbol, design::ReplicateDesign) - X = - wsum(design.data[!, variable_num], design.data[!, design.weights]) / - wsum(design.data[!, variable_den], design.data[!, design.weights]) - Xt = [ - (wsum( - design.data[!, variable_num], - weights(design.data[!, "replicate_"*string(i)]), - )) / (wsum( - design.data[!, variable_den], - weights(design.data[!, "replicate_"*string(i)]), - )) for i = 1:design.replicates - ] - variance = sum((Xt .- X) .^ 2) / design.replicates - DataFrame(ratio = X, SE = sqrt(variance)) +function ratio(x::Vector{Symbol}, design::ReplicateDesign) + + variable_num, variable_den = x[1], x[2] + + # Define an inner function to calculate the ratio + function inner_ratio(df::DataFrame, columns, weights_column) + return sum(df[!, columns[1]], StatsBase.weights(df[!, weights_column])) / sum(df[!, columns[2]], StatsBase.weights(df[!, weights_column])) + end + + # Calculate the standard error using the `standarderror` function with the inner function + return standarderror([variable_num, variable_den], inner_ratio, design) end + +""" + ratio(var, domain, design) + +Estimate ratios of domains. + +```jldoctest ratiolabel; setup = :(using Survey, StatsBase; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) +julia> ratio([:api00, :api99], :cname, dclus1) +11×2 DataFrame + Row │ ratio cname + │ Float64 String +─────┼────────────────────── + 1 │ 1.09852 Alameda + 2 │ 1.17779 Fresno + 3 │ 1.11453 Kern + 4 │ 1.06307 Los Angeles + 5 │ 1.00565 Mendocino + 6 │ 1.08121 Merced + 7 │ 1.03628 Orange + 8 │ 1.02127 Plumas + 9 │ 1.06112 San Diego + 10 │ 1.07331 San Joaquin + 11 │ 1.05598 Santa Clara +``` + +Use the replicate design to compute standard errors of the estimated means. + +```jldoctest ratiolabel +julia> ratio([:api00, :api99], :cname, bclus1) +11×3 DataFrame + Row │ estimator SE cname + │ Float64 Float64 String +─────┼───────────────────────────────────── + 1 │ 1.05598 0.0189429 Santa Clara + 2 │ 1.06112 0.00979481 San Diego + 3 │ 1.08121 6.85453e-17 Merced + 4 │ 1.06307 0.0257137 Los Angeles + 5 │ 1.03628 0.0 Orange + 6 │ 1.17779 6.27535e-18 Fresno + 7 │ 1.02127 0.0 Plumas + 8 │ 1.09852 2.12683e-16 Alameda + 9 │ 1.07331 2.22045e-16 San Joaquin + 10 │ 1.11453 0.0 Kern + 11 │ 1.00565 0.0 Mendocino +``` +""" +function ratio(x::Vector{Symbol}, domain, design::AbstractSurveyDesign) + df = bydomain(x, domain, design, ratio) + return df +end \ No newline at end of file diff --git a/src/reg.jl b/src/reg.jl new file mode 100644 index 00000000..de8233ec --- /dev/null +++ b/src/reg.jl @@ -0,0 +1,47 @@ +""" + glm(formula::FormulaTerm, design::ReplicateDesign, args...; kwargs...) + +Perform generalized linear modeling (GLM) using the survey design with replicates. + +# Arguments +- `formula`: A `FormulaTerm` specifying the model formula. +- `design`: A `ReplicateDesign` object representing the survey design with replicates. +- `args...`: Additional arguments to be passed to the `glm` function. +- `kwargs...`: Additional keyword arguments to be passed to the `glm` function. + +# Returns +A `DataFrame` containing the estimates for model coefficients and their standard errors. + +# Example +```julia +apisrs = load_data("apisrs") +srs = SurveyDesign(apisrs) +bsrs = bootweights(srs, replicates = 2000) +result = glm(@formula(api00 ~ api99), bsrs, Normal()) +``` + +```jldoctest; setup = :(using Survey, StatsBase, GLM; apisrs = load_data("apisrs"); srs = SurveyDesign(apisrs); bsrs = bootweights(srs, replicates = 2000);) +julia> glm(@formula(api00 ~ api99), bsrs, Normal()) +2×2 DataFrame + Row │ estimator SE + │ Float64 Float64 +─────┼────────────────────── + 1 │ 63.2831 9.41231 + 2 │ 0.949762 0.0135488 +``` +""" +function glm(formula::FormulaTerm, design::ReplicateDesign, args...; kwargs...) + + rhs_symbols = typeof(formula.rhs) == Term ? Symbol.(formula.rhs) : collect(Symbol.(formula.rhs)) + lhs_symbols = Symbol.(formula.lhs) + columns = vcat(rhs_symbols, lhs_symbols) + + function inner_glm(df::DataFrame, columns, weights_column, args...; kwargs...) + matrix = hcat(ones(nrow(df)), Matrix(df[!, columns[1:(length(columns)-1)]])) + model = glm(matrix, (df[!, columns[end]]), args...; wts=df[!, weights_column], kwargs...) + return coef(model) + end + + # Compute standard error of coefficients + standarderror(columns, inner_glm, design, args...; kwargs...) + end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index de2fb218..a460ab59 100644 --- a/src/total.jl +++ b/src/total.jl @@ -22,11 +22,20 @@ function total(x::Symbol, design::SurveyDesign) end """ -Use replicate weights to compute the standard error of the estimated total. + total(x::Symbol, design::ReplicateDesign) -```jldoctest; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw)) -julia> bclus1 = dclus1 |> bootweights; +Compute the standard error of the estimated total using replicate weights. +# Arguments +- `x::Symbol`: Symbol representing the variable for which the total is estimated. +- `design::ReplicateDesign`: Replicate design object. + +# Returns +- `df`: DataFrame containing the estimated total and its standard error. + +# Examples + +```jldoctest; setup = :(using Survey; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) julia> total(:api00, bclus1) 1×2 DataFrame Row │ total SE @@ -36,9 +45,17 @@ julia> total(:api00, bclus1) ``` """ function total(x::Symbol, design::ReplicateDesign) - total_func(x, y) = wsum(x, weights(y)) - df = variance(x, total_func, design) + + # Define an inner function to calculate the total + function inner_total(df::DataFrame, column, weights) + return StatsBase.wsum(df[!, column], StatsBase.weights(df[!, weights])) + end + + # Calculate the total and standard error + df = standarderror(x, inner_total, design) + rename!(df, :estimator => :total) + return df end @@ -78,46 +95,46 @@ end Estimate population totals of domains. -```jldoctest totallabel; setup = :(apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) +```jldoctest totallabel; setup = :(using Survey; apiclus1 = load_data("apiclus1"); dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw); bclus1 = dclus1 |> bootweights) julia> total(:api00, :cname, dclus1) 11×2 DataFrame - Row │ cname total - │ String15 Float64 + Row │ total cname + │ Float64 String ─────┼───────────────────────────── - 1 │ Alameda 249080.0 - 2 │ Fresno 63903.1 - 3 │ Kern 30631.5 - 4 │ Los Angeles 3.2862e5 - 5 │ Mendocino 84380.6 - 6 │ Merced 70300.2 - 7 │ Orange 3.84807e5 - 8 │ Plumas 2.16147e5 - 9 │ San Diego 1.2276e6 - 10 │ San Joaquin 6.90276e5 - 11 │ Santa Clara 6.44244e5 + 1 │ 249080.0 Alameda + 2 │ 63903.1 Fresno + 3 │ 30631.5 Kern + 4 │ 3.2862e5 Los Angeles + 5 │ 84380.6 Mendocino + 6 │ 70300.2 Merced + 7 │ 3.84807e5 Orange + 8 │ 2.16147e5 Plumas + 9 │ 1.2276e6 San Diego + 10 │ 6.90276e5 San Joaquin + 11 │ 6.44244e5 Santa Clara ``` Use the replicate design to compute standard errors of the estimated totals. ```jldoctest totallabel julia> total(:api00, :cname, bclus1) 11×3 DataFrame - Row │ cname total SE - │ String15 Float64 Float64 + Row │ total SE cname + │ Float64 Float64 String ─────┼──────────────────────────────────────────── - 1 │ Santa Clara 6.44244e5 4.2273e5 - 2 │ San Diego 1.2276e6 8.62727e5 - 3 │ Merced 70300.2 71336.3 - 4 │ Los Angeles 3.2862e5 2.93936e5 - 5 │ Orange 3.84807e5 3.88014e5 - 6 │ Fresno 63903.1 64781.7 - 7 │ Plumas 2.16147e5 2.12089e5 - 8 │ Alameda 249080.0 2.49228e5 - 9 │ San Joaquin 6.90276e5 6.81604e5 - 10 │ Kern 30631.5 30870.3 - 11 │ Mendocino 84380.6 80215.9 + 1 │ 6.44244e5 4.2273e5 Santa Clara + 2 │ 1.2276e6 8.62727e5 San Diego + 3 │ 70300.2 71336.3 Merced + 4 │ 3.2862e5 2.93936e5 Los Angeles + 5 │ 3.84807e5 3.88014e5 Orange + 6 │ 63903.1 64781.7 Fresno + 7 │ 2.16147e5 2.12089e5 Plumas + 8 │ 249080.0 2.49228e5 Alameda + 9 │ 6.90276e5 6.81604e5 San Joaquin + 10 │ 30631.5 30870.3 Kern + 11 │ 84380.6 80215.9 Mendocino ``` """ function total(x::Symbol, domain, design::AbstractSurveyDesign) - df = bydomain(x, domain, design, wsum) - rename!(df, :statistic => :total) -end + df = bydomain(x, domain, design, total) + return df +end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 0c99cc27..2ff5352b 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,7 @@ [deps] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/SurveyDesign.jl b/test/SurveyDesign.jl index 5084c1e4..4cb65b79 100644 --- a/test/SurveyDesign.jl +++ b/test/SurveyDesign.jl @@ -36,6 +36,18 @@ ### Weights as non-numeric error apisrs = copy(apisrs_original) @test_throws ArgumentError SurveyDesign(apisrs, weights = :stype) + + ### popsize and weights as symbols + + apisrs = copy(apisrs_original) + srs_pop_weights = SurveyDesign(apisrs, weights =:pw, popsize = :fpc) + @test srs_pop_weights.data[!, srs_pop_weights.weights][1] ≈ 30.97 atol = 1e-4 + @test srs_pop_weights.data[!, srs_pop_weights.weights] == 1 ./ srs_pop_weights.data[!, srs_pop_weights.allprobs] + @test srs_pop_weights.data[!, srs_pop_weights.allprobs] ≈ srs_pop_weights.data[!, :derived_probs] atol = 1e-4 + @test srs_pop_weights.data[!, srs_pop_weights.sampsize] ≈ srs_pop_weights.data[!, :derived_sampsize] atol = 1e-4 + ### Both ways should achieve same weights and allprobs! + @test srs_pop_weights.data[!, srs_pop_weights.weights] == srs_weights.data[!, srs_weights.weights] + end @testset "SurveyDesign_strat" begin diff --git a/test/ratio.jl b/test/ratio.jl index b9865f09..5d46bb75 100644 --- a/test/ratio.jl +++ b/test/ratio.jl @@ -1,14 +1,14 @@ @testset "ratio.jl" begin # on :api00 - @test ratio(:api00, :enroll, srs).ratio[1] ≈ 1.1231 atol = 1e-4 - @test ratio(:api00, :enroll, dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 - @test ratio(:api00, :enroll, dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1 - @test ratio(:api00, :enroll, dclus1_boot).ratio[1] ≈ 1.17182 atol = 1e-4 - @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 - @test ratio(:api00, :enroll, bstrat).SE[1] ≈ 0.04185 atol = 1e-1 - @test ratio(:api00, :enroll, bstrat).ratio[1] ≈ 1.11256 atol = 1e-4 + @test ratio([:api00, :enroll], srs).ratio[1] ≈ 1.1231 atol = 1e-4 + @test ratio([:api00, :enroll], dclus1).ratio[1] ≈ 1.17182 atol = 1e-4 + @test ratio([:api00, :enroll], dclus1_boot).SE[1] ≈ 0.1275446 atol = 1e-1 + @test ratio([:api00, :enroll], dclus1_boot).estimator[1] ≈ 1.17182 atol = 1e-4 + @test ratio([:api00, :enroll], bstrat).estimator[1] ≈ 1.11256 atol = 1e-4 + @test ratio([:api00, :enroll], bstrat).SE[1] ≈ 0.04185 atol = 1e-1 + @test ratio([:api00, :enroll], bstrat).estimator[1] ≈ 1.11256 atol = 1e-4 # on :api99 - @test ratio(:api99, :enroll, bsrs).ratio[1] ≈ 1.06854 atol = 1e-4 - @test ratio(:api99, :enroll, dstrat).ratio[1] ≈ 1.0573 atol = 1e-4 - @test ratio(:api99, :enroll, bstrat).ratio[1] ≈ 1.0573 atol = 1e-4 + @test ratio([:api99, :enroll], bsrs).estimator[1] ≈ 1.06854 atol = 1e-4 + @test ratio([:api99, :enroll], dstrat).ratio[1] ≈ 1.0573 atol = 1e-4 + @test ratio([:api99, :enroll], bstrat).estimator[1] ≈ 1.0573 atol = 1e-4 end diff --git a/test/reg.jl b/test/reg.jl new file mode 100644 index 00000000..34593204 --- /dev/null +++ b/test/reg.jl @@ -0,0 +1,235 @@ +@testset "GLM in bootstrap apisrs" begin + rename!(bsrs.data, Symbol("sch.wide") => :sch_wide) + bsrs.data.sch_wide = ifelse.(bsrs.data.sch_wide .== "Yes", 1, 0) + model = glm(@formula(sch_wide ~ meals + ell), bsrs, Binomial()) + + @test model.estimator[1] ≈ 1.523050676 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.009754261 rtol=STAT_TOL + @test model.estimator[3] ≈ -0.020892044 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.369836 rtol=SE_TOL + @test model.SE[2] ≈ 0.009928 rtol=SE_TOL + @test model.SE[3] ≈ 0.012676 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # bsrs <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(sch.wide ~ meals + ell, bsrs, family=binomial()) + # summary(model) + + model = glm(@formula(api00 ~ api99), bsrs, Gamma(),InverseLink()) + + @test model.estimator[1] ≈ 2.915479e-03 rtol=STAT_TOL + @test model.estimator[2] ≈ -2.137187e-06 rtol=STAT_TOL + @test model.SE[1] ≈ 4.550e-05 rtol=SE_TOL + @test model.SE[2] ≈ 6.471e-08 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # bsrs <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api00 ~ api99, bsrs, family = Gamma(link = "inverse")) + # summary(model) + + model = glm(@formula(api00 ~ api99), bsrs, Normal()) + + @test model.estimator[1] ≈ 63.2830726 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.9497618 rtol=STAT_TOL + @test model.SE[1] ≈ 9.63276 rtol=SE_TOL + @test model.SE[2] ≈ 0.01373 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # bsrs <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api00 ~ api99, bsrs, family = gaussian()) + # summary(model) + + rename!(bsrs.data, Symbol("api.stu") => :api_stu) + model = glm(@formula(api_stu ~ meals + ell), bsrs, Poisson()) + + @test model.estimator[1] ≈ 6.229602881 rtol=STAT_TOL + @test model.estimator[2] ≈ -0.002038345 rtol=STAT_TOL + @test model.estimator[3] ≈ 0.002116465 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.093115 rtol=SE_TOL + @test model.SE[2] ≈ 0.002191 rtol=SE_TOL + @test model.SE[3] ≈ 0.002858 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # bsrs <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api.stu ~ meals + ell, bsrs, family = poisson()) + # summary(model) +end + +@testset "GLM in jackknife apisrs" begin + rename!(jsrs.data, Symbol("sch.wide") => :sch_wide) + jsrs.data.sch_wide = ifelse.(jsrs.data.sch_wide .== "Yes", 1, 0) + model = glm(@formula(sch_wide ~ meals + ell), jsrs, Binomial()) + + @test model.estimator[1] ≈ 1.523051 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.009754 rtol=1e-4 # This is a tiny bit off with 1e-5 + @test model.estimator[3] ≈ -0.020892 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.359043 rtol=SE_TOL + @test model.SE[2] ≈ 0.009681 rtol=SE_TOL + @test model.SE[3] ≈ 0.012501 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # jsrs <- as.svrepdesign(srs, type="JK1", replicates=10000) + # model = glm(sch.wide ~ meals + ell, jsrs, family=binomial()) + # summary(model) + + model = glm(@formula(api00 ~ api99), jsrs, Gamma(), InverseLink()) + + @test model.estimator[1] ≈ 2.915479e-03 rtol=STAT_TOL + @test model.estimator[2] ≈ -2.137187e-06 rtol=STAT_TOL + @test model.SE[1] ≈ 5.234e-05 rtol=0.12 # failing at 1e-1 + @test model.SE[2] ≈ 7.459e-08 rtol=0.12 # failing at 1e-1 + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # jsrs <- as.svrepdesign(srs, type="JK1", replicates=10000) + # model = glm(api00 ~ api99, jsrs, family = Gamma(link = "inverse")) + # summary(model) + + model = glm(@formula(api00 ~ api99), jsrs, Normal()) + + @test model.estimator[1] ≈ 63.2830726 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.9497618 rtol=STAT_TOL + @test model.SE[1] ≈ 9.69178 rtol=SE_TOL + @test model.SE[2] ≈ 0.01384 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # jsrs <- as.svrepdesign(srs, type="JK1", replicates=10000) + # model = glm(api00 ~ api99, jsrs, family = gaussian()) + # summary(model) + + rename!(jsrs.data, Symbol("api.stu") => :api_stu) + model = glm(@formula(api_stu ~ meals + ell), jsrs, Poisson()) + + @test model.estimator[1] ≈ 6.229602881 rtol=STAT_TOL + @test model.estimator[2] ≈ -0.002038345 rtol=STAT_TOL + @test model.estimator[3] ≈ 0.002116465 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.095444 rtol=SE_TOL + @test model.SE[2] ≈ 0.002317 rtol=SE_TOL + @test model.SE[3] ≈ 0.003085 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, weights=~pw, data=apisrs) + # jsrs <- as.svrepdesign(srs, type="JK1", replicates=10000) + # model = glm(api.stu ~ meals + ell, jsrs, family = poisson()) + # summary(model) +end + +@testset "GLM in bootstrap apiclus1" begin + rename!(dclus1_boot.data, Symbol("sch.wide") => :sch_wide) + dclus1_boot.data.sch_wide = ifelse.(dclus1_boot.data.sch_wide .== "Yes", 1, 0) + model = glm(@formula(sch_wide ~ meals + ell), dclus1_boot, Binomial()) + + @test model.estimator[1] ≈ 1.89955691 rtol=STAT_TOL + @test model.estimator[2] ≈ -0.01911468 rtol=STAT_TOL + @test model.estimator[3] ≈ 0.03992541 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.62928 rtol=SE_TOL + @test model.SE[2] ≈ 0.01209 rtol=SE_TOL + @test model.SE[3] ≈ 0.01533 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~dnum, weights=~pw, data=apiclus1) + # dclus1_boot <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(sch.wide ~ meals + ell, dclus1_boot, family=binomial()) + # summary(model) + + model = glm(@formula(api00 ~ api99), dclus1_boot, Gamma(),InverseLink()) + + @test model.estimator[1] ≈ 2.914844e-03 rtol=STAT_TOL + @test model.estimator[2] ≈ -2.180775e-06 rtol=STAT_TOL + @test model.SE[1] ≈ 8.014e-05 rtol=SE_TOL + @test model.SE[2] ≈ 1.178e-07 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~dnum, weights=~pw, data=apiclus1) + # dclus1_boot <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api00 ~ api99, dclus1_boot, family = Gamma(link = "inverse")) + # summary(model) + + model = glm(@formula(api00 ~ api99), dclus1_boot, Normal()) + + @test model.estimator[1] ≈ 95.28483 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.90429 rtol=STAT_TOL + @test model.SE[1] ≈ 16.02015 rtol=SE_TOL + @test model.SE[2] ≈ 0.02522 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~dnum, weights=~pw, data=apiclus1) + # dclus1_boot <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api00 ~ api99, dclus1_boot, family = gaussian()) + # summary(model) + + rename!(dclus1_boot.data, Symbol("api.stu") => :api_stu) + model = glm(@formula(api_stu ~ meals + ell), dclus1_boot, Poisson()) + + @test model.estimator[1] ≈ 6.2961803529 rtol=STAT_TOL + @test model.estimator[2] ≈ -0.0026906166 rtol=STAT_TOL + @test model.estimator[3] ≈ -0.0006054623 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.161459174 rtol=SE_TOL + @test model.SE[2] ≈ 0.003193577 rtol=0.11 # failing at 0.1 + @test model.SE[3] ≈ 0.005879115 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~dnum, weights=~pw, data=apiclus1) + # dclus1_boot <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api.stu ~ meals + ell, dclus1_boot, family = poisson()) + # summary(model) +end + +@testset "GLM in bootstrap apistrat" begin + rename!(bstrat.data, Symbol("sch.wide") => :sch_wide) + bstrat.data.sch_wide = ifelse.(bstrat.data.sch_wide .== "Yes", 1, 0) + model = glm(@formula(sch_wide ~ meals + ell), bstrat, Binomial()) + + @test model.estimator[1] ≈ 1.560408424 rtol=STAT_TOL + @test model.estimator[2] ≈ 0.003524761 rtol=STAT_TOL + @test model.estimator[3] ≈ -0.006831057 rtol=STAT_TOL + + @test model.SE[1] ≈ 0.342669691 rtol=SE_TOL + @test model.SE[2] ≈ 0.009423733 rtol=SE_TOL + @test model.SE[3] ≈ 0.013934952 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, strata=~dnum, weights=~pw, data=apistrat) + # bstrat <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(sch.wide ~ meals + ell, bstrat, family=binomial()) + # summary(model) + + model = glm(@formula(api00 ~ api99), bstrat, Gamma(),InverseLink()) + + @test model.estimator[1] ≈ 2.873104e-03 rtol=STAT_TOL + @test model.estimator[2] ≈ -2.088791e-06 rtol=STAT_TOL + @test model.SE[1] ≈ 4.187745e-05 rtol=SE_TOL + @test model.SE[2] ≈ 5.970111e-08 rtol=SE_TOL + + # R code + # data(api) + # srs <- svydesign(id=~1, strata=~dnum, weights=~pw, data=apistrat) + # bstrat <- as.svrepdesign(srs, type="subbootstrap", replicates=10000) + # model = glm(api00 ~ api99, bstrat, family = Gamma(link = "inverse")) + # summary(model) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 0d466895..77aa4649 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,8 @@ using Survey using Test using CategoricalArrays +using GLM +using DataFrames const STAT_TOL = 1e-5 const SE_TOL = 1e-1 @@ -12,7 +14,8 @@ REPLICATES_REGEX = r"r*_\d" apisrs = load_data("apisrs") # Load API dataset srs = SurveyDesign(apisrs, weights = :pw) unitrange = UnitRange((length(names(apisrs)) + 1):(TOTAL_REPLICATES + length(names(apisrs)))) -bsrs = srs |> bootweights # Create replicate design +bsrs = srs |> bootweights # Create bootstrap replicate design +jsrs = srs |> jackknifeweights # Create jackknife replicate design bsrs_direct = ReplicateDesign{BootstrapReplicates}(bsrs.data, REPLICATES_VECTOR, weights = :pw) # using ReplicateDesign constructor bsrs_unitrange = ReplicateDesign{BootstrapReplicates}(bsrs.data, unitrange, weights = :pw) # using ReplicateDesign constructor bsrs_regex = ReplicateDesign{BootstrapReplicates}(bsrs.data, REPLICATES_REGEX, weights = :pw) # using ReplicateDesign constructor @@ -62,5 +65,6 @@ include("plot.jl") include("hist.jl") include("boxplot.jl") include("ratio.jl") -include("show.jl") +#include("show.jl") include("jackknife.jl") +include("reg.jl") \ No newline at end of file diff --git a/test/total.jl b/test/total.jl index b7780263..119d6e45 100644 --- a/test/total.jl +++ b/test/total.jl @@ -171,8 +171,8 @@ end # Test multiple domains passed at once tot = total(:api00, [:stype,:cname], dclus1_boot) - @test filter(row -> row[:cname] == "Los Angeles" && row[:stype] == "E", tot).SE[1] ≈ 343365 rtol = STAT_TOL - @test filter(row -> row[:cname] == "Merced" && row[:stype] == "H", tot).SE[1] ≈ 27090.33 rtol = STAT_TOL + @test filter(row -> row[:stype_cname] == "E-Los Angeles", tot).SE[1] ≈ 343365 rtol = STAT_TOL + @test filter(row -> row[:stype_cname] == "H-Merced", tot).SE[1] ≈ 27090.33 rtol = STAT_TOL #### Why doesnt this syntax produce domain estimates?? # Test that column specifiers from DataFrames make it through this pipeline