Skip to content

Commit

Permalink
Merge pull request #250 from xKDR/as/allow_arbitrary_cols
Browse files Browse the repository at this point in the history
Allows multiple columns in domain estimation
  • Loading branch information
smishr authored Apr 10, 2023
2 parents 9d69475 + 8a4bf69 commit 31a80ef
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Survey"
uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
authors = ["Ayush Patnaik <[email protected]>"]
version = "0.1.0"
version = "0.2.0"

[deps]
AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
Expand Down
7 changes: 3 additions & 4 deletions src/by.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
function bydomain(x::Symbol, domain::Symbol, design::SurveyDesign, func::Function)
function bydomain(x::Symbol, domain, design::SurveyDesign, func::Function)
gdf = groupby(design.data, domain)
nd = length(unique(design.data[!, domain]))
X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
return X
end

function bydomain(x::Symbol, domain::Symbol, design::ReplicateDesign, func::Function)
function bydomain(x::Symbol, domain, design::ReplicateDesign, func::Function)
gdf = groupby(design.data, domain)
nd = length(unique(design.data[!, domain]))
nd = length(gdf)
X = combine(gdf, [x, design.weights] => ((a, b) -> func(a, weights(b))) => :statistic)
Xt_mat = Array{Float64,2}(undef, (nd, design.replicates))
for i = 1:design.replicates
Expand Down
2 changes: 1 addition & 1 deletion src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ julia> mean(:api00, :cname, bclus1)
11 │ Mendocino 623.25 1.09545e-13
```
"""
function mean(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
function mean(x::Symbol, domain, design::AbstractSurveyDesign)
weighted_mean(x, w) = mean(x, StatsBase.weights(w))
df = bydomain(x, domain, design, weighted_mean)
rename!(df, :statistic => :mean)
Expand Down
2 changes: 1 addition & 1 deletion src/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ julia> total(:api00, :cname, bclus1)
11 │ Mendocino 84380.6 80215.9
```
"""
function total(x::Symbol, domain::Symbol, design::AbstractSurveyDesign)
function total(x::Symbol, domain, design::AbstractSurveyDesign)
df = bydomain(x, domain, design, wsum)
rename!(df, :statistic => :total)
end
55 changes: 55 additions & 0 deletions test/total.jl
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,59 @@ end
# equivalent R code (results cause clutter):
# > svyby(~api00, ~cname, clus1rep, svytotal)
# > svyby(~api00, ~cname, clus1rep, svymean)

# Test multiple domains passed at once
tot = total(:api00, [:stype,:cname], dclus1_boot)
@test filter(row -> row[:cname] == "Los Angeles" && row[:stype] == "E", tot).SE[1] 343365 rtol = STAT_TOL
@test filter(row -> row[:cname] == "Merced" && row[:stype] == "H", tot).SE[1] 27090.33 rtol = STAT_TOL

#### Why doesnt this syntax produce domain estimates??
# Test that column specifiers from DataFrames make it through this pipeline
# These tests replicate what you see above...just with a different syntax.
# tot = total(:api00, Survey.DataFrames.Cols(==(:cname)), dclus1_boot)
######## Above Survey.DataFrames.Cols(==(:cname)) syntax doesnt give domain estimates
# @test size(tot)[1] == apiclus1.cname |> unique |> length
# @test filter(:cname => ==("Los Angeles"), tot).total[1] ≈ 489980.87 rtol = STAT_TOL
# @test filter(:cname => ==("Los Angeles"), tot).SE[1] ≈ 430469.28 rtol = SE_TOL
# @test filter(:cname => ==("San Diego"), tot).total[1] ≈ 1830375.53 rtol = STAT_TOL
# @test filter(:cname => ==("San Diego"), tot).SE[1] ≈ 1298696.64 rtol = SE_TOL
end

#### R code for vector{Symbol} domain estimation
# > data(api)
# > apiclus1$pw = rep(757/15,nrow(apiclus1))
# > ### 9.04.23
# > dclus1<-svydesign(id=~dnum, weights=~pw, data=apiclus1);
# > rclus1<-as.svrepdesign(dclus1, type="subbootstrap", compress=FALSE, replicates = 4000)
# > svyby(~api00, ~stype+cname, rclus1, svytotal)
# stype cname api00 se
# E.Alameda E Alameda 273428.40 275423.33
# H.Alameda H Alameda 30683.73 30907.60
# M.Alameda M Alameda 67272.07 67762.88
# E.Fresno E Fresno 48599.40 47484.67
# H.Fresno H Fresno 22356.73 21843.93
# M.Fresno M Fresno 24324.93 23766.99
# E.Kern E Kern 24930.53 24847.76
# M.Kern M Kern 20741.80 20672.93
# E.Los Angeles E Los Angeles 395154.00 341692.92
# M.Los Angeles M Los Angeles 94826.87 95416.42
# E.Mendocino E Mendocino 58844.13 57711.15
# H.Mendocino H Mendocino 35124.80 34448.51
# M.Mendocino M Mendocino 31844.47 31231.33
# E.Merced E Merced 50517.13 51424.65
# H.Merced H Merced 26696.87 27176.47
# M.Merced M Merced 27605.27 28101.18
# E.Orange E Orange 463536.33 465047.76
# M.Orange M Orange 110219.20 110578.59
# E.Plumas E Plumas 144284.20 146672.86
# H.Plumas H Plumas 143729.07 146108.54
# M.Plumas M Plumas 34266.87 34834.16
# E.San Diego E San Diego 1670497.13 1233144.04
# H.San Diego H San Diego 63386.13 63693.54
# M.San Diego M San Diego 96492.27 96960.22
# E.San Joaquin E San Joaquin 848243.73 848605.33
# H.San Joaquin H San Joaquin 79585.93 79619.86
# M.San Joaquin M San Joaquin 101387.53 101430.75
# E.Santa Clara E Santa Clara 737418.93 484164.71
# H.Santa Clara H Santa Clara 35478.07 35311.28
# M.Santa Clara M Santa Clara 187685.53 131278.63

0 comments on commit 31a80ef

Please sign in to comment.