Skip to content

Commit

Permalink
add a true bootstrap (see #12)
Browse files Browse the repository at this point in the history
  • Loading branch information
tpoisot committed Oct 12, 2023
1 parent 60e5862 commit 3b213da
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 23 deletions.
6 changes: 6 additions & 0 deletions code/crossvalidate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,10 @@ function sm(f, M::Vector{ConfusionMatrix})
m = round(mean(v); digits=3)
s = round(std(v); digits=3)
return "$(m) ± $(s)"
end

"""
    sm(f, M::ConfusionMatrix)

Apply the measure `f` to the single confusion matrix `M` and return the result,
rounded to three digits, as a string.
"""
sm(f, M::ConfusionMatrix) = string(round(f(M); digits=3))
5 changes: 5 additions & 0 deletions code/splitters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,9 @@ function kfold(y, X; k=10, permute=true)
end
end
return folds
end

"""
    bootstrap(y, X; n=20)

Return `n` vectors of row indices, each obtained by calling `sample` on
`1:size(X, 1)` with `replace=true` and drawing `size(X, 1)` indices.

`y` and `X` are asserted to have the same size along their first dimension.
"""
function bootstrap(y, X; n=20)
    @assert size(y,1) == size(X, 1)
    rows = 1:size(X, 1)
    return [sample(rows, length(rows), replace=true) for _ in 1:n]
end
42 changes: 19 additions & 23 deletions slides.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -520,24 +520,19 @@ current_figure()
```{julia}
#| echo: true
#| output: false
C2 = zeros(ConfusionMatrix, length(folds))
for (i,f) in enumerate(folds)
trn, val = f
foldmodel = naivebayes(ty[trn], tX[trn,:])
foldpred = vec(mapslices(foldmodel, tX[val,:]; dims=2))
C2[i] = ConfusionMatrix(foldpred, ty[val], thr[m])
end
N_v2 = crossvalidate(naivebayes, ty, tX[:,available_variables], folds, thr[m])
```

## Measures on the confusion matrix

| | Initial | Var. sel. | Tuned |
|------------------|------------------|------------------|------------------|
| FPR | `{julia} round(mean(fpr.(C0)); digits=2)` | `{julia} round(mean(fpr.(C1)); digits=2)` | `{julia} round(mean(fpr.(C2)); digits=2)` |
| FNR | `{julia} round(mean(fnr.(C0)); digits=2)` | `{julia} round(mean(fnr.(C1)); digits=2)` | `{julia} round(mean(fnr.(C2)); digits=2)` |
| TPR | `{julia} round(mean(tpr.(C0)); digits=2)` | `{julia} round(mean(tpr.(C1)); digits=2)` | `{julia} round(mean(tpr.(C2)); digits=2)` |
| TNR | `{julia} round(mean(tnr.(C0)); digits=2)` | `{julia} round(mean(tnr.(C1)); digits=2)` | `{julia} round(mean(tnr.(C2)); digits=2)` |
| MCC | `{julia} round(mean(mcc.(C0)); digits=2)` | `{julia} round(mean(mcc.(C1)); digits=2)` | `{julia} round(mean(mcc.(C2)); digits=2)` |
| | BioClim | NBC | BioClim (v.s.) | NBC (v.s.) | NBC (v.s. + tuning) |
|-----|----|-----------|----|-----------|----|
| FPR | `{julia} sm(fpr, B_v0)` | `{julia} sm(fpr, N_v0)` | `{julia} sm(fpr, B_v1)` | `{julia} sm(fpr, N_v1)` | `{julia} sm(fpr, N_v2)` |
| FNR | `{julia} sm(fnr, B_v0)` | `{julia} sm(fnr, N_v0)` | `{julia} sm(fnr, B_v1)` | `{julia} sm(fnr, N_v1)` | `{julia} sm(fnr, N_v2)` |
| TPR | `{julia} sm(tpr, B_v0)` | `{julia} sm(tpr, N_v0)` | `{julia} sm(tpr, B_v1)` | `{julia} sm(tpr, N_v1)` | `{julia} sm(tpr, N_v2)` |
| TNR | `{julia} sm(tnr, B_v0)` | `{julia} sm(tnr, N_v0)` | `{julia} sm(tnr, B_v1)` | `{julia} sm(tnr, N_v1)` | `{julia} sm(tnr, N_v2)` |
| TSS | `{julia} sm(trueskill, B_v0)` | `{julia} sm(trueskill, N_v0)` | `{julia} sm(trueskill, B_v1)` | `{julia} sm(trueskill, N_v1)` | `{julia} sm(trueskill, N_v2)` |
| MCC | `{julia} sm(mcc, B_v0)` | `{julia} sm(mcc, N_v0)` | `{julia} sm(mcc, B_v1)` | `{julia} sm(mcc, N_v1)` | `{julia} sm(mcc, N_v2)` |

## Tuned model performance

Expand All @@ -546,20 +541,21 @@ We can retrain over *all* the training data
```{julia}
#| echo: true
#| output: false
finalmodel = naivebayes(ty, tX)
finalmodel = naivebayes(ty, tX[:,available_variables])
prediction = vec(mapslices(finalmodel, X[tidx,available_variables]; dims=2))
Cf = ConfusionMatrix(prediction, y[tidx], thr[m])
C = ConfusionMatrix(prediction, y[tidx], thr[m])
```

## Estimated performance

| | Final model |
|-----|------------------------------------|
| FPR | `{julia} round(fpr(Cf); digits=2)` |
| FNR | `{julia} round(fnr(Cf); digits=2)` |
| TPR | `{julia} round(tpr(Cf); digits=2)` |
| TNR | `{julia} round(tnr(Cf); digits=2)` |
| MCC | `{julia} round(mcc(Cf); digits=2)` |
| FPR | `{julia} sm(fpr, C)` |
| FNR | `{julia} sm(fnr, C)` |
| TPR | `{julia} sm(tpr, C)` |
| TNR | `{julia} sm(tnr, C)` |
| TSS | `{julia} sm(trueskill, C)` |
| MCC | `{julia} sm(mcc, C)` |

## Acceptable bias

Expand All @@ -577,10 +573,10 @@ predictors = bioclim_clipped
# Inter-quartile range of `x`: the 0.75 quantile minus the 0.25 quantile
function iqr(x)
    lower, upper = quantile(x, [0.25, 0.75])
    return upper - lower
end
foldmodels = [naivebayes(ty[f[1]], tX[f[1],:]) for f in folds]
# Index the labels with the same bootstrap sample `b` as the feature rows, so
# that every resampled row of `tX` stays paired with its own label in `ty`
samplemodels = [naivebayes(ty[b], tX[b,available_variables]) for b in bootstrap(ty, tX)]
variability = similar(first(predictors))
Threads.@threads for k in keys(variability)
bootstraps = [foldmodel([p[k] for p in predictors[available_variables]]) for foldmodel in foldmodels]
bootstraps = [samplemodel([p[k] for p in predictors[available_variables]]) for samplemodel in samplemodels]
variability[k] = iqr(bootstraps)
end
```
Expand Down

0 comments on commit 3b213da

Please sign in to comment.