Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 6411097
Author: Tmonster <[email protected]>
Date:   Thu Apr 6 11:26:36 2023 +0000

    more fixes for duckdb-latest

commit 2cb611f
Author: Tmonster <[email protected]>
Date:   Thu Apr 6 10:45:14 2023 +0000

    fix run.sh script and ver-duckdb-latest script

commit a15c3dd
Author: Tmonster <[email protected]>
Date:   Thu Apr 6 12:36:02 2023 +0200

    update .gitignore and add duckdb-latest dir

commit 76530c8
Author: Tmonster <[email protected]>
Date:   Thu Apr 6 12:32:33 2023 +0200

    add run.sh

commit de7803d
Author: Tmonster <[email protected]>
Date:   Thu Apr 6 12:32:09 2023 +0200

    add duckdb latest
  • Loading branch information
Tmonster committed Apr 6, 2023
1 parent 40379d3 commit 17d41db
Show file tree
Hide file tree
Showing 12 changed files with 609 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ GA/
utils/
*/py-*/
*/r-*/
duckdb-latest/duckdb
report-done
db-benchmark.gh-pages/
run.out
Expand Down
37 changes: 34 additions & 3 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ solution.dict = {list(
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100"))
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100"))
)}
#barplot(rep(c(0L,1L,1L), length(solution.dict)),
# col=rev(c(rbind(sapply(solution.dict, `[[`, "color"), "black"))),
Expand Down Expand Up @@ -194,6 +195,18 @@ groupby.syntax.dict = {list(
"largest two v3 by id6" = "SELECT id6, v3 AS largest2_v3 FROM (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM x WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2",
"regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM tbl GROUP BY id2, id4",
"sum v3 count by id1:id6" = "SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count(*) AS count FROM tbl GROUP BY id1, id2, id3, id4, id5, id6"
)},
"duckdb-latest" = {c(
  # Query syntax strings for the "duckdb-latest" entry of groupby.syntax.dict,
  # keyed by groupby question name. Every query names its input table `tbl`.
  "sum v1 by id1" = "SELECT id1, sum(v1) AS v1 FROM tbl GROUP BY id1",
  "sum v1 by id1:id2" = "SELECT id1, id2, sum(v1) AS v1 FROM tbl GROUP BY id1, id2",
  "sum v1 mean v3 by id3" = "SELECT id3, sum(v1) AS v1, mean(v3) AS v3 FROM tbl GROUP BY id3",
  "mean v1:v3 by id4" = "SELECT id4, mean(v1) AS v1, mean(v2) AS v2, mean(v3) AS v3 FROM tbl GROUP BY id4",
  "sum v1:v3 by id6" = "SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM tbl GROUP BY id6",
  "median v3 sd v3 by id4 id5" = "SELECT id4, id5, quantile_cont(v3, 0.5) AS median_v3, stddev(v3) AS sd_v3 FROM tbl GROUP BY id4, id5",
  "max v1 - min v2 by id3" = "SELECT id3, max(v1)-min(v2) AS range_v1_v2 FROM tbl GROUP BY id3",
  # The inner query reads from `tbl` like every other entry in this vector.
  "largest two v3 by id6" = "SELECT id6, v3 AS largest2_v3 FROM (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM tbl WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2",
  "regression v1 v2 by id2 id4" = "SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM tbl GROUP BY id2, id4",
  "sum v3 count by id1:id6" = "SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count(*) AS count FROM tbl GROUP BY id1, id2, id3, id4, id5, id6"
)}
)}
groupby.query.exceptions = {list(
Expand All @@ -207,7 +220,8 @@ groupby.query.exceptions = {list(
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"duckdb" = list()
"duckdb" = list(),
"duckdb-latest" = list()
)}
groupby.data.exceptions = {list( # exceptions as of run 1575727624
"data.table" = {list(
Expand Down Expand Up @@ -258,6 +272,10 @@ groupby.data.exceptions = {list(
"duckdb" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0"),
# "incorrect: duckdb#1737" = c("G1_1e7_1e2_5_0","G1_1e8_1e2_5_0")
)},
# "duckdb-latest" entry of groupby.data.exceptions: an empty list, i.e. no
# groupby data cases are registered as exceptions for this solution.
# NOTE(review): the two commented-out entries are the same text as in the
# "duckdb" entry (they cite duckdb#1737); confirm whether they are relevant to
# duckdb-latest before ever re-enabling them.
"duckdb-latest" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0"),
# "incorrect: duckdb#1737" = c("G1_1e7_1e2_5_0","G1_1e8_1e2_5_0")
)}
)}
groupby.exceptions = task.exceptions(groupby.query.exceptions, groupby.data.exceptions)
Expand Down Expand Up @@ -347,6 +365,13 @@ join.syntax.dict = {list(
"medium outer on int" = "SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)",
"medium inner on factor" = "SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)",
"big inner on int" = "SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)"
)},
# "duckdb-latest" entry of join.syntax.dict: one SQL string per join question.
# In every query `x` is the left-hand table and is joined to `small`, `medium`
# or `big` with a USING clause on a single id column; only "medium outer on int"
# uses LEFT JOIN, the others use a plain JOIN.
# NOTE(review): the strings look identical to the "duckdb" entry; keep the two
# in sync if either changes.
"duckdb-latest" = {c(
"small inner on int" = "SELECT x.*, small.id4 AS small_id4, v2 FROM x JOIN small USING (id1)",
"medium inner on int" = "SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x JOIN medium USING (id2)",
"medium outer on int" = "SELECT x.*, medium.id1 AS medium_id1, medium.id4 AS medium_id4, medium.id5 AS medium_id5, v2 FROM x LEFT JOIN medium USING (id2)",
"medium inner on factor" = "SELECT x.*, medium.id1 AS medium_id1, medium.id2 AS medium_id2, medium.id4 AS medium_id4, v2 FROM x JOIN medium USING (id5)",
"big inner on int" = "SELECT x.*, big.id1 AS big_id1, big.id2 AS big_id2, big.id4 AS big_id4, big.id5 AS big_id5, big.id6 AS big_id6, v2 FROM x JOIN big USING (id3)"
)}
)}
join.query.exceptions = {list(
Expand All @@ -360,7 +385,8 @@ join.query.exceptions = {list(
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"duckdb" = list()
"duckdb" = list(),
"duckdb-latest" = list()
)}
join.data.exceptions = {list( # exceptions as of run 1575727624
"data.table" = {list(
Expand Down Expand Up @@ -402,6 +428,11 @@ join.data.exceptions = {list(
# "internal error: duckdb#1739" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1"),
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
#"incorrect: duckdb#1737" = c("J1_1e7_NA_5_0","J1_1e8_NA_5_0")
)},
# "duckdb-latest" entry of join.data.exceptions. The only active entry marks the
# three J1_1e9 data cases as "out of memory"; the duckdb#1739 and duckdb#1737
# entries are commented out and therefore inactive.
# NOTE(review): this is the same content as the "duckdb" entry. Confirm that
# duckdb-latest really runs out of memory on the 1e9 join data; otherwise this
# exception may be stale for this solution.
"duckdb-latest" = {list(
# "internal error: duckdb#1739" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1"),
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")#,
#"incorrect: duckdb#1737" = c("J1_1e7_NA_5_0","J1_1e8_NA_5_0")
)}
)}
join.exceptions = task.exceptions(join.query.exceptions, join.data.exceptions)
Expand Down
2 changes: 2 additions & 0 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ arrow,groupby
arrow,join
duckdb,groupby
duckdb,join
duckdb-latest,groupby
duckdb-latest,join
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R",
"data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl"
Expand Down
2 changes: 1 addition & 1 deletion _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R",
"data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl"
Expand Down
42 changes: 42 additions & 0 deletions _report/history.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,48 @@ plot(d, "duckdb", 1e8, "join")
plot(d, "duckdb", 1e9, "join")
```

### duckdb-latest {.tabset .tabset-fade .tabset-pills}

#### groupby {.tabset .tabset-fade .tabset-pills}

##### 0.5 GB

```{r duckdb-latest.groupby.1e7}
plot(d, "duckdb-latest", 1e7, "groupby")
```

##### 5 GB

```{r duckdb-latest.groupby.1e8}
plot(d, "duckdb-latest", 1e8, "groupby")
```

##### 50 GB {.active}

```{r duckdb-latest.groupby.1e9}
plot(d, "duckdb-latest", 1e9, "groupby")
```

#### join {.tabset .tabset-fade .tabset-pills}

##### 0.5 GB

```{r duckdb-latest.join.1e7}
plot(d, "duckdb-latest", 1e7, "join")
```

##### 5 GB {.active}

```{r duckdb-latest.join.1e8}
plot(d, "duckdb-latest", 1e8, "join")
```

##### 50 GB

```{r duckdb-latest.join.1e9}
plot(d, "duckdb-latest", 1e9, "join")
```

## Details

### Environment
Expand Down
2 changes: 1 addition & 1 deletion _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
get_report_solutions = function() {
c("data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "clickhouse", "cudf", "polars","arrow","duckdb")
c("data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest")
}
get_data_levels = function() {
## groupby
Expand Down
Loading

0 comments on commit 17d41db

Please sign in to comment.