修订版 | 755b11e0bd678429fa40a6870432fea49cfde86f (tree) |
---|---|
时间 | 2025-01-03 03:35:54 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I can now choose whether to run the left_join with arrow or duckplyr.
@@ -1,9 +1,13 @@ | ||
1 | 1 | library(tidyverse) |
2 | 2 | |
3 | +choose_arrow <- 1 ## choose whether to use arrow or duckplyr | |
4 | + | |
5 | +if (choose_arrow==1){ | |
6 | + | |
3 | 7 | library(arrow) |
4 | 8 | |
5 | 9 | |
6 | -# Uncomment and run this only once | |
10 | +## Uncomment and run this only once | |
7 | 11 | ## dd <- tibble(x=1:100000000, y=rep(LETTERS[1:20], 5000000)) |
8 | 12 | |
9 | 13 |
@@ -32,27 +36,29 @@ | ||
32 | 36 | |
33 | 37 | df_out2|>glimpse() |
34 | 38 | |
35 | - | |
36 | -## uncomment to run --this takes a lot of memory on my system | |
37 | - | |
38 | -## library(duckplyr) | |
39 | +} else { | |
39 | 40 | |
40 | -## duck_exec("set memory_limit='1GB'") | |
41 | 41 | |
42 | -## df <- duck_csv("test.csv") | |
42 | +library(duckplyr) | |
43 | 43 | |
44 | -## system.time({ | |
45 | -## df_stat <- df |> | |
46 | -## summarise(total=sum(x), .by = y) | |
44 | +duck_exec("set memory_limit='1GB'") | |
45 | + | |
46 | +df <- duck_csv("test.csv") | |
47 | + | |
48 | +system.time({ | |
49 | +df_stat <- df |> | |
50 | + summarise(total=sum(x), .by = y) | |
47 | 51 | |
48 | 52 | |
49 | 53 | |
50 | -## df_out <- df |> | |
51 | -## left_join(y=df_stat, by=c("y")) |> | |
52 | -## collect() | |
54 | +df_out <- df |> | |
55 | + left_join(y=df_stat, by=c("y")) |> | |
56 | + as_tibble() | |
53 | 57 | |
54 | -## }) | |
58 | +}) | |
55 | 59 | |
60 | +df_out |> glimpse() | |
61 | +} | |
56 | 62 | |
57 | 63 | sessionInfo() |
58 | 64 |