修订版 | c1a8b701554dd8371c2d9274a2ea4d79ae3bf908 (tree) |
---|---|
时间 | 2025-01-01 02:33:41 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I added a script showcasing how to work on a CSV file with both arrow and duckplyr.
@@ -0,0 +1,40 @@ | ||
1 | +rm(list=ls()) | |
2 | +library(tidyverse) | |
3 | +library(duckplyr) | |
4 | +library(tictoc) | |
5 | +library(arrow) | |
6 | +# NOTE(review): rm(list=ls()) above removes the objects listed by ls() from the current environment before anything else runs. | |
7 | +# NOTE(review): no tictoc function is called in the lines shown; both timings below use system.time(). | |
8 | +# Increased the size of the data: the two commented-out lines below build a 100,000,000-row tibble (x = 1..1e8, y = 20 letters repeated) and write it to test.csv. | |
9 | +## dd <- tibble(x=1:100000000, y=rep(LETTERS[1:20], 5000000)) | |
10 | + | |
11 | + | |
12 | +## write_csv(dd, "test.csv") | |
13 | + | |
14 | +# duckplyr path: read test.csv with duck_csv(), then time a per-y sum of x that is collected and converted to a tibble. | |
15 | +df <- duck_csv("test.csv") | |
16 | + | |
17 | +system.time({ | |
18 | +df_stat <- df |> | |
19 | + summarise(total=sum(x), .by = y) |> | |
20 | + collect() |> | |
21 | + as_tibble() | |
22 | + | |
23 | +}) | |
24 | + | |
25 | +# arrow path: open the same test.csv as a CSV dataset, then time the same per-y sum of x (group_by/summarise/ungroup) followed by collect(). | |
26 | +df2 <- open_dataset("test.csv", | |
27 | + format = "csv", | |
28 | + skip_rows = 0) | |
29 | + | |
30 | +system.time({ | |
31 | + df_stat2 <- df2 |> | |
32 | + group_by(y) |> | |
33 | + summarise(total=sum(x)) |> | |
34 | + ungroup() |> | |
35 | + collect() | |
36 | + | |
37 | +}) | |
38 | + | |
39 | + | |
40 | +print("So far so good") |