修订版 | 501a2b56575175faf3d325786395c1e2ce9a51db (tree) |
---|---|
时间 | 2022-07-07 00:00:33 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Commiter | Lorenzo Isella |
A script showing the revision of the csv_to_sqlite function.
@@ -0,0 +1,212 @@ | ||
1 | +rm(list=ls()) | |
2 | + | |
3 | +library(tidyverse) | |
4 | + | |
5 | +library(DBI) # main DB interface | |
6 | +library(dbplyr) # dplyr back-end for DBs | |
7 | +library(RSQLite) | |
8 | +library(lubridate) | |
9 | + | |
10 | + | |
11 | + | |
12 | +csv_to_sqlite <- function(csv_file, sqlite_file, table_name, | |
13 | + delim = ",", | |
14 | + pre_process_size = 1000, chunk_size = 50000, | |
15 | + show_progress_bar = TRUE, ...) { | |
16 | + con <- dbConnect(SQLite(), dbname = sqlite_file) | |
17 | + | |
18 | + # read a first chunk of data to extract the colnames and types | |
19 | + # to figure out the date and the datetime columns | |
20 | + df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...) | |
21 | + date_cols <- df %>% | |
22 | + select_if(is.Date) %>% | |
23 | + colnames() | |
24 | + datetime_cols <- df %>% | |
25 | + select_if(is.POSIXt) %>% | |
26 | + colnames() | |
27 | + | |
28 | + # write the first batch of lines to SQLITE table, converting dates to string | |
29 | + # representation | |
30 | + df <- df %>% | |
31 | + mutate_at(.vars = date_cols, .funs = as.character.Date) %>% | |
32 | + mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt) | |
33 | + dbWriteTable(con, table_name, df, overwrite = TRUE) | |
34 | + | |
35 | + # readr chunk functionality | |
36 | + read_delim_chunked( | |
37 | + csv_file, | |
38 | + callback = append_to_sqlite(con = con, table_name = table_name, | |
39 | + date_cols = date_cols, | |
40 | + datetime_cols = datetime_cols), | |
41 | + delim = delim, | |
42 | + skip = pre_process_size, chunk_size = chunk_size, | |
43 | + progress = show_progress_bar, | |
44 | + col_names = colnames(df), ...) | |
45 | + dbDisconnect(con) | |
46 | +} | |
47 | + | |
48 | +#' Callback function that appends new sections to the SQLite table. | |
49 | +#' @param con A valid connection to SQLite database. | |
50 | +#' @param table_name Name of the table to store the data table in the sqlite | |
51 | +#' database. | |
52 | +#' @param date_cols Name of columns containing Date objects | |
53 | +#' @param datetime_cols Name of columns containint POSIXt objects. | |
54 | +#' | |
55 | +#' @keywords internal | |
56 | +append_to_sqlite <- function(con, table_name, | |
57 | + date_cols, datetime_cols) { | |
58 | + #' @param x Data.frame we are reading from. | |
59 | + function(x, pos) { | |
60 | + | |
61 | + x <- as.data.frame(x) | |
62 | + x <- x %>% | |
63 | + mutate_at(.vars = date_cols, .funs = as.character.Date) %>% | |
64 | + mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt) | |
65 | + # append data frame to table | |
66 | + dbWriteTable(con, table_name, x, append = TRUE) | |
67 | + | |
68 | + } | |
69 | +} | |
70 | + | |
71 | + | |
72 | + | |
73 | + | |
74 | + | |
75 | + | |
76 | + | |
77 | + | |
78 | + | |
79 | + | |
80 | +csv_to_sqlite2 <- function(csv_file, sqlite_file, table_name, | |
81 | + delim = ",", | |
82 | + pre_process_size = 1000, chunk_size = 50000, | |
83 | + show_progress_bar = TRUE, ...) { | |
84 | + con <- dbConnect(SQLite(), dbname = sqlite_file) | |
85 | + | |
86 | + # read a first chunk of data to extract the colnames and types | |
87 | + # to figure out the date and the datetime columns | |
88 | + df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...) | |
89 | + | |
90 | + | |
91 | + date_cols <- df |> | |
92 | + select(where(is.Date)) |> | |
93 | + colnames() | |
94 | + | |
95 | + datetime_cols <- df |> | |
96 | + select(where(is.POSIXt)) |> | |
97 | + colnames() | |
98 | + | |
99 | + # write the first batch of lines to SQLITE table, converting dates to string | |
100 | + # representation | |
101 | + ## df <- df %>% | |
102 | + ## mutate_at(.vars = date_cols, .funs = as.character.Date) %>% | |
103 | + ## mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt) | |
104 | + | |
105 | + | |
106 | + df <- df |> | |
107 | + mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |> | |
108 | + mutate(across(all_of(datetime_cols), \(x) as.character.POSIXt(x))) | |
109 | + | |
110 | + | |
111 | + | |
112 | + dbWriteTable(con, table_name, df, overwrite = TRUE) | |
113 | + | |
114 | + # readr chunk functionality | |
115 | + read_delim_chunked( | |
116 | + csv_file, | |
117 | + callback = append_to_sqlite2(con = con, table_name = table_name, | |
118 | + date_cols = date_cols, | |
119 | + datetime_cols = datetime_cols), | |
120 | + delim = delim, | |
121 | + skip = pre_process_size, chunk_size = chunk_size, | |
122 | + progress = show_progress_bar, | |
123 | + col_names = colnames(df), ...) | |
124 | + dbDisconnect(con) | |
125 | +} | |
126 | + | |
127 | + | |
128 | + | |
129 | + | |
130 | + | |
131 | +append_to_sqlite2 <- function(con, table_name, | |
132 | + date_cols, datetime_cols) { | |
133 | + #' @param x Data.frame we are reading from. | |
134 | + function(x, pos) { | |
135 | + | |
136 | + x <- as.data.frame(x) | |
137 | + x <- x |> | |
138 | + mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |> | |
139 | + mutate(across( all_of(datetime_cols), \(x) as.character.POSIXt(x))) | |
140 | + # append data frame to table | |
141 | + dbWriteTable(con, table_name, x, append = TRUE) | |
142 | + | |
143 | + } | |
144 | +} | |
145 | + | |
146 | + | |
147 | + | |
148 | + | |
149 | + | |
150 | + | |
151 | + | |
152 | + | |
153 | + | |
154 | + | |
155 | + | |
156 | +################################################################ | |
157 | +################################################################ | |
158 | +################################################################ | |
159 | +################################################################ | |
160 | +################################################################ | |
161 | + | |
162 | + | |
163 | + | |
164 | + | |
165 | + | |
166 | +csv_to_sqlite("scoreboard.csv.gz", "scoreboard.db", "mytable") | |
167 | + | |
168 | +csv_to_sqlite2("scoreboard.csv.gz", "scoreboard2.db", "mytable") | |
169 | + | |
170 | + | |
171 | + | |
172 | + | |
173 | +con <- dbConnect(drv = RSQLite::SQLite(), # give me a SQLite connection | |
174 | + dbname = "scoreboard.db") | |
175 | + | |
176 | +con2 <- dbConnect(drv = RSQLite::SQLite(), # give me a SQLite connection | |
177 | + dbname = "scoreboard2.db") | |
178 | + | |
179 | + | |
180 | +table_list <- dbListTables(con) # List me the tables at the connection | |
181 | + | |
182 | +print(table_list) | |
183 | + | |
184 | + | |
185 | + | |
186 | +fields <- dbListFields(con, # Reach into my connection and ... | |
187 | + "mytable") # tell me what fields does the `bakers` table have? | |
188 | + | |
189 | + | |
190 | +print("the database fields are") | |
191 | +print(fields) | |
192 | + | |
193 | +my_glimpse <- tbl(con, "mytable") | |
194 | + | |
195 | +print(my_glimpse) | |
196 | + | |
197 | + | |
198 | + | |
199 | +my_glimpse2 <- tbl(con2, "mytable") | |
200 | + | |
201 | +print(my_glimpse2) | |
202 | + | |
203 | + | |
204 | + | |
205 | + | |
206 | +## df <- read_csv("scoreboard.csv.gz") | |
207 | + | |
208 | + | |
209 | + | |
210 | + | |
211 | + | |
212 | +print("So far so good") |