• R/O
  • SSH

提交

标签
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

Commit MetaInfo

修订版501a2b56575175faf3d325786395c1e2ce9a51db (tree)
时间2022-07-07 00:00:33
作者Lorenzo Isella <lorenzo.isella@gmai...>
CommiterLorenzo Isella

Log Message

A script showing the revision of the csv_to_sqlite function.

更改概述

差异

diff -r c2a9e93444f3 -r 501a2b565751 R-codes/revise_create_sqlite.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R-codes/revise_create_sqlite.R Wed Jul 06 17:00:33 2022 +0200
@@ -0,0 +1,212 @@
1+rm(list=ls())
2+
3+library(tidyverse)
4+
5+library(DBI) # main DB interface
6+library(dbplyr) # dplyr back-end for DBs
7+library(RSQLite)
8+library(lubridate)
9+
10+
11+
12+csv_to_sqlite <- function(csv_file, sqlite_file, table_name,
13+ delim = ",",
14+ pre_process_size = 1000, chunk_size = 50000,
15+ show_progress_bar = TRUE, ...) {
16+ con <- dbConnect(SQLite(), dbname = sqlite_file)
17+
18+ # read a first chunk of data to extract the colnames and types
19+ # to figure out the date and the datetime columns
20+ df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...)
21+ date_cols <- df %>%
22+ select_if(is.Date) %>%
23+ colnames()
24+ datetime_cols <- df %>%
25+ select_if(is.POSIXt) %>%
26+ colnames()
27+
28+ # write the first batch of lines to SQLITE table, converting dates to string
29+ # representation
30+ df <- df %>%
31+ mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
32+ mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
33+ dbWriteTable(con, table_name, df, overwrite = TRUE)
34+
35+ # readr chunk functionality
36+ read_delim_chunked(
37+ csv_file,
38+ callback = append_to_sqlite(con = con, table_name = table_name,
39+ date_cols = date_cols,
40+ datetime_cols = datetime_cols),
41+ delim = delim,
42+ skip = pre_process_size, chunk_size = chunk_size,
43+ progress = show_progress_bar,
44+ col_names = colnames(df), ...)
45+ dbDisconnect(con)
46+}
47+
48+#' Callback function that appends new sections to the SQLite table.
49+#' @param con A valid connection to SQLite database.
50+#' @param table_name Name of the table to store the data table in the sqlite
51+#' database.
52+#' @param date_cols Name of columns containing Date objects
53+#' @param datetime_cols Name of columns containint POSIXt objects.
54+#'
55+#' @keywords internal
56+append_to_sqlite <- function(con, table_name,
57+ date_cols, datetime_cols) {
58+ #' @param x Data.frame we are reading from.
59+ function(x, pos) {
60+
61+ x <- as.data.frame(x)
62+ x <- x %>%
63+ mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
64+ mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
65+ # append data frame to table
66+ dbWriteTable(con, table_name, x, append = TRUE)
67+
68+ }
69+}
70+
71+
72+
73+
74+
75+
76+
77+
78+
79+
80+csv_to_sqlite2 <- function(csv_file, sqlite_file, table_name,
81+ delim = ",",
82+ pre_process_size = 1000, chunk_size = 50000,
83+ show_progress_bar = TRUE, ...) {
84+ con <- dbConnect(SQLite(), dbname = sqlite_file)
85+
86+ # read a first chunk of data to extract the colnames and types
87+ # to figure out the date and the datetime columns
88+ df <- read_delim(csv_file, delim = delim, n_max = pre_process_size, ...)
89+
90+
91+ date_cols <- df |>
92+ select(where(is.Date)) |>
93+ colnames()
94+
95+ datetime_cols <- df |>
96+ select(where(is.POSIXt)) |>
97+ colnames()
98+
99+ # write the first batch of lines to SQLITE table, converting dates to string
100+ # representation
101+ ## df <- df %>%
102+ ## mutate_at(.vars = date_cols, .funs = as.character.Date) %>%
103+ ## mutate_at(.vars = datetime_cols, .funs = as.character.POSIXt)
104+
105+
106+ df <- df |>
107+ mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |>
108+ mutate(across(all_of(datetime_cols), \(x) as.character.POSIXt(x)))
109+
110+
111+
112+ dbWriteTable(con, table_name, df, overwrite = TRUE)
113+
114+ # readr chunk functionality
115+ read_delim_chunked(
116+ csv_file,
117+ callback = append_to_sqlite2(con = con, table_name = table_name,
118+ date_cols = date_cols,
119+ datetime_cols = datetime_cols),
120+ delim = delim,
121+ skip = pre_process_size, chunk_size = chunk_size,
122+ progress = show_progress_bar,
123+ col_names = colnames(df), ...)
124+ dbDisconnect(con)
125+}
126+
127+
128+
129+
130+
131+append_to_sqlite2 <- function(con, table_name,
132+ date_cols, datetime_cols) {
133+ #' @param x Data.frame we are reading from.
134+ function(x, pos) {
135+
136+ x <- as.data.frame(x)
137+ x <- x |>
138+ mutate(across( all_of(date_cols), \(x) as.character.Date(x))) |>
139+ mutate(across( all_of(datetime_cols), \(x) as.character.POSIXt(x)))
140+ # append data frame to table
141+ dbWriteTable(con, table_name, x, append = TRUE)
142+
143+ }
144+}
145+
146+
147+
148+
149+
150+
151+
152+
153+
154+
155+
156+################################################################
157+################################################################
158+################################################################
159+################################################################
160+################################################################
161+
162+
163+
164+
165+
166+csv_to_sqlite("scoreboard.csv.gz", "scoreboard.db", "mytable")
167+
168+csv_to_sqlite2("scoreboard.csv.gz", "scoreboard2.db", "mytable")
169+
170+
171+
172+
173+con <- dbConnect(drv = RSQLite::SQLite(), # give me a SQLite connection
174+ dbname = "scoreboard.db")
175+
176+con2 <- dbConnect(drv = RSQLite::SQLite(), # give me a SQLite connection
177+ dbname = "scoreboard2.db")
178+
179+
180+table_list <- dbListTables(con) # List me the tables at the connection
181+
182+print(table_list)
183+
184+
185+
186+fields <- dbListFields(con, # Reach into my connection and ...
187+ "mytable") # tell me what fields does the `bakers` table have?
188+
189+
190+print("the database fields are")
191+print(fields)
192+
193+my_glimpse <- tbl(con, "mytable")
194+
195+print(my_glimpse)
196+
197+
198+
199+my_glimpse2 <- tbl(con2, "mytable")
200+
201+print(my_glimpse2)
202+
203+
204+
205+
206+## df <- read_csv("scoreboard.csv.gz")
207+
208+
209+
210+
211+
212+print("So far so good")