修订版 | c6eb618c6013552a3d1b3042b11d80725af581aa (tree) |
---|---|
时间 | 2022-10-23 16:46:19 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
The TSV file is now opened with open_dataset() and an explicit schema instead of being read with read_tsv_arrow(), so it is no longer loaded into memory.
@@ -8,10 +8,45 @@ | ||
8 | 8 | source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") |
9 | 9 | |
10 | 10 | |
11 | -## for mysterious reasons this does not work... | |
12 | -## data <- open_dataset("export.tsv", format="tsv") | |
13 | 11 | |
14 | -data <- read_tsv_arrow("export.tsv",as_data_frame=F) | |
12 | +## data <- read_tsv_arrow("export.tsv",as_data_frame=F) | |
13 | + | |
14 | +data <- open_dataset("export.tsv", | |
15 | + format = "tsv", | |
16 | + skip_rows = 1, | |
17 | + schema = schema( | |
18 | + AID_MEASURE_ID = string(), | |
19 | + DATE_CREATED = string(), | |
20 | + DATE_GRANTED = string(), | |
21 | + AA_PUBLISHED_DATE = string(), | |
22 | + SERVER_REF = string(), | |
23 | + AM_TITLE = string(), | |
24 | + AM_TITLE_EN = string(), | |
25 | + STATUS = string(), | |
26 | + AM_PROC_TYPE_CD = string(), | |
27 | + COFINANCE = string(), | |
28 | + OBJECTIVE = string(), | |
29 | + OTHER_OBJECTIVE_EN = string(), | |
30 | + AID_INSTRUMENT = string(), | |
31 | + OTHER_AID_INSTRUMENT_EN = string(), | |
32 | + BENEFICIARY_NAME = string(), | |
33 | + BENEFICIARY_NAME_ENGLISH = string(), | |
34 | + BENEFICIARY_NATIONAL_ID = string(), | |
35 | + BENEFICIARY_NAT_ID_TYPE_SD = string(), | |
36 | + BENEFICIARY_TYPE_SD = string(), | |
37 | + COUNTRY_SD = string(), | |
38 | + REGION_SD = string(), | |
39 | + SECTOR_SD = string(), | |
40 | + GRANTED_AMOUNT_FROM_EUR = double(), | |
41 | + NOMINAL_AMOUNT_EUR_FROM = double(), | |
42 | + GRANT_RANGE = string(), | |
43 | + GRANTING_AUTHORITY_NAME = string(), | |
44 | + GRANTING_AUTHORITY_NAME_EN = string(), | |
45 | + NUTS_CD = string(), | |
46 | + GRANTING_AUTHORITY_COUNTRY = string() | |
47 | + ) | |
48 | +) | |
49 | + | |
15 | 50 | |
16 | 51 | write_dataset( |
17 | 52 | data, |
@@ -99,52 +134,6 @@ | ||
99 | 134 | |
100 | 135 | |
101 | 136 | |
102 | - | |
103 | - | |
104 | - | |
105 | - | |
106 | - | |
107 | -## test <- df_new[1:10, ] |> | |
108 | -## collect() | |
109 | - | |
110 | - | |
111 | - | |
112 | - | |
113 | -## ranges <- df_new |> | |
114 | -## select(granted_aid_absolute_eur,lower_bound, upper_bound) |> | |
115 | -## collect() |> | |
116 | -## mutate(across(everything(), ~as.numeric(.x))) |> | |
117 | -## mutate(estimated_value=(lower_bound+upper_bound)/2) |> | |
118 | -## pattern_to_na(0) |> | |
119 | -## pull(estimated_value) | |
120 | - | |
121 | -## df_new <- df_new |> | |
122 | -## mutate(estimated_value=ranges) | |
123 | - | |
124 | - ## mutate(lower_bound=as.numeric(lower_bound), | |
125 | - ## upper_bound=as.numeric(upper_bound)) |> | |
126 | - ## mutate(estimated_value=(lower_bound+upper_bound)/2) |> | |
127 | - ## ## pattern_to_na(0) |> | |
128 | - ## mutate(granted_value_extended_eur = case_when( | |
129 | - ## !is.na(granted_aid_absolute_eur) ~ granted_aid_absolute_eur, | |
130 | - ## is.na(granted_aid_absolute_eur) & !is.na(estimated_value) ~estimated_value, | |
131 | - ## is.na(granted_aid_absolute_eur) & is.na(estimated_value) ~ nominal_aid_absolute_eur)) |> | |
132 | - | |
133 | - ## mutate(nominal_value_extended_eur= | |
134 | - ## case_when(!is.na(nominal_aid_absolute_eur) ~ nominal_aid_absolute_eur, | |
135 | - ## is.na(nominal_aid_absolute_eur)~granted_value_extended_eur | |
136 | - ## )) |> | |
137 | - ## select(-c(lower_bound, upper_bound, estimated_value)) | |
138 | -## |> | |
139 | - ## mutate(is_covid_case=if_else(case_reference %in% covid_data$case_reference, | |
140 | - ## "Yes", "No")) |> | |
141 | - ## mutate(granted_value_extended_eur=if_else(is_covid_case=="Yes", | |
142 | - ## NA_real_,granted_value_extended_eur )) | |
143 | - | |
144 | - | |
145 | - | |
146 | - | |
147 | - | |
148 | 137 | write_dataset( |
149 | 138 | df_new, |
150 | 139 | format = "csv", |