修订版 | 5c3b73e73ddbbc798b6b3e0db3a0b9cc0a0514de (tree) |
---|---|
时间 | 2022-10-23 17:01:34 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I fixed a bug (duplicate rows in covid) and now also save the output as a Parquet file.
@@ -61,7 +61,8 @@ | ||
61 | 61 | covid <- open_dataset("SA-Covid19.csv", format="csv") |> |
62 | 62 | rename("case_reference"="Case Reference") |> |
63 | 63 | select(case_reference) |> |
64 | - mutate(is_covid_case="Yes") | |
64 | + mutate(is_covid_case="Yes") |> | |
65 | + distinct() | |
65 | 66 | |
66 | 67 | |
67 | 68 |
@@ -141,4 +142,13 @@ | ||
141 | 142 | max_rows_per_file = 1e7 |
142 | 143 | ) |
143 | 144 | |
145 | + | |
146 | +write_dataset( | |
147 | + df_new, | |
148 | + format = "parquet", | |
149 | + path = "./data_output/", | |
150 | + max_rows_per_file = 1e7 | |
151 | +) | |
152 | + | |
153 | + | |
144 | 154 | print("So far so good") |