修订版 | 160d2f95fe605f31e3a4822172381ce887262ee1 (tree) |
---|---|
时间 | 2022-03-16 18:53:51 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
A script using several cores to read the csv files in different locations.
@@ -0,0 +1,99 @@ | ||
1 | +rm(list=ls()) | |
2 | + | |
3 | +library(tidyverse) | |
4 | +library(janitor) | |
5 | +library(openxlsx) | |
6 | +library(stringr) | |
7 | +library(lubridate) | |
8 | +library(stringi) | |
9 | +library(future) | |
10 | +library(furrr) | |
11 | +library(openxlsx) | |
12 | + | |
13 | + | |
14 | +source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") | |
15 | + | |
16 | + | |
17 | + | |
18 | + | |
## Read one CSV file and keep only its "nominalna"/"brutto" columns, as numbers.
##
## file_pos: path of the CSV file; handed unchanged to read_csv().
##
## Returns the table produced by read_csv() and clean_names(), reduced to the
## columns whose cleaned name contains "nominalna" or "brutto", with every one
## of those columns converted through as.numeric().
read_large <- function(file_pos) {
  ## Progress line, printed once per file.
  print(paste0("Processing ", file_pos))

  cleaned <- clean_names(read_csv(file_pos))
  ## Disabled step, kept for reference:
  ## all_to_char() %>%

  amounts <- select(cleaned, contains(c("nominalna", "brutto")))

  ## Disabled row filter, kept for reference:
  ## filter(brutto_eur>=1e5)

  ## Only the selected columns are left at this point, so everything() covers
  ## exactly the "nominalna"/"brutto" columns.
  mutate(amounts, across(everything(), ~ as.numeric(.x)))
}
38 | + | |
39 | +######################################################################### | |
40 | + | |
41 | + | |
42 | + | |
## Number of parallel workers requested for the file-reading step.
n_cores <- 6

## Select the "multicore" backend of the future package for the
## future_map_dfr() calls in this script.
## NOTE(review): return_cores() is not defined in this file -- presumably it
## comes from the sourced stat_lib.R; confirm what it does with n_cores.
plan(multicore, workers = return_cores(n_cores))
46 | + | |
47 | + | |
48 | + | |
49 | + | |
## Load the SA case extract from the working directory and pass it through
## clean_data() and date_to_date().
## NOTE(review): read_excel() is not exported by any package attached with
## library() in this script (openxlsx offers read.xlsx()); presumably
## stat_lib.R attaches readxl -- confirm. clean_data() and date_to_date() are
## not defined in this file either (presumably also from stat_lib.R).
df_sa <- date_to_date(
  clean_data(
    read_excel("SA-Case_Full_Extract_Case_EC-608096.xlsx")
  )
)

## Case references of the rows whose internal qualifier is "Covid19-TF".
## Only the commented-out covid19_TF step near the end of the script uses it.
tf_cases <- pull(
  filter(df_sa, internal_qualifier_s == "Covid19-TF"),
  case_reference
)
57 | + | |
58 | + | |
59 | + | |
60 | + | |
## Directories holding the converted CSV files. The trailing "/" is relied on
## when directory and file name are pasted together below.
mypath <- "./data1/text_conversion/"
mypath2 <- "./data2/text_conversion/"

## list.files() treats `pattern` as a regular expression, not a shell glob.
## A bare "*csv" is therefore unanchored and also accepts names that merely
## contain "csv" (e.g. "x.csv.bak"). glob2rx() translates the intended glob
## into the anchored regular expression "^.*csv$".
flist <- list.files(path = mypath, pattern = glob2rx("*csv"))
flist2 <- list.files(path = mypath2, pattern = glob2rx("*csv"))

## Prefix every file name with its directory. The directory is repeated once
## per file name so that an empty listing yields an empty vector: pasting a
## length-one directory onto character(0) would instead return the bare
## directory path, which would then be read as if it were a CSV file.
flist_ext <- paste0(rep(mypath, length(flist)), flist)

flist_ext2 <- paste0(rep(mypath2, length(flist2)), flist2)
71 | + | |
72 | + | |
73 | + | |
## Read every listed file with read_large() through the future backend;
## future_map_dfr() row-binds the per-file results into one table per directory.
df_filtered1 <- flist_ext %>% future_map_dfr(read_large)
df_filtered2 <- flist_ext2 %>% future_map_dfr(read_large)

## Stack the rows coming from the two directories.
df.out <- list(df_filtered1, df_filtered2) %>% bind_rows()

## Disabled post-processing pipeline, kept for reference:
    ## convert_to_utf_all()  ## %>%
    ## distinct(akt_data, pkd_kod, ustawa_kod,dzien_udzelenia_pomocy,identyfikator,
    ##          .keep_all=T) %>%
    ## mutate(date=parse_date_time(akt_data, c("dmy", "ymd"))) %>%
    ## mutate(year=year(date)) %>%
    ## filter(year %in% c(2019, 2020)) %>%
    ## mutate(case_reference=substrLeft(srodekpomoc_nr,8)) %>%
    ## mutate(covid19_TF=if_else(case_reference %in% tf_cases,
    ##                           "yes", "no")) ## %>%
    ## filter(covid19_TF=="yes" & brutto_eur>=500e3 |
    ##        covid19_TF=="no" & brutto_eur>=100e3)

## Persist the combined table in the working directory.
saveRDS(object = df.out, file = "all_PL_tam.RDS")

## Marker printed once everything above has completed.
print("So far so good")