修订版 | 569fd3539a771af93df5e82d49c77b29f19bf175 (tree) |
---|---|
时间 | 2022-09-29 23:19:28 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I added a script that searches for multiple keywords in parallel and shows the progress of the search.
@@ -0,0 +1,80 @@ | ||
# Keyword search over the TAM and scoreboard tables, run in parallel.
#
# NOTE: this script used to begin with rm(list = ls()). That call only removes
# objects from the global environment; it does not detach packages or reset
# options, so it never gave a truly clean session. Run the script in a fresh
# R session (e.g. via Rscript) instead.

library(tidyverse)
library(openxlsx)
library(janitor)
library(furrr)

# Project helper library. The helpers used further down that are not defined
# in this file (return_cores, all_to_lower, clean_data, save_excel,
# find_text_filter_exact_matches) are presumably defined here -- TODO confirm.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")

#' Search a data frame for several keywords, one future per keyword
#'
#' Calls `find_text_filter_exact_matches(df, keyword)` once for every element
#' of `keywords` through `furrr::future_map_dfr()`, so the per-keyword calls
#' are dispatched on whichever `future` plan is active, and row-binds the
#' per-keyword results into one data frame. A progress bar is displayed while
#' the calls run.
#'
#' @param df Data frame to search. It is passed unchanged to
#'   `find_text_filter_exact_matches()`, which is not defined in this file
#'   (presumably it comes from the sourced stat_lib.R -- TODO confirm).
#' @param keywords Vector (or list) of keywords; each element is searched
#'   separately.
#' @return The row-bound results of all per-keyword calls.
find_text_multiple_keywords_exact_matches_future <- function(df, keywords) {
  future_map_dfr(
    keywords,
    function(keyword) find_text_filter_exact_matches(df, keyword),
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    .progress = TRUE
  )
}

#################################################################
# Parallel back-end ----

# Requested number of workers. The value goes through return_cores(), a helper
# that is not defined in this file (presumably it caps the request at the
# number of available cores -- TODO confirm).
n_cores <- 2

# Documented calling convention: strategy first, its settings as further
# arguments to plan().
# NOTE(review): per the future documentation, multicore relies on forking and
# is not available on every platform/front-end -- confirm the target setup.
plan(multicore, workers = return_cores(n_cores))

# Input data ----
# Each table is read from its RDS file and passed through all_to_lower(), a
# helper not defined in this file (presumably it lower-cases the text so that
# matching ignores case -- TODO confirm). Piping directly avoids keeping a
# second, untransformed copy of each table around.
df_sc <- readRDS("scoreboard.RDS") |>
  all_to_lower()

df_tam <- readRDS("TAM_cleaned_for_shiny.RDS") |>
  all_to_lower()

# One keyword per line in words.txt, read as a single unnamed column.
# clean_data() is another external helper; the column is pulled as `x1`
# afterwards, so it presumably normalises the column names -- TODO confirm.
keywords <- read_delim("words.txt", delim = "\n", col_names = FALSE) |>
  clean_data() |>
  all_to_lower() |>
  ## mutate(x1=paste("(?i)", x1, sep="")) |>
  pull(x1)

print("Done reading the data")

# TAM search ----
# Full result set: the row-bound output of one search per keyword.
tam_words <- find_text_multiple_keywords_exact_matches_future(df_tam, keywords)

save_excel(tam_words, "tam_results.xlsx")

# Short version: keep only the first row for each case_reference.
tam_words_unique <- tam_words |>
  distinct(case_reference, .keep_all = TRUE)

save_excel(tam_words_unique, "tam_results_short.xlsx")

print("Done reading the TAM")

# Scoreboard search ----
scoreboard_words <- find_text_multiple_keywords_exact_matches_future(
  df_sc,
  keywords
)

save_excel(scoreboard_words, "scoreboard_results.xlsx")

# Short version: keep only the first row for each case_number.
scoreboard_words_unique <- scoreboard_words |>
  distinct(case_number, .keep_all = TRUE)

save_excel(scoreboard_words_unique, "scoreboard_results_short.xlsx")

print("So far so good")