Rev. | 5de46eb7bec4ef014306eb072775d685ee790fa6 |
---|---|
大小 | 12,157 字节 |
时间 | 2024-07-06 04:10:48 |
作者 | Lorenzo Isella |
Log Message | I rewrote part of the script. |
rm(list=ls())
library(tidyverse)
library(janitor)
library(stringr)
library(tidytext)
library(SnowballC)
library(udpipe)
library(furrr)
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
semi_jaccard <- function(x, y){
lx <- length(unique(x))
ly <- length(unique(y))
lint <- length(intersect(x,y))
res <- lint/lx
return(res)
}
#######################################################################
n_cores <- 2
plan(multicore(workers=return_cores(n_cores)))
do_labels <- 1
## see https://cran.r-project.org/web/packages/udpipe/vignettes/udpipe-usecase-postagging-lemmatisation.html
# Download and load the model
model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model$file_model)
jobs <- read_csv("../input/wi_dataset.csv")
jobs_token <- jobs |>
mutate(description=remove_text_within_brackets(description, " ")) |>
mutate(description=remove_special_char_num(description, " ")) |>
mutate(description=remove_short_words(description, 3)) |>
unnest_tokens(word, description) |>
mutate(word=wordStem(word)) |>
anti_join(stop_words) |>
distinct()
jobs_list <- jobs_token |>
select(-title) |>
group_by(id) |>
group_split()
labels <- read_csv("../input/wi_labels.csv") |>
select(-starts_with("isco"))
labels_split <- labels |>
group_by(code) |>
group_split()
labels_annotated <- future_map(labels_split, \(x) udpipe_annotate(ud_model, x$description) |> as_tibble() )
labels_keywords <- future_map(labels_annotated, \(x) keywords_rake(x = x, term = "lemma",
group = "doc_id",
relevant = x$upos %in% c("NOUN" ## , "ADJ"
),
ngram_max=1,
n_min=1)
)
labels_keywords_df <- labels_keywords |>
list_to_df() |> as_tibble() |>
filter(freq>1 | rake>0) |>
group_by(source) |>
summarise(description=paste(keyword, collapse=" ")) |>
ungroup() |>
mutate(source=as.numeric(source)) |>
arrange(source)
labels_token_df <- labels_keywords_df |>
mutate(description=remove_text_within_brackets(description, " ")) |>
mutate(description=remove_special_char_num(description, " ")) |>
mutate(description=remove_short_words(description, 3)) |>
unnest_tokens(word, description) |>
mutate(word=wordStem(word)) |>
anti_join(stop_words) |>
distinct()
labels_list_df <- labels_token_df |>
## select(-label) |>
group_by(source) |>
group_split()
## labels_token <- labels |>
## mutate(description=remove_text_within_brackets(description, " ")) |>
## mutate(description=remove_special_char_num(description, " ")) |>
## mutate(description=remove_short_words(description, 3)) |>
## unnest_tokens(word, description) |>
## mutate(word=wordStem(word)) |>
## anti_join(stop_words) |>
## distinct()
## labels_list <- labels_token |>
## select(-label) |>
## group_by(code) |>
## group_split()
mm3 <- outer(
labels_list_df,
jobs_list[1:100],
Vectorize(\(x, y) jaccard(x$word, y$word))
)
## isco_labels <- read_csv("../input/ISCO-08_EN.csv", col_types = cols(.default = "c")) |>
## mutate(unit4=str_pad(unit, width=4, side="left", pad="0")) |>
## clean_names()|>
## rename("description_isco"="description") |>
## filter(nchar(sub_major)>=2)
## isco_distinct <- isco_labels |>
## get_dupes_short(major, sub_major, minor, unit)
## test <- isco_labels |>
## filter(nchar(unit)<4)
## wi_labels <- read_csv("../input/wi_labels.csv")
## labels_tot <- wi_labels |>
## left_join(y=isco_labels, by=c("code"="unit4")) |>
## reorder_columns()
## hand_keywords <- list()
## hand_keywords[[1]] <- tibble(keywords=c("armed", "forces", "officer", "leadership", "management",
## "lieutenant", "admiral", "commodore", "marshal", "brigadier","captain", "colonel", "flying", "general", "major","midshipman", "commander", "cadet", "customs", "inspectors", "squadron", "leader", "custom" ))
## hand_keywords[[2]] <- tibble(keywords=c("Non-commissioned", "armed", "forces" ,"officers",
## "Sub-major", "Group", "sergeant", "warrant","Boatswain" ))
## hand_keywords[[3]] <- tibble(keywords=c( "Armed", "forces", "ranks", "conscripted",
## "officers", "Airman", "Bombardier","Corporal", "Coxswain",
## "Gunner", "Infantryman", "Infantrywoman",
## "Paratrooper", "Rifleman", "riflewoman",
## "Seaman", "woman" ))
## hand_keywords[[4]] <- tibble(keywords=c("Legislators", " policies" , "national", "state", "regional" , "local", "governments" , "international", "agencies", "ratify", "amend",
## "repeal" ,"laws", "public" ,"rules" ," regulations",
## "elected" , "members" , "parliaments", "councils",
## "proceedings", "bodies", "administrative",
## "statutory", "constitutional", "serving" ,
## "boards" , "investigating", "constituencies" ,
## "community" , "opinion", "negotiating", "senior",
## "administrators", "officials","councillor","minister",
## "Mayor", "President", "Secretary", "senator" ))
## hand_keywords[[5]] <- tibble(keywords=c(
## "Senior", "government", "officials", "advise", "policy", "interpretation",
## "implementation" , "government", "policies" , "legislation",
## "departments", "agencies" ,"represent", "country", "intergovernmental",
## "organizations", "plan", "organize", "direct", "control" , "evaluate",
## "municipal", "local", "regional" , "national", "government",
## "departments", "boards", "agencies", "commissions", "legislation",
## "policies", "budgets", "laws" , "regulations", "amendments", "objectives",
## "programmes", "procedures" , "recommending", "reviewing", "approving",
## "middle", "managers", "presentations",
## "Ambassador", "Civil", "service", "commissioner", "Consul",
## "Director", "head", "constable"
## ))
## hand_keywords[[6]] <- tibble(keywords=c(
## "Traditional", "chiefs" , "heads" , "villages",
## "legislative", "administrative" , "ceremonial",
## "tasks" , "duties", "ancient", "traditions",
## "rights" , "responsibilities", "local", "regional" , "national",
## "authorities",
## "allocating", "communal", "land", "resources",
## "households", "community", "collecting",
## "distributing", "surplus", "production",
## "disputes", "members", "disciplining", "violation",
## "rules" , "customs", "ceremonial", "duties",
## "births", "marriages", "deaths", "harvests" , "councils",
## "chief", "head"
## ))
## hand_keywords[[7]] <- tibble(keywords=c(
## "Senior", "officials", "special","interest", "organizations",
## "determine", "formulate" , "direct",
## "implementation" , "policies" , "political","party" , "trade", "unions",
## "employers", "organizations", "industry", "associations",
## "humanitarian" , "charity",
## "sports", "policies", "rules" ,
## "regulations","reviewing", "operations",
## "reporting" ,"boards" , "directors",
## "membership" , "funding", "agencies",
## "developed" ,"budgetary", "control",
## "monitoring", "evaluating",
## "performance",
## "enterprise" , "objectives", "policies",
## "Chairperson", "Director", "Leader", "secretary","environment", "protection" , "human", "rights"
## ))
## hand_keywords[[8]] <- tibble(keywords=c(
## "Managing", "directors", "chief", "executives", "formulate",
## "review", "policies", "plan",
## "direct", "coordinate", "evaluate", "activities",
## "enterprises" , "organizations", "guidelines", "board",
## "directors", "governing", "body" , "operations", "results".\nTasks include -\n(a) planning, directing and coordinating the general functioning of an enterprise or organization;\n(b) reviewing the operations and results of the enterprise or organization and reporting to boards of directors and governing bodies;\n(c) determining objectives, strategies, policies and programmes for the enterprise or organization;\n(d) providing overall leadership and management to the enterprise or organization;\n(e) establishing and managing budgets, controlling expenditure and ensuring the efficient use of resources;\n(f) authorizing material, human and financial resources to implement organizational policies and programmes;\n(g) monitoring and evaluating performance of the organization or enterprise against established objectives and policies;\n(h) consulting with senior subordinate staff and reviewing recommendations and reports;\n(i) representing the organization at official occasions and board meetings, in negotiations and at conventions, seminars, public hearings and forums; \n(j) selecting or approving the selection of senior staff;\n(k) ensuring the organization complies with relevant legislation and regulations.\nExamples of the occupations classified here:\n- Chief executive\n- Managing director\n- Regional manager\n\nNote\nRegional managers and other senior managers who coordinate and supervise the activities of subordinate managers who have a diverse range of functional responsibilities are included in Unit Group 1120: Managing Directors and Chief Executives. Managers responsible for specialized functions within a specific geographic area are excluded from this unit group. For example, regional sales managers are classified in Unit Group 1221: Sales and Marketing Managers. Jobs whose principal responsibility is to participate as a member of the board of directors of one or more enterprises or organizations are included in Unit Group 1120: Managing Directors and Chief Executives. Chief executives of government-owned enterprises are included in Unit Group 1120: Managing Directors and Chief Executives.\n
## ))
## ## this should be hand_keywords[[155]]
## ## hand_keywords[[6]] <- tibble(keywords=c("Pharmaceutical", "technicians" , "assistants",
## ## "dispensing", "medicinal", "products",
## ## "guidance" , "pharmacist", "health", "professional",
## ## "preparing", "medications", "compounds",
## ## "drugs" , "instructions", "refill" , "record","keeping", "storage",
## ## "security", "assisting", "pricing", "Pharmacology",
## ## "aide"
## ## ))
## df_key <- hand_keywords |>
## list_to_df(source_col_name="number")
## df_key_token <- df_key |>
## mutate(keywords=remove_special_char_num(keywords, " ")) |>
## mutate(keywords=remove_short_words(keywords, 3)) |>
## unnest_tokens(word, keywords) |>
## mutate(word=wordStem(word)) |>
## anti_join(stop_words) |>
## distinct() |>
## group_by(number) |>
## group_split()
## mm3 <- outer(
## df_key_token,
## jobs_list,
## Vectorize(\(x, y) jaccard(x$word, y$word))
## )
print("So far so good")