• R/O
  • SSH

标签
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

File Info

Rev. 5de46eb7bec4ef014306eb072775d685ee790fa6
大小 12,157 字节
时间 2024-07-06 04:10:48
作者 Lorenzo Isella
Log Message

I rewrote part of the script.

Content

rm(list=ls())

library(tidyverse)
library(janitor)
library(stringr)
library(tidytext)
library(SnowballC)
library(udpipe)
library(furrr)

source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")

semi_jaccard <- function(x, y){

    lx <- length(unique(x))
    ly <- length(unique(y))

    lint <- length(intersect(x,y))
    res <- lint/lx
    return(res)
    
}

#######################################################################

n_cores <- 2

plan(multicore(workers=return_cores(n_cores)))

do_labels <- 1

## see https://cran.r-project.org/web/packages/udpipe/vignettes/udpipe-usecase-postagging-lemmatisation.html

# Download and load the model
model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model$file_model)


jobs <- read_csv("../input/wi_dataset.csv")


jobs_token <- jobs |>
    mutate(description=remove_text_within_brackets(description, " ")) |> 
    mutate(description=remove_special_char_num(description, " ")) |>
    mutate(description=remove_short_words(description, 3)) |> 
    unnest_tokens(word, description) |>
    mutate(word=wordStem(word)) |> 
    anti_join(stop_words) |>
    distinct()


jobs_list <- jobs_token |>
    select(-title) |>
    group_by(id) |>
    group_split()

labels <- read_csv("../input/wi_labels.csv") |>
    select(-starts_with("isco"))


labels_split <- labels |>
    group_by(code) |>
    group_split()


 labels_annotated <- future_map(labels_split, \(x) udpipe_annotate(ud_model, x$description) |> as_tibble() )


labels_keywords <- future_map(labels_annotated, \(x)  keywords_rake(x = x, term = "lemma",
                                                    group = "doc_id", 
                               relevant = x$upos %in% c("NOUN" ## , "ADJ"
                                                        ),
                               ngram_max=1,
                               n_min=1)
   )



labels_keywords_df <- labels_keywords |>
    list_to_df() |> as_tibble() |>
    filter(freq>1 | rake>0) |> 
    group_by(source) |>
    summarise(description=paste(keyword, collapse=" ")) |>
    ungroup() |>
    mutate(source=as.numeric(source)) |> 
    arrange(source)

labels_token_df <- labels_keywords_df |>
    mutate(description=remove_text_within_brackets(description, " ")) |> 
    mutate(description=remove_special_char_num(description, " ")) |>
    mutate(description=remove_short_words(description, 3)) |> 
    unnest_tokens(word, description) |>
    mutate(word=wordStem(word)) |> 
    anti_join(stop_words) |>
    distinct()


labels_list_df <- labels_token_df |>
    ## select(-label) |>
    group_by(source) |>
    group_split()





## labels_token <- labels |>
##     mutate(description=remove_text_within_brackets(description, " ")) |> 
##     mutate(description=remove_special_char_num(description, " ")) |>
##     mutate(description=remove_short_words(description, 3)) |> 
##     unnest_tokens(word, description) |>
##     mutate(word=wordStem(word)) |> 
##     anti_join(stop_words) |>
##     distinct()

## labels_list <- labels_token |>
##     select(-label) |>
##     group_by(code) |>
##     group_split()




mm3 <- outer(
    labels_list_df,
    jobs_list[1:100],
    Vectorize(\(x, y) jaccard(x$word, y$word))
)





## isco_labels <- read_csv("../input/ISCO-08_EN.csv",  col_types = cols(.default = "c")) |>
##     mutate(unit4=str_pad(unit, width=4, side="left", pad="0")) |>
##     clean_names()|>
##     rename("description_isco"="description") |>
##     filter(nchar(sub_major)>=2)


## isco_distinct <- isco_labels |>
##     get_dupes_short(major, sub_major, minor, unit)

## test <- isco_labels |>
##     filter(nchar(unit)<4) 

## wi_labels <- read_csv("../input/wi_labels.csv")

## labels_tot <- wi_labels |>
##     left_join(y=isco_labels, by=c("code"="unit4")) |>
##     reorder_columns()

## hand_keywords <- list()

## hand_keywords[[1]] <- tibble(keywords=c("armed", "forces", "officer", "leadership", "management",
##                         "lieutenant", "admiral", "commodore", "marshal", "brigadier","captain", "colonel", "flying", "general", "major","midshipman", "commander", "cadet", "customs", "inspectors", "squadron", "leader", "custom" ))

## hand_keywords[[2]] <- tibble(keywords=c("Non-commissioned", "armed", "forces" ,"officers",
##                         "Sub-major", "Group", "sergeant", "warrant","Boatswain" ))

## hand_keywords[[3]] <- tibble(keywords=c( "Armed", "forces",  "ranks",  "conscripted",
##                        "officers",  "Airman", "Bombardier","Corporal", "Coxswain",
##                        "Gunner", "Infantryman", "Infantrywoman",
##                        "Paratrooper", "Rifleman", "riflewoman",
##                        "Seaman", "woman"    ))


## hand_keywords[[4]] <- tibble(keywords=c("Legislators", " policies" , "national", "state", "regional" , "local", "governments" , "international", "agencies", "ratify", "amend",
##                        "repeal" ,"laws", "public" ,"rules" ," regulations",
##                        "elected" , "members" , "parliaments", "councils",
##                        "proceedings", "bodies", "administrative", 
##                        "statutory", "constitutional",  "serving" ,
##                        "boards" , "investigating", "constituencies" ,
##                        "community" , "opinion", "negotiating", "senior",
##                        "administrators", "officials","councillor","minister",
##                        "Mayor", "President",  "Secretary", "senator" ))

## hand_keywords[[5]] <- tibble(keywords=c(
##     "Senior", "government", "officials", "advise",  "policy", "interpretation",
##     "implementation" , "government", "policies" , "legislation",
##     "departments",  "agencies" ,"represent",  "country", "intergovernmental",
##     "organizations",  "plan", "organize", "direct", "control" , "evaluate",
##     "municipal",  "local", "regional" , "national", "government",
##     "departments", "boards", "agencies",  "commissions",  "legislation",
##     "policies",   "budgets", "laws" , "regulations", "amendments",  "objectives",
##     "programmes", "procedures" ,  "recommending", "reviewing",  "approving",
##     "middle", "managers",  "presentations",
##     "Ambassador",  "Civil", "service", "commissioner",  "Consul",
##       "Director",  "head",    "constable"
##     ))


## hand_keywords[[6]] <- tibble(keywords=c(
##               "Traditional", "chiefs" , "heads" , "villages",
##               "legislative", "administrative" , "ceremonial",
##               "tasks" , "duties", "ancient", "traditions",
##               "rights" , "responsibilities", "local", "regional" , "national",
##               "authorities",
##               "allocating",  "communal", "land", "resources",
##               "households", "community",  "collecting",
##               "distributing", "surplus", "production",
##               "disputes",  "members",   "disciplining",  "violation",
##               "rules" , "customs", "ceremonial", "duties",
##               "births", "marriages", "deaths", "harvests" , "councils",
##                  "chief",  "head"
                                 
##                              ))


## hand_keywords[[7]] <- tibble(keywords=c(
##                                  "Senior", "officials", "special","interest", "organizations",
##                                  "determine", "formulate" , "direct",
##                                  "implementation" , "policies" , "political","party" , "trade", "unions",
##                                  "employers", "organizations", "industry", "associations",
##                                  "humanitarian" , "charity",
##                                  "sports", "policies", "rules" ,
##                                  "regulations","reviewing", "operations",
##                                  "reporting" ,"boards" , "directors",
##                                  "membership" , "funding", "agencies",
##                                  "developed" ,"budgetary", "control",
##                                  "monitoring", "evaluating",
##                                  "performance",
##                                  "enterprise" , "objectives", "policies",
##                                  "Chairperson",  "Director", "Leader", "secretary","environment", "protection" , "human", "rights" 
##                              ))




## hand_keywords[[8]] <- tibble(keywords=c(
##                                  "Managing", "directors", "chief", "executives", "formulate",
##                                  "review", "policies", "plan",
##                                  "direct", "coordinate", "evaluate", "activities",
##                                  "enterprises" , "organizations", "guidelines", "board",
##       "directors", "governing", "body" , "operations", "results".\nTasks include -\n(a) planning, directing and coordinating the general functioning of an enterprise or organization;\n(b) reviewing the operations and results of the enterprise or organization and reporting to boards of directors and governing bodies;\n(c) determining objectives, strategies, policies and programmes for the enterprise or organization;\n(d) providing overall leadership and management to the enterprise or organization;\n(e) establishing and managing budgets, controlling expenditure and ensuring the efficient use of resources;\n(f) authorizing material, human and financial resources to implement organizational policies and programmes;\n(g) monitoring and evaluating performance of the organization or enterprise against established objectives and policies;\n(h) consulting with senior subordinate staff and reviewing recommendations and reports;\n(i) representing the organization at official occasions and board meetings, in negotiations and at conventions, seminars, public hearings and forums; \n(j) selecting or approving the selection of senior staff;\n(k) ensuring the organization complies with relevant legislation and regulations.\nExamples of the occupations classified here:\n-  Chief executive\n-  Managing director\n-  Regional manager\n\nNote\nRegional managers and other senior managers who coordinate and supervise the activities of subordinate managers who have a diverse range of functional responsibilities are included in Unit Group 1120: Managing Directors and Chief Executives. Managers responsible for specialized functions within a specific geographic area are excluded from this unit group. For example, regional sales managers are classified in Unit Group 1221: Sales and Marketing Managers. Jobs whose principal responsibility is to participate as a member of the board of directors of one or more enterprises or organizations are included in Unit Group 1120: Managing Directors and Chief Executives. Chief executives of government-owned enterprises are included in Unit Group 1120: Managing Directors and Chief Executives.\n
##                                  ))


                                 
## ## this should be hand_keywords[[155]]
## ## hand_keywords[[6]] <- tibble(keywords=c("Pharmaceutical", "technicians" , "assistants",
## ##                         "dispensing", "medicinal", "products",
## ##                         "guidance" , "pharmacist",  "health", "professional",
## ##                         "preparing", "medications",   "compounds",
## ##                         "drugs" , "instructions",  "refill" , "record","keeping", "storage",
## ##                         "security",  "assisting",  "pricing",  "Pharmacology",
## ##                         "aide"
## ## ))



## df_key <- hand_keywords |>
##     list_to_df(source_col_name="number")

## df_key_token <- df_key |> 
##     mutate(keywords=remove_special_char_num(keywords, " ")) |>
##     mutate(keywords=remove_short_words(keywords, 3)) |> 
##     unnest_tokens(word, keywords) |>
##     mutate(word=wordStem(word)) |> 
##     anti_join(stop_words) |>
##     distinct() |>
##     group_by(number) |>
##     group_split()



## mm3 <- outer(
##     df_key_token,
##     jobs_list,
##     Vectorize(\(x, y) jaccard(x$word, y$word))
## )


print("So far so good")