File Info

Rev. 362b496852abb158f51e9501936754431c20cdd7
Size 5,775 bytes
Date 2023-11-10 22:35:56
Author Lorenzo Isella
Log Message

I added a better plot of the LDA, and I now determine the optimal number of topics first.

Content

rm(list=ls())

library(quanteda)

require(quanteda.textstats)
require(quanteda.textplots)
library(tidytext)
library(readtext)
library(tidyverse)
library(quanteda.sentiment)
library(seededlda)
library(stm)
library(ldatuning)



source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")




data(stop_words)



## df <- readtext("speech2023.txt") 

df <- readtext("./*txt",
               docvarsfrom = "filenames"
               ## docvarnames = c("unit", "context", "year", "language", "party"),
               ## dvsep = "_",
               ## encoding = "ISO-8859-1"
               )


mycorpus <- corpus(df)
summary(mycorpus)
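
## Quick sanity checks (my addition): ndoc() and docnames() are standard
## quanteda accessors for the number and names of the documents in the corpus.
ndoc(mycorpus)
docnames(mycorpus)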

## A better way to generate tokens, taken from
## https://tutorials.quanteda.io/multilingual/english-german/


toks <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_url = TRUE, remove_symbols = TRUE) |>
    tokens_remove(pattern = stopwords("en", source = "marimo")) |>
    tokens_keep(pattern = "^[a-zA-Z]+$", valuetype = "regex")
## No need for tokens_tolower(): dfm() lowercases by default and
## tokens_compound() is case insensitive.
## Whether to stem is an open question; if desired, append
## |> tokens_wordstem() to the pipeline above.


dfm_mat <- dfm(toks) ### by default the dfm function moves everything to lowercase

bigrams <- tokens_ngrams(toks, 2)

toks_eu_bigram <- tokens_compound(toks, pattern = phrase("Europe *")) ## by default this is case insensitive.

toks_eu_bigram_select <- tokens_select(toks_eu_bigram, pattern = phrase("Europe_*"))

tf <- topfeatures(dfm_mat, 10)
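
## A visual companion to topfeatures() (my addition): textplot_wordcloud()
## from quanteda.textplots draws a word cloud of the same dfm, here capped
## at 100 words.
textplot_wordcloud(dfm_mat, max_words = 100)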

tstat_key <- textstat_keyness(dfm_mat,
                              target = "speech2023.txt")


df_keyness <- tstat_key |>
    as_tibble()

gpl <- textplot_keyness(tstat_key, n=10)



ggsave("vdl-keyness.pdf", gpl, width=8,height=8)

## sentiment analysis

## Various methodologies to do sentiment analysis

sent_vdl <- mycorpus  |> 
  textstat_polarity(data_dictionary_LSD2015)

sent_vdl


valence(data_dictionary_LSD2015) <- list(positive = 1, negative = -1, 
                                         neg_negative = 1, neg_positive = -1)

sent_vdl2 <- textstat_valence(toks, data_dictionary_LSD2015)

sent_vdl2
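
## To compare the two sentiment measures per document (a sketch, my addition):
## both textstat_polarity() and textstat_valence() return a doc_id column plus
## a sentiment score, so a join lines them up.
left_join(sent_vdl, sent_vdl2, by = "doc_id",
          suffix = c("_polarity", "_valence"))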





## see https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html

## determining the optimal number of topics

result <- FindTopicsNumber(
  dfm_mat,
  topics = seq(from = 2, to = 15, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

FindTopicsNumber_plot(result)

## 8-10 is a reasonable number of topics
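
## To keep the diagnostics plot (my addition; the file name is arbitrary),
## open a pdf device around the call, assuming FindTopicsNumber_plot()
## prints to the active graphics device.
pdf("lda-topics-number.pdf", width = 8, height = 8)
FindTopicsNumber_plot(result)
dev.off()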




### simple LDA topic modeling

## See https://koheiw.github.io/seededlda/articles/pkgdown/basic.html


## For some reason the model does not run on the trimmed dfm, so use the full dfm_mat instead.

## dfmt <- dfm_mat |> 
##     dfm_remove("*@*") |>
##     dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")



lda <- textmodel_lda(dfm_mat, k = 9, verbose = TRUE)

knitr::kable(terms(lda))
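
## The fitted model also carries per-document information (my addition):
## topics() from seededlda returns the most likely topic of each document,
## and lda$theta holds the full document-topic probability matrix.
topics(lda)
head(lda$theta)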




## top 10 terms per topic

top10 <- terms(lda, n = 10) |>
    as_tibble() |>
    pivot_longer(cols=starts_with("t"),
                 names_to="topic", values_to="word")



phi <- lda$phi |>
    as_tibble(rownames="topic")  |>
    pivot_longer(cols=c(-topic))
    

top10phi <- top10 |>
    left_join(y = phi, by = c("topic", "word" = "name")) ## finally a tibble I can work with

top10phi

## sort_facets() is a helper from the sourced stat_lib.R; it generates the
## category2 ordering column used for per-facet sorting in the plot below.
dd2 <- sort_facets(top10phi, topic, word, category2, value)

gpl <- ggplot(dd2, aes(y = category2, x = value)) +
    geom_bar(stat = "identity") +
    facet_wrap(. ~ topic, scales = "free_y", nrow = 3) +
    scale_y_discrete(labels = dd2$word, breaks = dd2$category2) +
    xlab("Probability") +
    ylab(NULL)



ggsave("lda-topic-keywords.pdf", gpl, width=8,height=8)




### now another approach based on stm

## see https://rstudio-pubs-static.s3.amazonaws.com/406792_9287b832dd9e413f97243628cb2f7ddb.html

## convert the dfm to a format suitable for stm.

dfm2stm <- convert(dfm_mat, to = "stm")



model.stm <- stm(dfm2stm$documents, dfm2stm$vocab, K = 9, data = dfm2stm$meta,
                 init.type = "Spectral") 

## I make the model tidy.
## See  https://juliasilge.com/blog/sherlock-holmes-stm/

stm_tidy <- tidy(model.stm)

gpl <- stm_tidy  |> 
    group_by(topic)  |> 
    top_n(10, beta)  |> 
    ungroup()  |> 
    mutate(topic = paste0("Topic ", topic),
           term = reorder_within(term, beta, topic))  |> 
    ggplot(aes(term, beta, fill = as.factor(topic))) +
    geom_col(alpha = 0.8, show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    labs(x = NULL, y = expression(beta),
         title = "Highest word probabilities for each topic",
         subtitle = "Different words are associated with different topics")


ggsave("stm-keywords.pdf", gpl, width=8,height=8)


######################################
## ## let us see the tidy approach

## dfm_tidy <- tidy(dfm_mat)

## corpus_tidy <- tidy(mycorpus)



## df_tidy <- list.files(pattern = "*.txt") %>% 
##         map_chr(~ read_file(.)) %>% 
##         tibble(text = .)

## df_clean1 <- clean_text("speech2023.txt") |>
##     mutate(origin="2023 speech")

## df_clean2 <- clean_text("speech2022.txt") |>
##     mutate(origin="2022 speech")


## df_clean <- rbind(df_clean1, df_clean2) |>
##     mutate(text=tolower(text))

## df_uni <- df_clean |>
##     clean_unigrams(stop_words)


## df_big <- df_clean |>
##     clean_bigrams(stop_words) |>
##     filter(word1=="europe")

## df_big_count <- df_clean |>
##     group_by(origin) |>
##     count_bigrams(stop_words)  |>
##     ungroup() |> 
##     arrange(origin, desc(n)) |>
##     filter(word1=="europe")




print("So far so good")