修订版 | 060281ed69fad21d2b70bcdc347ec3fb771200ca (tree) |
---|---|
时间 | 2017-11-06 07:06:03 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I have further developed the code. Now I can find the word following a
given initial word.
@@ -21,7 +21,7 @@ | ||
21 | 21 | |
22 | 22 | |
23 | 23 | |
24 | -clean_monograms <- function(tidytext, stop_words){ | |
24 | +clean_unigrams <- function(tidytext, stop_words){ | |
25 | 25 | |
26 | 26 | tidytext %>% unnest_tokens(word, text)%>% anti_join(stop_words) |
27 | 27 |
@@ -48,7 +48,7 @@ | ||
48 | 48 | |
49 | 49 | |
50 | 50 | |
51 | -count_monograms <- function(tidytext, stop_words){ | |
51 | +count_unigrams <- function(tidytext, stop_words){ | |
52 | 52 | |
53 | 53 | tidytext %>% unnest_tokens(word, text)%>% anti_join(stop_words) %>% count(word, sort = TRUE) |
54 | 54 |
@@ -87,7 +87,7 @@ | ||
87 | 87 | stop_words <- rbind(stop_words, mystopwords) |
88 | 88 | |
89 | 89 | |
90 | -text_kor_monogram <- clean_text("korea_extraction.txt") %>% clean_monograms(stop_words) | |
90 | +text_kor_unigram <- clean_text("korea_extraction.txt") %>% clean_unigrams(stop_words) | |
91 | 91 | |
92 | 92 | |
93 | 93 | text_kor_bigram <- clean_text("korea_extraction.txt") %>% clean_bigrams(stop_words) |
@@ -100,7 +100,7 @@ | ||
100 | 100 | |
101 | 101 | |
102 | 102 | |
103 | -kor_monogram_count <- clean_text("korea_extraction.txt") %>% count_monograms(stop_words) | |
103 | +kor_unigram_count <- clean_text("korea_extraction.txt") %>% count_unigrams(stop_words) | |
104 | 104 | |
105 | 105 | |
106 | 106 | kor_bigram_count <- clean_text("korea_extraction.txt") %>% count_bigrams(stop_words) |
@@ -110,11 +110,14 @@ | ||
110 | 110 | kor_trigram_count <- clean_text("korea_extraction.txt") %>% count_trigrams(stop_words) |
111 | 111 | |
112 | 112 | |
113 | +##################################################################### | |
114 | +##################################################################### | |
115 | +##################################################################### | |
116 | +##################################################################### | |
113 | 117 | |
114 | 118 | |
115 | 119 | |
116 | - | |
117 | -ceta_monogram_count <- clean_text("ceta_extraction.txt") %>% count_monograms(stop_words) | |
120 | +ceta_unigram_count <- clean_text("ceta_extraction.txt") %>% count_unigrams(stop_words) | |
118 | 121 | |
119 | 122 | |
120 | 123 | ceta_bigram_count <- clean_text("ceta_extraction.txt") %>% count_bigrams(stop_words) |
@@ -124,10 +127,28 @@ | ||
124 | 127 | ceta_trigram_count <- clean_text("ceta_extraction.txt") %>% count_trigrams(stop_words) |
125 | 128 | |
126 | 129 | |
130 | +text_ceta_bigram <- clean_text("ceta_extraction.txt") %>% clean_bigrams(stop_words) | |
127 | 131 | |
128 | 132 | |
129 | 133 | |
130 | 134 | |
131 | 135 | |
132 | 136 | |
137 | +##################################################################### | |
138 | +##################################################################### | |
139 | +##################################################################### | |
140 | +##################################################################### | |
141 | +##################################################################### | |
142 | + | |
143 | + | |
144 | +parties_kor <- text_kor_bigram %>% filter(word1=="parties") %>% count(word1, word2, sort=T) | |
145 | + | |
146 | + | |
147 | + | |
148 | +parties_ceta <- text_ceta_bigram %>% filter(word1=="parties") %>% count(word1, word2, sort=T) | |
149 | + | |
150 | + | |
151 | + | |
152 | + | |
153 | + | |
133 | 154 | print("So far so good") |