修订版 | 060281ed69fad21d2b70bcdc347ec3fb771200ca (tree) |
---|---|
时间 | 2017-11-06 07:06:03 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
I have further developed the code. Now I can find the word following a
given initial word.
@@ -21,7 +21,7 @@ | ||
21 | 21 | |
22 | 22 | |
23 | 23 | |
24 | -clean_monograms <- function(tidytext, stop_words){ | |
24 | +clean_unigrams <- function(tidytext, stop_words){ | |
25 | 25 | |
26 | 26 | tidytext %>% unnest_tokens(word, text)%>% anti_join(stop_words) |
27 | 27 |
@@ -48,7 +48,7 @@ | ||
48 | 48 | |
49 | 49 | |
50 | 50 | |
51 | -count_monograms <- function(tidytext, stop_words){ | |
51 | +count_unigrams <- function(tidytext, stop_words){ | |
52 | 52 | |
53 | 53 | tidytext %>% unnest_tokens(word, text)%>% anti_join(stop_words) %>% count(word, sort = TRUE) |
54 | 54 |
@@ -87,7 +87,7 @@ | ||
87 | 87 | stop_words <- rbind(stop_words, mystopwords) |
88 | 88 | |
89 | 89 | |
90 | -text_kor_monogram <- clean_text("korea_extraction.txt") %>% clean_monograms(stop_words) | |
90 | +text_kor_unigram <- clean_text("korea_extraction.txt") %>% clean_unigrams(stop_words) | |
91 | 91 | |
92 | 92 | |
93 | 93 | text_kor_bigram <- clean_text("korea_extraction.txt") %>% clean_bigrams(stop_words) |
@@ -100,7 +100,7 @@ | ||
100 | 100 | |
101 | 101 | |
102 | 102 | |
103 | -kor_monogram_count <- clean_text("korea_extraction.txt") %>% count_monograms(stop_words) | |
103 | +kor_unigram_count <- clean_text("korea_extraction.txt") %>% count_unigrams(stop_words) | |
104 | 104 | |
105 | 105 | |
106 | 106 | kor_bigram_count <- clean_text("korea_extraction.txt") %>% count_bigrams(stop_words) |
@@ -110,11 +110,14 @@ | ||
110 | 110 | kor_trigram_count <- clean_text("korea_extraction.txt") %>% count_trigrams(stop_words) |
111 | 111 | |
112 | 112 | |
113 | +##################################################################### | |
114 | +##################################################################### | |
115 | +##################################################################### | |
116 | +##################################################################### | |
113 | 117 | |
114 | 118 | |
115 | 119 | |
116 | - | |
117 | -ceta_monogram_count <- clean_text("ceta_extraction.txt") %>% count_monograms(stop_words) | |
120 | +ceta_unigram_count <- clean_text("ceta_extraction.txt") %>% count_unigrams(stop_words) | |
118 | 121 | |
119 | 122 | |
120 | 123 | ceta_bigram_count <- clean_text("ceta_extraction.txt") %>% count_bigrams(stop_words) |
@@ -124,10 +127,28 @@ | ||
124 | 127 | ceta_trigram_count <- clean_text("ceta_extraction.txt") %>% count_trigrams(stop_words) |
125 | 128 | |
126 | 129 | |
130 | +text_ceta_bigram <- clean_text("ceta_extraction.txt") %>% clean_bigrams(stop_words) | |
127 | 131 | |
128 | 132 | |
129 | 133 | |
130 | 134 | |
131 | 135 | |
132 | 136 | |
137 | +##################################################################### | |
138 | +##################################################################### | |
139 | +##################################################################### | |
140 | +##################################################################### | |
141 | +##################################################################### | |
142 | + | |
143 | + | |
144 | +parties_kor <- text_kor_bigram %>% filter(word1=="parties") %>% count(word1, word2, sort=T) | |
145 | + | |
146 | + | |
147 | + | |
148 | +parties_ceta <- text_ceta_bigram %>% filter(word1=="parties") %>% count(word1, word2, sort=T) | |
149 | + | |
150 | + | |
151 | + | |
152 | + | |
153 | + | |
133 | 154 | print("So far so good") |