# Install dependencies ----
# Only packages that are not yet available are installed, so re-running the
# script does not download and reinstall everything each time.
# The list includes tm, readxl, SnowballC and RColorBrewer because the script
# loads them with library() as well.
cran_packages <- c(
  "tm", "qdap", "dplyr", "wordcloud", "plotrix", "dendextend", "ggplot2",
  "ggthemes", "RWeka", "reshape2", "quanteda", "readxl", "SnowballC",
  "RColorBrewer", "BiocManager", "tidyverse", "tidytext", "igraph", "ggraph",
  "sentimentr", "syuzhet"
)

# system.file() returns "" for a package that is not installed; unlike
# requireNamespace() it does not load the package.
is_installed <- vapply(
  cran_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (!all(is_installed)) {
  install.packages(cran_packages[!is_installed])
}

# Rgraphviz is installed through BiocManager (Bioconductor), not
# install.packages().
if (!nzchar(system.file(package = "Rgraphviz"))) {
  BiocManager::install("Rgraphviz")
}

library("tm")
library("qdap")
library("dplyr")
library("wordcloud")
library("plotrix")
library("dendextend")
library("ggplot2")
library("ggthemes")
library("RWeka")
library("reshape2")
library("quanteda")
library(readxl)
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("Rgraphviz")
library(tidyverse)
library(tidytext)
library(igraph)
library(ggraph)

# Load the comments ----
# col_names = FALSE: the first row of the sheet is read as data, not as a
# header.
comments <- read_excel("dyson comments.xlsx", col_names = FALSE)

# VectorSource() creates one document per element of its input. A data frame
# has one element per *column*, so passing `comments` directly would collapse
# all comments into a single document. Pass the text column instead so that
# each comment becomes its own document.
# NOTE(review): assumes the comments are in the first column of the sheet --
# confirm against the workbook.
comment_text <- as.character(comments[[1]])
comment_text <- comment_text[!is.na(comment_text)] # skip empty cells
corpus_review <- Corpus(VectorSource(comment_text))


# Clean the corpus ----

# Transformation that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Transformation that replaces the whole-word phrase `phrase` with `token`.
# The word boundaries keep the neighbouring words separate and also match the
# phrase at the very end of a comment.
join_phrase <- content_transformer(function(x, phrase, token) {
  gsub(paste0("\\b", phrase, "\\b"), token, x)
})

corpus_review <- tm_map(corpus_review, toSpace, "/")

# Base functions are wrapped in content_transformer() so that tm_map() keeps
# the documents as tm text documents whatever the corpus class is.
corpus_review <- tm_map(corpus_review, content_transformer(tolower))

# Turn separators that would otherwise fuse two words into spaces before the
# remaining punctuation is stripped.
for (symbol in c("@", "'", "°", "’")) {
  corpus_review <- tm_map(corpus_review, toSpace, symbol)
}
corpus_review <- tm_map(corpus_review, removePunctuation)
corpus_review <- tm_map(corpus_review, removeNumbers)

# NOTE(review): this removes the Italian stop-word list, while stemDocument()
# below uses its default language -- confirm which language the comments are
# written in.
corpus_review <- tm_map(corpus_review, removeWords, stopwords("it"))

# Remove the product-related words listed here.
corpus_review <- tm_map(
  corpus_review, removeWords, c("dyson", "dryer", "phon", "supersonic")
)

# Join selected two-word phrases into a single token so they are counted as
# one term. This runs after removePunctuation so the underscore survives.
corpus_review <- tm_map(corpus_review, join_phrase, "less heat", "less_heat")
# corpus_review <- tm_map(corpus_review, join_phrase, "long hair", "long_hair")

corpus_review <- tm_map(corpus_review, stripWhitespace)
corpus_review <- tm_map(corpus_review, stemDocument)

# Print the text of every document to inspect the corpus.
# as.character() on the corpus object itself deparses its internal list
# rather than returning the text, so convert document by document.
cleaned_text <- vapply(
  seq_len(length(corpus_review)),
  function(j) paste(as.character(corpus_review[[j]]), collapse = "\n"),
  character(1)
)
writeLines(cleaned_text)


# Build the term-document matrix ----
# Despite its name, `dtm` holds a TermDocumentMatrix: it is created with
# TermDocumentMatrix(), so terms are the rows and documents the columns.
dtm <- TermDocumentMatrix(corpus_review)

# dtm <- DocumentTermMatrix(corpus_review)

# Remove sparse terms, using 0.999 as the maximal allowed sparsity.
dtm <- removeSparseTerms(dtm, 0.999)

# Terms that occur at least 30 times in the whole corpus.
# (stats::frequency() is for time series and does not count terms.)
term_freq <- findFreqTerms(dtm, lowfreq = 30)


# Term frequencies ----
# Sum each term's counts over all documents and sort from most to least
# frequent; `d` holds the result as a word/frequency table.
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Bar plot of the ten most frequent words. head() returns fewer rows when the
# table has fewer than ten terms, whereas d[1:10, ] would pad with NA rows.
top_words <- head(d, 10)
barplot(
  top_words$freq,
  las = 3, # axis labels perpendicular to the axis
  names.arg = top_words$word,
  col = rainbow(50),
  main = "Most frequent words",
  ylab = "Word frequencies"
)

# Word cloud of up to 200 words; the seed makes the layout reproducible.
set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)


# Word associations ----
# Word associations for the positive comments, where `dtm` is the
# term-document matrix of the positive comments (file dyson).

# Terms correlated (>= 0.10) with a few hand-picked words. print() makes the
# result visible when the script is run through source() as well.
print(findAssocs(dtm, terms = c("hair", "much", "use"), corlimit = 0.10))

# Associations (>= 0.25) for every term that occurs at least 20 times.
a <- findAssocs(dtm, terms = findFreqTerms(dtm, lowfreq = 20), corlimit = 0.25)

# Correlation graph of the terms occurring at least 10 times. The argument is
# spelled `terms` in full instead of relying on partial argument matching.
freq.term <- findFreqTerms(dtm, lowfreq = 10)
plot(dtm, terms = freq.term, corThreshold = 0.25)

# Word cluster ----
# Complete-linkage hierarchical clustering of the terms that occur more than
# four times, using the Euclidean distance between their total frequencies.
v <- sort(v, decreasing = TRUE)
v1 <- v[v > 4]
freq_dist <- dist(v1, method = "euclidean")
hc <- hclust(d = freq_dist, method = "complete")
plot(hc)