Business Data Analytics: Practice Session on Understanding and Mining Text

# Identifying Sentiments from a Tweet Corpus [1]

```r
# Read the data
apple <- read.csv(file.choose(), header = TRUE)
str(apple)
```

```r
# Build corpus
library(tm)  # text processing
corpus <- iconv(apple$text, to = "utf-8-mac")  # on macOS
corpus <- iconv(apple$text, to = "utf-8")      # on Windows; run only the line for your OS
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])
```

```r
# Clean text
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])
corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])
corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])
cleanset <- tm_map(corpus, removeWords, stopwords('english'))
inspect(cleanset[1:5])
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])
cleanset <- tm_map(cleanset, removeWords, c('aapl', 'apple'))  # drop the query terms themselves
cleanset <- tm_map(cleanset, content_transformer(gsub), pattern = 'stocks', replacement = 'stock')
cleanset <- tm_map(cleanset, stripWhitespace)
inspect(cleanset[1:5])
```
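Because the documents are tweets, the same `content_transformer()` pattern extends naturally to other Twitter-specific noise. A minimal sketch, assuming you also want @mentions removed (the `removeHandles` helper is our addition, not part of the original script):

```r
# Hypothetical helper: drop @mentions, mirroring the removeURL pattern above
removeHandles <- function(x) gsub('@\\S+', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeHandles))
inspect(cleanset[1:5])
```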

```r
# Term-document matrix
tdm <- TermDocumentMatrix(cleanset)
tdm
tdm <- as.matrix(tdm)
tdm[1:10, 1:20]
```
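On larger corpora the term-document matrix can become too wide to convert to a dense matrix comfortably. tm's `removeSparseTerms()` can prune rare terms first; a sketch, assuming a 0.99 sparsity cutoff suits your data (not part of the original tutorial):

```r
# Optional: keep only terms present in at least ~1% of documents
tdm.small <- removeSparseTerms(TermDocumentMatrix(cleanset), sparse = 0.99)
tdm.small <- as.matrix(tdm.small)
dim(tdm.small)
```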

```r
# Bar plot of frequent words
w <- rowSums(tdm)
w <- subset(w, w >= 25)
barplot(w, las = 2, col = rainbow(50))
```

```r
# Word cloud
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222)
wordcloud(words = names(w), freq = w,
          max.words = 150, random.order = FALSE, min.freq = 5,
          colors = brewer.pal(8, 'Dark2'),
          scale = c(5, 0.3), rot.per = 0.7)

# Interactive word cloud with wordcloud2
library(wordcloud2)
w <- data.frame(word = names(w), freq = w)
wordcloud2(w, size = 0.7, shape = 'triangle',
           rotateRatio = 0.5, minSize = 1)
letterCloud(w, word = "apple", size = 1)
```

```r
# Sentiment analysis packages
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)
```

```r
# Read the file again
apple <- read.csv(file.choose(), header = TRUE)
tweets <- iconv(apple$text, to = 'utf-8-mac')  # on macOS
tweets <- iconv(apple$text, to = 'utf-8')      # on Windows; run only the line for your OS
```

```r
# Obtain sentiment scores
s <- get_nrc_sentiment(tweets)
head(s)
tweets[4]
get_nrc_sentiment('delay')
```
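`get_nrc_sentiment()` returns eight emotion columns plus `negative` and `positive`. Those last two can be folded into a rough per-tweet polarity score; a minimal sketch (the `net` variable is our addition):

```r
# Rough polarity per tweet: positive hits minus negative hits
net <- s$positive - s$negative
head(net)
summary(net)
```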

```r
# Bar plot of sentiment scores
barplot(colSums(s), las = 2, col = rainbow(10),
        ylab = 'Count', main = 'Sentiment Scores for Apple Tweets')
```

# Text Classification Using the KNN Method [2]

```r
# Set seed for reproducible results
set.seed(100)
```

```r
# Packages
library(tm)        # text mining: corpus and document-term matrix
library(class)     # KNN model
library(SnowballC) # word stemming
```

```r
# Read a CSV with two columns: Text and Category
df <- read.csv("knn.csv", sep = ";", header = TRUE)
```

```r
# Create corpus
docs <- Corpus(VectorSource(df$Text))
```

```r
# Clean corpus
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument, language = "english")
inspect(docs[1:5])
```

```r
# Create document-term matrix
dtm <- DocumentTermMatrix(docs)
```
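`DocumentTermMatrix()` defaults to raw term counts. tm also supports TF-IDF weighting, which often suits distance-based classifiers such as KNN; a sketch, assuming you want to experiment with weighted features (our addition, not part of the original walkthrough):

```r
# Alternative: TF-IDF-weighted features instead of raw counts
dtm.tfidf <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))
```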

```r
# Transform the dtm to a matrix, then to a data frame - a df is easier to work with
mat.df <- as.data.frame(data.matrix(dtm), stringsAsFactors = FALSE)
```

```r
# Column-bind the category (known classification)
mat.df <- cbind(mat.df, df$Category)
```

```r
# Change the name of the new column to "category"
colnames(mat.df)[ncol(mat.df)] <- "category"
```

```r
# Split the data by row number into two equal portions
train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * 0.50))
test <- (1:nrow(mat.df))[-train]
```
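A random split can leave one portion short of a category, so it is worth a quick sanity check on the class balance (our addition):

```r
# Sanity check: category distribution in each portion
table(df$Category[train])
table(df$Category[test])
```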

```r
# Isolate the classifier (class labels)
cl <- mat.df[, "category"]
```

```r
# Create the model data by removing the "category" column
modeldata <- mat.df[, !colnames(mat.df) %in% "category"]
```

```r
# Create the model: training set, test set, training-set classifier
knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train])
```
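`class::knn()` defaults to k = 1, which is sensitive to noise. A small sketch of scanning a few values of k and comparing test accuracy (our addition; the loop and variable names are hypothetical):

```r
# Hypothetical: compare test accuracy across several k values
for (k in c(1, 3, 5, 7)) {
  pred <- knn(modeldata[train, ], modeldata[test, ], cl[train], k = k)
  acc  <- mean(as.character(pred) == as.character(cl[test])) * 100
  cat("k =", k, ": accuracy =", round(acc, 1), "%\n")
}
```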

```r
# Confusion matrix
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat
```

```r
# Accuracy: share of correct predictions (the diagonal) among all test cases
(accuracy <- sum(diag(conf.mat)) / length(test) * 100)
```
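Accuracy alone can hide weak performance on a minority class. With predictions on the rows and actuals on the columns, per-class precision and recall fall straight out of the same confusion matrix (our addition):

```r
# Per-class precision (row-wise) and recall (column-wise)
precision <- diag(conf.mat) / rowSums(conf.mat)
recall    <- diag(conf.mat) / colSums(conf.mat)
round(cbind(precision, recall), 3)
```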

```r
# Create a data frame with the test data and predicted category, then export
df.pred <- cbind(knn.pred, modeldata[test, ])
write.table(df.pred, file = "output.csv", sep = ";")
```

References:
[1] https://www.youtube.com/watch?v=otoXeVPhT7Q
[2] http://garonfolo.dk/herbert/2015/05/r-text-classification-using-a-k-nearest-neighbour-model/