Business Data Analytics: Practice Session on Understanding and Mining Text

# Identifying Sentiments from a Tweet Corpus [1]

```r
# Read the data
apple <- read.csv(file.choose(), header = TRUE)
str(apple)
```

```r
# Build corpus
library(tm)  # text processing
corpus <- iconv(apple$text, to = "utf-8-mac")  # on macOS
corpus <- iconv(apple$text, to = "utf-8")      # on Windows; run only the line for your OS
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])
```

```r
# Clean text
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])
corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])
corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])
cleanset <- tm_map(corpus, removeWords, stopwords('english'))
inspect(cleanset[1:5])
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])
cleanset <- tm_map(cleanset, removeWords, c('aapl', 'apple'))  # drop the query terms themselves
cleanset <- tm_map(cleanset, content_transformer(gsub), pattern = 'stocks', replacement = 'stock')
cleanset <- tm_map(cleanset, stripWhitespace)
inspect(cleanset[1:5])
```
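Because the documents are tweets, the same `content_transformer()` pattern extends naturally to other Twitter-specific noise. A minimal sketch, assuming you also want @mentions removed (the `removeHandles` helper is our addition, not part of the original script):

```r
# Hypothetical helper: drop @mentions, mirroring the removeURL pattern above
removeHandles <- function(x) gsub('@\\S+', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeHandles))
inspect(cleanset[1:5])
```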

```r
# Term-document matrix
tdm <- TermDocumentMatrix(cleanset)
tdm
tdm <- as.matrix(tdm)
tdm[1:10, 1:20]
```
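On larger corpora the term-document matrix can become too wide to convert to a dense matrix comfortably. tm's `removeSparseTerms()` can prune rare terms first; a sketch, assuming a 0.99 sparsity cutoff suits your data (not part of the original tutorial):

```r
# Optional: keep only terms present in at least ~1% of documents
tdm.small <- removeSparseTerms(TermDocumentMatrix(cleanset), sparse = 0.99)
tdm.small <- as.matrix(tdm.small)
dim(tdm.small)
```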

```r
# Bar plot of frequent words
w <- rowSums(tdm)
w <- subset(w, w >= 25)
barplot(w, las = 2, col = rainbow(50))
```

```r
# Word cloud
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222)
wordcloud(words = names(w), freq = w,
          max.words = 150, random.order = FALSE, min.freq = 5,
          colors = brewer.pal(8, 'Dark2'),
          scale = c(5, 0.3), rot.per = 0.7)

# Interactive word cloud with wordcloud2
library(wordcloud2)
w <- data.frame(word = names(w), freq = w)
wordcloud2(w, size = 0.7, shape = 'triangle',
           rotateRatio = 0.5, minSize = 1)
letterCloud(w, word = "apple", size = 1)
```

```r
# Sentiment analysis packages
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)
```

```r
# Read the file again
apple <- read.csv(file.choose(), header = TRUE)
tweets <- iconv(apple$text, to = 'utf-8-mac')  # on macOS
tweets <- iconv(apple$text, to = 'utf-8')      # on Windows; run only the line for your OS
```

```r
# Obtain sentiment scores
s <- get_nrc_sentiment(tweets)
head(s)
tweets[4]
get_nrc_sentiment('delay')
```
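`get_nrc_sentiment()` returns eight emotion columns plus `negative` and `positive`. Those last two can be folded into a rough per-tweet polarity score; a minimal sketch (the `net` variable is our addition):

```r
# Rough polarity per tweet: positive hits minus negative hits
net <- s$positive - s$negative
head(net)
summary(net)
```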

```r
# Bar plot of sentiment scores
barplot(colSums(s), las = 2, col = rainbow(10),
        ylab = 'Count', main = 'Sentiment Scores for Apple Tweets')
```

# Text Classification Using the KNN Method [2]

```r
# Set seed for reproducible results
set.seed(100)
```

```r
# Packages
library(tm)        # text mining: corpus and document-term matrix
library(class)     # KNN model
library(SnowballC) # word stemming
```

```r
# Read a CSV with two columns: Text and Category
df <- read.csv("knn.csv", sep = ";", header = TRUE)
```

```r
# Create corpus
docs <- Corpus(VectorSource(df$Text))
```

```r
# Clean corpus
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument, language = "english")
inspect(docs[1:5])
```

```r
# Create document-term matrix
dtm <- DocumentTermMatrix(docs)
```
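`DocumentTermMatrix()` defaults to raw term counts. tm also supports TF-IDF weighting, which often suits distance-based classifiers such as KNN; a sketch, assuming you want to experiment with weighted features (our addition, not part of the original walkthrough):

```r
# Alternative: TF-IDF-weighted features instead of raw counts
dtm.tfidf <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))
```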

```r
# Transform the dtm to a matrix, then to a data frame - a df is easier to work with
mat.df <- as.data.frame(data.matrix(dtm), stringsAsFactors = FALSE)
```

```r
# Column-bind the category (known classification)
mat.df <- cbind(mat.df, df$Category)
```

```r
# Change the name of the new column to "category"
colnames(mat.df)[ncol(mat.df)] <- "category"
```

```r
# Split the data by row number into two equal portions
train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * 0.50))
test <- (1:nrow(mat.df))[-train]
```
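A random split can leave one portion short of a category, so it is worth a quick sanity check on the class balance (our addition):

```r
# Sanity check: category distribution in each portion
table(df$Category[train])
table(df$Category[test])
```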

```r
# Isolate the classifier (class labels)
cl <- mat.df[, "category"]
```

```r
# Create the model data by removing the "category" column
modeldata <- mat.df[, !colnames(mat.df) %in% "category"]
```

```r
# Create the model: training set, test set, training-set classifier
knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train])
```
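`class::knn()` defaults to k = 1, which is sensitive to noise. A small sketch of scanning a few values of k and comparing test accuracy (our addition; the loop and variable names are hypothetical):

```r
# Hypothetical: compare test accuracy across several k values
for (k in c(1, 3, 5, 7)) {
  pred <- knn(modeldata[train, ], modeldata[test, ], cl[train], k = k)
  acc  <- mean(as.character(pred) == as.character(cl[test])) * 100
  cat("k =", k, ": accuracy =", round(acc, 1), "%\n")
}
```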

```r
# Confusion matrix
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat
```

```r
# Accuracy: share of correct predictions (the diagonal) among all test cases
(accuracy <- sum(diag(conf.mat)) / length(test) * 100)
```
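Accuracy alone can hide weak performance on a minority class. With predictions on the rows and actuals on the columns, per-class precision and recall fall straight out of the same confusion matrix (our addition):

```r
# Per-class precision (row-wise) and recall (column-wise)
precision <- diag(conf.mat) / rowSums(conf.mat)
recall    <- diag(conf.mat) / colSums(conf.mat)
round(cbind(precision, recall), 3)
```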

```r
# Create a data frame with the test data and predicted category, then export
df.pred <- cbind(knn.pred, modeldata[test, ])
write.table(df.pred, file = "output.csv", sep = ";")
```

References:
[1] https://www.youtube.com/watch?v=otoXeVPhT7Q
[2] http://garonfolo.dk/herbert/2015/05/r-text-classification-using-a-k-nearest-neighbour-model/