Business Data Analytics. Practice Session Understanding and Mining Text

# Business Data Analytics. Practice Session: Understanding and Mining Text
#
# Two independent exercises:
#   1. Word frequencies, word clouds and NRC sentiment scores for a tweet
#      corpus [1].
#   2. Text classification with a k-nearest-neighbour model [2].
# References are listed at the bottom of the script.

# Identifying Sentiments from Tweet Corpus [1] ----

# Read file (the code below uses its `text` column)
apple <- read.csv(file.choose(), header = TRUE)
str(apple)

# Build corpus
library(tm) # for text processing

# Convert once to UTF-8. The original converted to "utf-8-mac" and then
# immediately overwrote the result with a "utf-8" conversion, so only the
# second call ever took effect; "utf-8-mac" is also not an encoding name
# that every iconv build accepts (NOTE(review): macOS-specific -- confirm).
corpus <- iconv(apple$text, to = "UTF-8")
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])

# Clean text
# Plain character functions (tolower, gsub, user helpers) are wrapped in
# content_transformer() so that tm_map() keeps returning corpus documents.
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])

corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])

corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])

cleanset <- tm_map(corpus, removeWords, stopwords("english"))
inspect(cleanset[1:5])

# Punctuation and digits were stripped above, so by now a URL has collapsed
# into a single run of letters starting with "http"; that run is what this
# pattern deletes.
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])

# Drop the terms every tweet shares and merge "stocks" into "stock"
cleanset <- tm_map(cleanset, removeWords, c("aapl", "apple"))
cleanset <- tm_map(
  cleanset,
  content_transformer(gsub),
  pattern = "stocks",
  replacement = "stock"
)
cleanset <- tm_map(cleanset, stripWhitespace)
inspect(cleanset[1:5])

# Term document matrix (rows = terms, columns = documents)
tdm <- TermDocumentMatrix(cleanset)
tdm
tdm <- as.matrix(tdm)
tdm[1:10, 1:20]

# Bar plot -- wordBar: terms occurring at least 25 times
w <- rowSums(tdm)
w <- w[w >= 25]
barplot(w, las = 2, col = rainbow(50))

# Word cloud
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222) # fixed seed so the layout is reproducible
wordcloud(
  words = names(w),
  freq = w,
  max.words = 150,
  random.order = FALSE,
  min.freq = 5,
  colors = brewer.pal(8, "Dark2"),
  scale = c(5, 0.3),
  rot.per = 0.7
)

library(wordcloud2)
# wordcloud2 takes a data frame of words and their frequencies
w <- data.frame(names(w), w)
colnames(w) <- c("word", "freq")
wordcloud2(
  w,
  size = 0.7,
  shape = "triangle",
  rotateRatio = 0.5,
  minSize = 1
)
letterCloud(w, word = "apple", size = 1)

# Sentiment analysis ----
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)

# Read file
apple <- read.csv(file.choose(), header = TRUE)
# Single UTF-8 conversion, for the same reason as in the corpus section
tweets <- iconv(apple$text, to = "UTF-8")

# Obtain sentiment scores (one row per tweet)
s <- get_nrc_sentiment(tweets)
head(s)
tweets[4]
get_nrc_sentiment("delay")

# Bar plot -- SentiBar: column totals of the sentiment scores
barplot(
  colSums(s),
  las = 2,
  col = rainbow(10),
  ylab = "Count",
  main = "Sentiment Scores for Apple Tweets"
)

# Text Classification using KNN Method [2] ----

# Set seed for reproducible results
set.seed(100)

# Packages
library(tm)        # Text mining: Corpus and Document Term Matrix
library(class)     # KNN model
library(SnowballC) # Stemming words

# Read csv with two columns: Text and Category
df <- read.csv("knn.csv", sep = ";", header = TRUE)

# Create corpus
docs <- Corpus(VectorSource(df$Text))

# Clean corpus
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument, language = "english")
inspect(docs[1:5])

# Create dtm (rows = documents, columns = terms)
dtm <- DocumentTermMatrix(docs)

# Transform dtm to matrix to data frame - df is easier to work with
mat.df <- as.data.frame(as.matrix(dtm), stringsAsFactors = FALSE)

# Column bind category (known classification) as the LAST column
mat.df <- cbind(mat.df, df$Category)

# Change name of new column to "category"
label_col <- ncol(mat.df)
colnames(mat.df)[label_col] <- "category"

# Split data by row number into two equal portions
train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * .50))
test <- setdiff(seq_len(nrow(mat.df)), train)

# Isolate classifier. The label column is selected by position, so a corpus
# term that happens to be spelled "category" cannot be mistaken for it.
# A factor keeps the full set of classes in both halves of the split, which
# keeps the confusion matrix square.
cl <- factor(mat.df[[label_col]])

# Create model data: every column except the label
modeldata <- mat.df[, -label_col, drop = FALSE]

# Create model: training set, test set, training set classifier
# (k = 1 is knn()'s default, spelled out here)
knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train], k = 1)

# Confusion matrix
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat

# Accuracy in percent: share of test documents whose prediction matches the
# known category (compared directly, not via the table diagonal)
(accuracy <- mean(as.character(knn.pred) == as.character(cl[test])) * 100)

# Create data frame with test data and predicted category
df.pred <- cbind(knn.pred, modeldata[test, ])
write.table(df.pred, file = "output.csv", sep = ";")

# References:
# [1] https://www.youtube.com/watch?v=otoXeVPhT7Q
# [2] http://garonfolo.dk/herbert/2015/05/r-text-classification-using-a-k-nearest-neighbour-model/

Business Data Analytics. Practice Session Understanding and Mining Text

Details

Download

Copyright

We respect the copyrights and intellectual property rights of all users. All uploaded documents are either original works of the uploader or authorized works of the rightful owners.

Support