Business Data Analytics. Practice Session Understanding and Mining Text

Business Data Analytics. Practice Session Understanding and Mining Text

# Business Data Analytics. Practice Session: Understanding and Mining Text
#
# Two independent exercises:
#   1. Identifying sentiments from a tweet corpus [1]: clean the tweets, build
#      a term-document matrix, plot term frequencies / word clouds and score
#      the tweets with the NRC sentiment lexicon.
#   2. Text classification using the KNN method [2]: build a document-term
#      matrix from a labelled CSV, split it 50/50 and classify the test half.
#
# Inputs : a CSV with a `text` column (chosen interactively, twice) and
#          "knn.csv" (semicolon separated, columns `Text` and `Category`).
# Outputs: plots on the active graphics device and "output.csv".

# Identifying Sentiments from Tweet Corpus [1] ----

# Read file
apple <- read.csv(file.choose(), header = TRUE)
str(apple)

# Build corpus
library(tm) # for text processing

# "utf-8-mac" is only a valid iconv target on macOS, so choose the target
# encoding by platform instead of running both conversions back to back.
target_encoding <- if (identical(Sys.info()[["sysname"]], "Darwin")) {
  "utf-8-mac" # for mac
} else {
  "utf-8" # for windows (and other platforms)
}
corpus <- iconv(apple$text, to = target_encoding)
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])

# Clean text
# Plain character functions are wrapped in content_transformer() so the
# corpus structure is preserved regardless of the corpus class.
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])

corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])

corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])

cleanset <- tm_map(corpus, removeWords, stopwords("english"))
inspect(cleanset[1:5])

# Punctuation has already been stripped at this point, so a URL is a single
# alphanumeric run starting with "http".
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
inspect(cleanset[1:5])

# Drop the search terms themselves and merge "stocks" into "stock"
cleanset <- tm_map(cleanset, removeWords, c("aapl", "apple"))
cleanset <- tm_map(
  cleanset,
  content_transformer(gsub),
  pattern = "stocks",
  replacement = "stock"
)
cleanset <- tm_map(cleanset, stripWhitespace)
inspect(cleanset[1:5])

# Term document matrix
tdm <- TermDocumentMatrix(cleanset)
tdm
tdm <- as.matrix(tdm)
# Preview the top-left corner; bounded so small corpora do not error
tdm[seq_len(min(10, nrow(tdm))), seq_len(min(20, ncol(tdm)))]

# Bar plot -- wordBar
w <- rowSums(tdm)
w <- subset(w, w >= 25) # keep terms occurring at least 25 times
barplot(w, las = 2, col = rainbow(50))

# Word cloud
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222) # reproducible word placement
wordcloud(
  words = names(w),
  freq = w,
  max.words = 150,
  random.order = FALSE,
  min.freq = 5,
  colors = brewer.pal(8, "Dark2"),
  scale = c(5, 0.3),
  rot.per = 0.7
)

library(wordcloud2)
w <- data.frame(names(w), w)
colnames(w) <- c("word", "freq")
wordcloud2(w, size = 0.7, shape = "triangle", rotateRatio = 0.5, minSize = 1)
letterCloud(w, word = "apple", size = 1)

# Sentiment analysis
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)

# Read file
apple <- read.csv(file.choose(), header = TRUE)
tweets <- iconv(apple$text, to = target_encoding)

# Obtain sentiment scores
s <- get_nrc_sentiment(tweets)
head(s)
tweets[4]
get_nrc_sentiment("delay")

# Bar plot -- SentiBar
barplot(
  colSums(s),
  las = 2,
  col = rainbow(10),
  ylab = "Count",
  main = "Sentiment Scores for Apple Tweets"
)

# Text Classification using KNN Method [2] ----

# Set seed for reproducible results
set.seed(100)

# Packages
library(tm)        # Text mining: Corpus and Document Term Matrix
library(class)     # KNN model
library(SnowballC) # Stemming words

# Read csv with two columns: text and category
df <- read.csv("knn.csv", sep = ";", header = TRUE)

# Create corpus
docs <- Corpus(VectorSource(df$Text))

# Clean corpus
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument, language = "english")
inspect(docs[1:5])

# Create dtm
dtm <- DocumentTermMatrix(docs)

# Transform dtm to matrix to data frame - df is easier to work with
mat.df <- as.data.frame(as.matrix(dtm), stringsAsFactors = FALSE)

# Column bind category (known classification)
mat.df <- cbind(mat.df, df$Category)

# Change name of new column to "category"
colnames(mat.df)[ncol(mat.df)] <- "category"

# Split data by row number into two equal portions
train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * .50))
test <- setdiff(seq_len(nrow(mat.df)), train)

# Isolate classifier. Stored as a factor so predictions and actual labels
# share one level set and the confusion matrix below is always square.
cl <- factor(mat.df[, "category"])

# Create model data and remove "category"
# drop = FALSE keeps a data frame even when only one term column remains
modeldata <- mat.df[, !colnames(mat.df) %in% "category", drop = FALSE]

# Create model: training set, test set, training set classifier
knn.pred <- knn(modeldata[train, , drop = FALSE],
                modeldata[test, , drop = FALSE],
                cl[train])

# Confusion matrix
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat

# Accuracy (percentage of test documents on the confusion-matrix diagonal)
(accuracy <- sum(diag(conf.mat)) / length(test) * 100)

# Create data frame with test data and predicted category
df.pred <- cbind(knn.pred, modeldata[test, , drop = FALSE])
write.table(df.pred, file = "output.csv", sep = ";")

# References:
# [1] https://www.youtube.com/watch?v=otoXeVPhT7Q
# [2] http://garonfolo.dk/herbert/2015/05/r-text-classification-using-a-k-nearest-neighbour-model/

View Full Text

Details

  • File Type
    pdf
  • Upload Time
    -
  • Content Languages
    English
  • Upload User
    Anonymous/Not logged-in
  • File Pages
    5 Pages
  • File Size
    -

Download

Channel Download Status
Express Download Enable

Copyright

We respect the copyrights and intellectual property rights of all users. All uploaded documents are either original works of the uploader or authorized works of the rightful owners.

  • Not to be reproduced or distributed without explicit permission.
  • Not used for commercial purposes outside of approved use cases.
  • Not used to infringe on the rights of the original creators.
  • If you believe any content infringes your copyright, please contact us immediately.

Support

For help with questions, suggestions, or problems, please contact us