#Sentiment Analysis

#Name: Remita Austin
#Faculty Mentor: Dr. Parvathi R

# Start from an empty workspace: remove every object currently defined in
# this environment. Attached packages and options are not affected.
rm(list = ls())

#Load packages
library(dplyr)
library(tidyr)
library(tidytext)
library(textdata)
library(ggplot2)
library(purrr)

# Import the dataset and keep only its first 100 rows for the analysis.
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0 too.
data <- read.csv("finance.csv", stringsAsFactors = FALSE)
data <- head(data, 100)

# Strip "$..." tokens (a literal dollar sign followed by non-whitespace
# characters) from each sentence. The "$" has to be escaped: unescaped it is
# the end-of-string anchor, so the earlier pattern "$S+" could never match
# and removed nothing.
data$striptxt <- gsub("\\$\\S+", "", data$Sentence)

# Tokenise the stripped sentences: one row per token, stored in column "word".
data_stem <- unnest_tokens(select(data, striptxt), word, striptxt)

# Preview the first ten tokens.
head(data_stem, 10)

# Remove stop words: keep only the tokens that do not appear in tidytext's
# stop_words table. The join key is spelled out so the match is explicit and
# dplyr does not have to infer it (and announce the guess in a message).
cleaned_data.Sentence <- data_stem %>%
  anti_join(stop_words, by = "word")
head(cleaned_data.Sentence, 10)

# Horizontal bar chart of the ten most frequent words left after stop-word
# removal.
cleaned_data.Sentence %>%
  count(word, sort = TRUE) %>%
  top_n(10, wt = n) %>% # rank explicitly by the count column (ties are kept)
  mutate(word = reorder(word, n)) %>% # order the bars by frequency
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  # The former xlab(NULL) was overridden by labs(x = ...) right after it, so
  # only the labels that actually take effect are kept.
  labs(
    x = "Unique words",
    y = "Count",
    title = "Unique word counts found in Finance data"
  )

# Preview the bing lexicon, one polarity at a time.
filter(get_sentiments("bing"), sentiment == "positive")
filter(get_sentiments("bing"), sentiment == "negative")

# The AFINN lexicon scores words in a range from -5 to 5.
# Preview the words at a few score levels. `value` is a numeric column, so it
# is compared with numbers instead of strings; this avoids relying on
# implicit number-to-character coercion.
get_sentiments("afinn") %>%
  filter(value == 3)
get_sentiments("afinn") %>%
  filter(value == 5)
get_sentiments("afinn") %>%
  filter(value == -3)

# Use the "bing" lexicon: keep only the words it labels as positive, then
# count how often each of them occurs among the cleaned tokens.
positive_senti <- get_sentiments("bing") %>%
  filter(sentiment == "positive")
cleaned_data.Sentence %>%
  semi_join(positive_senti, by = "word") %>% # explicit join key
  count(word, sort = TRUE)

# Most common positive and negative words: attach the bing sentiment label
# to every cleaned token found in the lexicon, then count each
# (word, sentiment) pair, most frequent first.
# count() on ungrouped input already returns an ungrouped tibble, so no
# trailing ungroup() is needed.
bing_data <- cleaned_data.Sentence %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)
bing_data

# Top ten words per sentiment, drawn as horizontal bars with one panel per
# sentiment.
bing_data %>%
  group_by(sentiment) %>%
  top_n(10, wt = n) %>% # rank explicitly by the count column within each group
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    title = "Finance data",
    y = "Contributing to sentiment",
    x = NULL
  ) +
  coord_flip() +
  theme_bw()

# Score one piece of text with the bing lexicon.
#
# The text is tokenised into words, stop words are dropped, and the remaining
# words are matched against the bing lexicon. Every matched word contributes
# +1 per occurrence when it is positive and -1 per occurrence when negative.
#
# Args:
#   txt: the text to score (the script passes one sentence at a time).
#
# Returns a list with three elements:
#   score:   sum of the per-word scores; 0 when no word matched the lexicon.
#   type:    "Type 1" when no word of the text is in the bing lexicon;
#            "Type 2" when at least one word matched, so a score of 0 then
#            means the positive and negative words cancelled out.
#   txt_tbl: tibble of the matched words with columns word, sentiment, n
#            (occurrences) and score (signed occurrences).
sentiment_bing <- function(txt) {
  txt_tbl <- tibble(text = txt) %>%
    # Remove "$..." tokens. The "$" is escaped because unescaped it is the
    # end-of-string anchor, which made the earlier pattern "$S+" a no-op.
    mutate(stripped_text = gsub("\\$\\S+", "", text)) %>%
    unnest_tokens(word, stripped_text) %>%
    anti_join(stop_words, by = "word") %>% # drop stop words
    inner_join(get_sentiments("bing"), by = "word") %>% # keep lexicon words
    count(word, sentiment, sort = TRUE) %>%
    # Signed contribution of each word: -n for negative, +n for positive.
    mutate(
      score = case_when(
        sentiment == "negative" ~ n * (-1),
        sentiment == "positive" ~ n * 1
      )
    )

  # Plain if/else is used for these scalar decisions; case_when() is meant
  # for vectorised conditions.
  has_match <- nrow(txt_tbl) > 0
  sent_score <- if (has_match) sum(txt_tbl$score) else 0
  zero_type <- if (has_match) "Type 2" else "Type 1"

  list(score = sent_score, type = zero_type, txt_tbl = txt_tbl)
}

# Score every sentence: the result is a list holding one sentiment_bing()
# result (score, type, txt_tbl) per sentence.
data_sent <- lapply(data$Sentence, sentiment_bing)
data_sent

# Collect the per-sentence results into one tibble: a constant source label,
# the numeric score and the "Type 1"/"Type 2" marker.
# map_dbl()/map_chr() return typed vectors with exactly one value per
# sentence, whereas unlist(map(...)) collapses to NULL on empty input.
# The bind_rows() call around a single tibble was a no-op and is dropped.
data_sentiment <- tibble(
  data = "Finance",
  score = map_dbl(data_sent, "score"),
  type = map_chr(data_sent, "type")
)
data_sentiment
head(data_sentiment)

# Histogram of the per-sentence sentiment scores (15 bins), with one facet
# per value of the `data` label.
data_sentiment %>%
  ggplot(aes(x = score, fill = data)) +
  geom_histogram(bins = 15, alpha = 0.6) +
  facet_grid(~data) +
  ggtitle("Distribution of Sentiment scores of finance data") +
  theme_bw()

# Barplot of sentiment type: number of sentences whose score is positive,
# zero (neutral) or negative.
# sum() over a logical vector counts the TRUE values; na.rm = TRUE ignores
# missing comparisons, as length(which(...)) did.
neutral <- sum(data_sentiment$score == 0, na.rm = TRUE)
positive <- sum(data_sentiment$score > 0, na.rm = TRUE)
negative <- sum(data_sentiment$score < 0, na.rm = TRUE)
Sentiment <- c("Positive", "Neutral", "Negative")
Count <- c(positive, neutral, negative)
output <- data.frame(Sentiment, Count)
# Fix the factor levels so the bars keep this order instead of alphabetical.
output$Sentiment <- factor(output$Sentiment, levels = Sentiment)
ggplot(output, aes(x = Sentiment, y = Count)) +
  geom_bar(stat = "identity", aes(fill = Sentiment)) +
  # Report the number of sentences actually scored instead of a hard-coded
  # 100, so the title stays correct if the input has fewer rows.
  ggtitle(
    paste("Barplot of Sentiment type of", nrow(data_sentiment), "sentences")
  )