#------------------------------------------------------------------------------#
#              Introducción a la programación en R                            #                                    
#                         Clase 7                                              # 
#                      Live coding                                               #
#------------------------------------------------------------------------------#


library(quanteda)
library(stringr)
library(rtweet)
library(dplyr)
library(tidyr)
library(ggplot2)


# Load the saved tweet data sets; the code below expects them to provide
# the data frames `tweets_fa` and `tweets_pn`, each with a `full_text` column.
load("Clase7/Material/tweets_fa.RData")
load("Clase7/Material/tweets_pn.RData")

## 1. Initial cleaning of the tweets

# Strip Twitter-specific noise from tweet texts.
#
# texto: character vector of raw tweet texts.
# Returns a character vector of the same length with URLs, the retweet
# marker ("RT @user: "), hashtags and @mentions removed.
limpiar_tweets <- function(texto) {
  # Remove each URL in a single pass (scheme plus everything up to the next
  # whitespace) so that no "://t.co/..." fragment is left behind.
  texto <- str_replace_all(texto, "https?://\\S+", "")
  # Remove the retweet marker (first occurrence only). \\w also covers the
  # digits and underscores that screen names may contain.
  texto <- str_replace(texto, "RT @\\w*: ", "")
  # Remove hashtags, including those with digits or accented letters.
  texto <- str_replace_all(texto, "#\\w*", "")
  # Remove references to other screen names.
  texto <- str_replace_all(texto, "@\\w*", "")
  texto
}

## Frente Amplio
tweets_fa$full_text <- limpiar_tweets(tweets_fa$full_text)

## Partido Nacional
tweets_pn$full_text <- limpiar_tweets(tweets_pn$full_text)


## Build and clean the document-feature matrices (one per party)

# Tokenize the texts (dropping punctuation and numbers), build a lower-cased
# dfm, then remove Spanish stopwords and features shorter than 3 characters.
#
# textos: character vector of tweet texts.
# Returns the cleaned quanteda dfm.
armar_dfm <- function(textos) {
  fichas <- quanteda::tokens(textos,
                             remove_punct = TRUE,
                             remove_numbers = TRUE)
  matriz <- quanteda::dfm(fichas, tolower = TRUE, verbose = FALSE)
  quanteda::dfm_remove(matriz,
                       pattern = c(quanteda::stopwords("spanish")),
                       min_nchar = 3)
}

# Frente Amplio
dfm_fa <- armar_dfm(tweets_fa$full_text)

# Partido Nacional
dfm_pn <- armar_dfm(tweets_pn$full_text)



### Build a dictionary for the topics of interest.
### Each key is a topic; its values are the terms counted under that topic.
### Accented and unaccented spellings are listed separately on purpose.
midic <- dictionary(
  list(
    social = c(
      "social", "politica social", "politicas sociales",
      "plan social", "planes sociales",
      "sociedad", "salud", "educación", "educacion"
    ),
    economia = c(
      "economía", "empleo", "desempleo", "crisis",
      "economia", "fiscal", "dolar*", "ajuste"
    ),
    seguridad = c(
      "seguridad", "robo", "reforma", "delincuente"
    )
  )
)



### Apply the dictionary to each dfm and stack the per-party results

# Count dictionary matches per document for one party.
#
# dfm_partido: dfm of that party's tweets.
# partido:     label stored in the `partido` column.
# Returns a data.frame with one row per document, one column per dictionary
# key, a "no_aparece" column for unmatched features, and the party label.
contar_temas <- function(dfm_partido, partido) {
  conteos <- dfm_lookup(dfm_partido, dictionary = midic, nomatch = "no_aparece")
  conteos <- convert(conteos, to = "data.frame")
  conteos$partido <- partido
  conteos
}

midic_result_fa <- contar_temas(dfm_fa, "Frente Amplio")
midic_result_pn <- contar_temas(dfm_pn, "Partido Nacional")

# One combined table with the documents of both parties.
midic_result <- rbind(midic_result_fa, midic_result_pn)


## Summary table: total matches per party and topic, plus each topic's
## share (in %) of the matched terms within its party.
tabla <- midic_result %>%
  tidyr::pivot_longer(
    cols = c(social, economia, seguridad, no_aparece),
    names_to = "name",
    values_to = "value"
  ) %>%
  # Unmatched features are excluded from the totals and the percentages.
  filter(name != "no_aparece") %>%
  group_by(partido, name) %>%
  summarise(N = sum(value)) %>%
  # summarise() drops the `name` grouping level, so sum(N) is the party total.
  mutate(Prop = round((N / sum(N)) * 100, 1))


##Ejemplo con base bibliográfica para ver co-ocurrencia de palabras claves. 

library(quanteda)
library(dplyr)
library(stringr)

## a) Load the bibliographic database
base <- openxlsx::read.xlsx("Clase6/Material/base.xlsx")

## b) Tokenize the column that holds the keywords, which are separated by ";"
toks <- tokens(base$Author.Keywords, remove_punct = TRUE)

## c) Split the keywords into one column per keyword with tidyr::separate().
## `into` needs one name per output column, so the number of columns is taken
## from the record with the most keywords (at least one column).
n_claves <- max(
  lengths(strsplit(as.character(base$Author.Keywords), ";", fixed = TRUE)),
  1
)
cols_claves <- paste0("X", seq_len(n_claves))
convars <- base %>%
  tidyr::separate(col = Author.Keywords, into = cols_claves, sep = ";",
                  fill = "right")

## d) Gather every keyword into a single vector, trim leading/trailing
## whitespace with str_trim(), and drop the NA padding, blanks and duplicates.
palabras <- str_trim(unlist(convars[cols_claves], use.names = FALSE))
palabras <- unique(palabras[!is.na(palabras) & palabras != ""])

## e) Build a dictionary from the keyword vector
palabras <- quanteda::dictionary(list(palabras = palabras))

## f) Build a DFM from the keywords, applying quanteda::tokens_compound() so
## that multi-word terms listed in `palabras` become single features.
## The resulting DFM is tf-idf weighted.
toks_claves <- quanteda::tokens(base$Author.Keywords,
                                remove_punct = TRUE,
                                remove_numbers = TRUE)
dfm <- quanteda::dfm(quanteda::tokens_compound(toks_claves, palabras),
                     tolower = TRUE, verbose = FALSE) %>%
  dfm_tfidf()

topfeatures(dfm, 10)

## g) Build the co-occurrence matrix and plot a network of the top terms
base_fcm <- dfm %>%
  fcm(context = "document")

feat <- names(topfeatures(base_fcm, 150)) ## change this to keep more/fewer terms
base_fcm_select <- fcm_select(base_fcm, pattern = feat, selection = "keep")
# Vertex sizes come from the DFM (not the co-occurrence matrix), indexed by
# the selected features so that they line up with the plotted vertices.
size <- log(colSums(dfm[, colnames(base_fcm_select)]))
quanteda.textplots::textplot_network(base_fcm_select, min_freq = 0.8,
                                     vertex_size = size / max(size) * 3,
                                     edge_color = "#eb6864")



##2. Diccionarios de sentimientos. Método Syuzhet

#https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
#https://arxiv.org/pdf/1901.08319.pdf

#install.packages("syuzhet")
library(syuzhet)


# Label every tweet with its party and stack both data sets.
tweets_fa$screen_name <- "Frente Amplio"
tweets_pn$screen_name <- "Partido Nacional"
tweets_df <- rbind(tweets_fa, tweets_pn)

# NRC scores per tweet, using the Spanish lexicon.
Sentiment <- get_nrc_sentiment(tweets_df$full_text, language = "spanish")

tweets_df_senti <- cbind(tweets_df, Sentiment)

## The sentiment is defined from the difference between the positive and
## negative scores: below zero -> "Negativo", zero -> "Neutral",
## above zero -> "Positivo".
tweets_df_senti$puntaje <- tweets_df_senti$positive - tweets_df_senti$negative
# sign() yields -1 / 0 / 1, which is shifted to index 1 / 2 / 3.
tweets_df_senti$sentimiento <-
  c("Negativo", "Neutral", "Positivo")[sign(tweets_df_senti$puntaje) + 2]

# Number of tweets per party and sentiment, and the percentage each
# sentiment represents within its party.
tweets_sent <- tweets_df_senti %>%
  group_by(screen_name, sentimiento) %>%
  summarise(count = n()) %>%
  mutate(per = round(prop.table(count) * 100, 1))

## Plot: percentage of tweets per sentiment, side by side for each party

# Colours are named by sentiment so each class keeps its colour even when
# one of the classes is missing from the data.
colores_sentimiento <- c(
  "Negativo" = "#EB594D",
  "Neutral"  = "#FFFAA4",
  "Positivo" = "#98E898"
)

ggplot(tweets_sent, aes(x = screen_name, y = per, fill = sentimiento)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = colores_sentimiento) +
  # Percentage labels sit just above their bars; the dodge width matches the
  # default bar width so labels and bars stay aligned.
  geom_text(aes(label = per),
            position = position_dodge(width = 0.9), vjust = -0.25)


# Absolute count of tweets for each score value and party.
# `per` is the percentage each party represents within a given score value,
# because summarise() leaves the data grouped by `puntaje` only.
tweets_sent_puntaje <- tweets_df_senti %>%
  group_by(puntaje, screen_name) %>%
  summarise(count = n()) %>%
  mutate(per = round(prop.table(count) * 100, 1))

# Bars per score value, side by side for each party, labelled with the count.
ggplot(tweets_sent_puntaje, aes(x = puntaje, y = count, fill = screen_name)) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(values = c("#df4a4a", "#add8e6")) +
  geom_text(aes(label = count),
            position = position_dodge(width = 1.5),
            vjust = -0.25)



## Plot: overall sentiment per party.
## All tweets of a party are pasted into a single text, which is scored once
## with the syuzhet method. The plot is stored in `a`; it is not printed here.

a <- tweets_df_senti %>%
  group_by(screen_name) %>%
  summarise(text_p = paste(full_text, collapse = " ")) %>%
  mutate(Sentiment_syuzhet = syuzhet::get_sentiment(text_p,
                                                    method = "syuzhet",
                                                    language = "spanish")) %>%
  # `levels` is spelled out in full (no partial argument matching) and fixes
  # the order of the parties on the x axis.
  ggplot(aes(x = factor(screen_name,
                        levels = c("Frente Amplio", "Partido Nacional")),
             y = Sentiment_syuzhet, color = screen_name)) +
  geom_point(size = 5, alpha = 0.8) +
  ggtitle("Análisis de sentimiento por Partido") +
  # Reference line at zero separates net positive from net negative scores.
  geom_hline(yintercept = 0, color = "#4F4D4D") +
  theme_minimal() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")



##3. Modelado de topicos 
library(topicmodels)

# Convert the Frente Amplio dfm to the document-term format used by
# topicmodels.
dtm <- convert(dfm_fa, to = "topicmodels")

# Fit a 4-topic LDA model. A fixed seed is passed so that the fitted topics
# are the same every time the script is run.
lda <- LDA(dtm, k = 4, control = list(seed = 1234))

# Show the 10 most likely terms of each topic.
get_terms(lda, 10)