library(text2vec)
library(Matrix)

Part 1 - Supervised text classification with R

Load the corpus

corpus <- read.csv('../Data/reviews.csv', stringsAsFactors=FALSE)
corpus$sentiment <- as.factor(corpus$sentiment)

Split the corpus into a training set and a test set

n_row_train <- round(nrow(corpus)*0.6)
cat("Sampling", n_row_train, "comments randomly to train the model, keeping the rest for evaluation")
## Sampling 1200 comments randomly to train the model, keeping the rest for evaluation
set.seed(42)
train_ids <- sample(corpus$doc_id, n_row_train)
test_ids <- setdiff(corpus$doc_id, train_ids)
train <- corpus[train_ids, ]
test <- corpus[test_ids, ]

Extract the raw vocabulary from the training set

iterator_train <- itoken(train$text,
                         preprocessor=tolower,
                         tokenizer=word_tokenizer,
                         progressbar=FALSE)
vocabulary <- create_vocabulary(iterator_train)
n_words <- nrow(vocabulary)
n_tokens <- sum(vocabulary$term_count)
cat("Number of distinct words:", n_words, "\nNumber of tokens:", n_tokens)
## Number of distinct words: 34510 
## Number of tokens: 779427

Vectorize both the training and test sets using this vocabulary

vectorizer = vocab_vectorizer(vocabulary)
train_dtm = create_dtm(iterator_train, vectorizer)
iterator_test <- itoken(test$text,
                        preprocessor=tolower,
                        tokenizer=word_tokenizer,
                        progressbar=FALSE)
test_dtm = create_dtm(iterator_test, vectorizer)

Implement a multinomial naive Bayse binary classifier

Implement a function to estimate the parameters of the naive Bayse model with Laplace smoothing

mle_mnb <- function(X, Y, k){
  d <- ncol(X)
  q_pos = length(Y[which(Y == 'pos')]) / (length(Y))
  q_neg = length(Y[which(Y == 'neg')]) / (length(Y))
  X_pos <- X[which(Y == 'pos'), ]
  q_j_pos <- (colSums(X_pos) + k) / (sum(X_pos) + d * k)
  X_neg <- X[which(Y == 'neg'), ]
  q_j_neg <- (colSums(X_neg) + k) / (sum(X_neg) + d * k)
  q <- rbind(c(q_pos, q_j_pos), c(q_neg, q_j_neg))
  return(q)
}

Implement a function to train and evaluate the classifier

evaluate_mnb <- function(X, Y, k, Z){
  log_q <- log(mle_mnb(X, Y, k))
  log_ratios <- apply(Z, 1, function(z) c(1, z) %*% log_q[1, ] - c(1, z) %*% log_q[2, ])
  predictions <- sapply(log_ratios, function(lr) if(lr < 0){'neg'}else{'pos'})
  confusion_matrix <- as.matrix(table(predictions, test$sentiment))
  return((confusion_matrix[1,1] + confusion_matrix[2,2]) / sum(confusion_matrix))
}

Assess the overall accuracy of this classifier for differents values of \(k\)

With the whole vocabulary

acc <- list()
for(k in 1:5){
  acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm)
}
plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy")

With a pruned vocabulary

pruned_vocabulary <- prune_vocabulary(vocabulary, doc_proportion_max = 0.5, term_count_min = 10)
n_words <- nrow(pruned_vocabulary)
n_tokens <- sum(pruned_vocabulary$term_count)
cat("Number of distinct words:", n_words, "\nNumber of tokens:", n_tokens)
## Number of distinct words: 6612 
## Number of tokens: 361175
vectorizer <- vocab_vectorizer(pruned_vocabulary)
train_dtm <- create_dtm(iterator_train, vectorizer)
iterator_test <- itoken(test$text,
                        preprocessor=tolower,
                        tokenizer=word_tokenizer,
                        progressbar=FALSE)
test_dtm <- create_dtm(iterator_test, vectorizer)
acc <- list()
for(k in 1:5){
  acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm)
}
plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy")