Part 1 - Supervised text classification with R
Load the corpus
corpus <- read.csv('../Data/reviews.csv', stringsAsFactors=FALSE)
corpus$sentiment <- as.factor(corpus$sentiment)
Split the corpus into a training set and a test set
n_row_train <- round(nrow(corpus)*0.6)
cat("Sampling", n_row_train, "comments randomly to train the model, keeping the rest for evaluation")
## Sampling 1200 comments randomly to train the model, keeping the rest for evaluation
set.seed(42)
train_ids <- sample(corpus$doc_id, n_row_train)
test_ids <- setdiff(corpus$doc_id, train_ids)
train <- corpus[train_ids, ]
test <- corpus[test_ids, ]
Vectorize both the training and test sets using this vocabulary
vectorizer = vocab_vectorizer(vocabulary)
train_dtm = create_dtm(iterator_train, vectorizer)
iterator_test <- itoken(test$text,
preprocessor=tolower,
tokenizer=word_tokenizer,
progressbar=FALSE)
test_dtm = create_dtm(iterator_test, vectorizer)
Implement a multinomial naive Bayse binary classifier
Implement a function to estimate the parameters of the naive Bayse model with Laplace smoothing
mle_mnb <- function(X, Y, k){
d <- ncol(X)
q_pos = length(Y[which(Y == 'pos')]) / (length(Y))
q_neg = length(Y[which(Y == 'neg')]) / (length(Y))
X_pos <- X[which(Y == 'pos'), ]
q_j_pos <- (colSums(X_pos) + k) / (sum(X_pos) + d * k)
X_neg <- X[which(Y == 'neg'), ]
q_j_neg <- (colSums(X_neg) + k) / (sum(X_neg) + d * k)
q <- rbind(c(q_pos, q_j_pos), c(q_neg, q_j_neg))
return(q)
}
Implement a function to train and evaluate the classifier
evaluate_mnb <- function(X, Y, k, Z){
log_q <- log(mle_mnb(X, Y, k))
log_ratios <- apply(Z, 1, function(z) c(1, z) %*% log_q[1, ] - c(1, z) %*% log_q[2, ])
predictions <- sapply(log_ratios, function(lr) if(lr < 0){'neg'}else{'pos'})
confusion_matrix <- as.matrix(table(predictions, test$sentiment))
return((confusion_matrix[1,1] + confusion_matrix[2,2]) / sum(confusion_matrix))
}
Assess the overall accuracy of this classifier for differents values of \(k\)
With the whole vocabulary
acc <- list()
for(k in 1:5){
acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm)
}
plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy")
With a pruned vocabulary
pruned_vocabulary <- prune_vocabulary(vocabulary, doc_proportion_max = 0.5, term_count_min = 10)
n_words <- nrow(pruned_vocabulary)
n_tokens <- sum(pruned_vocabulary$term_count)
cat("Number of distinct words:", n_words, "\nNumber of tokens:", n_tokens)
## Number of distinct words: 6612
## Number of tokens: 361175
vectorizer <- vocab_vectorizer(pruned_vocabulary)
train_dtm <- create_dtm(iterator_train, vectorizer)
iterator_test <- itoken(test$text,
preprocessor=tolower,
tokenizer=word_tokenizer,
progressbar=FALSE)
test_dtm <- create_dtm(iterator_test, vectorizer)
acc <- list()
for(k in 1:5){
acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm)
}
plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy")