--- title: "Lab 2 - Supervised text classification" author: "Adrien Guille" date: "10/10/2018" output: html_document --- ```{r} library(text2vec) library(Matrix) ``` # Part 1 - Supervised text classification with R ## Load the corpus ```{r} corpus <- read.csv('../Data/reviews.csv', stringsAsFactors=FALSE) corpus$sentiment <- as.factor(corpus$sentiment) ``` ## Split the corpus into a training set and a test set ```{r} n_row_train <- round(nrow(corpus)*0.6) cat("Sampling", n_row_train, "comments randomly to train the model, keeping the rest for evaluation") set.seed(42) train_ids <- sample(corpus$doc_id, n_row_train) test_ids <- setdiff(corpus$doc_id, train_ids) train <- corpus[train_ids, ] test <- corpus[test_ids, ] ``` ## Extract the raw vocabulary from the training set ```{r} iterator_train <- itoken(train$text, preprocessor=tolower, tokenizer=word_tokenizer, progressbar=FALSE) vocabulary <- create_vocabulary(iterator_train) n_words <- nrow(vocabulary) n_tokens <- sum(vocabulary$term_count) cat("Number of distinct words:", n_words, "\nNumber of tokens:", n_tokens) ``` ## Vectorize both the training and test sets using this vocabulary ```{r} vectorizer = vocab_vectorizer(vocabulary) train_dtm = create_dtm(iterator_train, vectorizer) iterator_test <- itoken(test$text, preprocessor=tolower, tokenizer=word_tokenizer, progressbar=FALSE) test_dtm = create_dtm(iterator_test, vectorizer) ``` ## Implement a multinomial naive Bayse binary classifier ### Implement a function to estimate the parameters of the naive Bayse model with Laplace smoothing ```{r} mle_mnb <- function(X, Y, k){ d <- ncol(X) q_pos = length(Y[which(Y == 'pos')]) / (length(Y)) q_neg = length(Y[which(Y == 'neg')]) / (length(Y)) X_pos <- X[which(Y == 'pos'), ] q_j_pos <- (colSums(X_pos) + k) / (sum(X_pos) + d * k) X_neg <- X[which(Y == 'neg'), ] q_j_neg <- (colSums(X_neg) + k) / (sum(X_neg) + d * k) q <- rbind(c(q_pos, q_j_pos), c(q_neg, q_j_neg)) return(q) } ``` ### Implement a function to train and evaluate the classifier ```{r} evaluate_mnb <- function(X, Y, k, Z){ log_q <- log(mle_mnb(X, Y, k)) log_ratios <- apply(Z, 1, function(z) c(1, z) %*% log_q[1, ] - c(1, z) %*% log_q[2, ]) predictions <- sapply(log_ratios, function(lr) if(lr < 0){'neg'}else{'pos'}) confusion_matrix <- as.matrix(table(predictions, test$sentiment)) return((confusion_matrix[1,1] + confusion_matrix[2,2]) / sum(confusion_matrix)) } ``` ## Assess the overall accuracy of this classifier for differents values of $k$ ### With the whole vocabulary ```{r} acc <- list() for(k in 1:5){ acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm) } plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy") ``` #### With a pruned vocabulary ```{r} pruned_vocabulary <- prune_vocabulary(vocabulary, doc_proportion_max = 0.5, term_count_min = 10) n_words <- nrow(pruned_vocabulary) n_tokens <- sum(pruned_vocabulary$term_count) cat("Number of distinct words:", n_words, "\nNumber of tokens:", n_tokens) vectorizer <- vocab_vectorizer(pruned_vocabulary) train_dtm <- create_dtm(iterator_train, vectorizer) iterator_test <- itoken(test$text, preprocessor=tolower, tokenizer=word_tokenizer, progressbar=FALSE) test_dtm <- create_dtm(iterator_test, vectorizer) acc <- list() for(k in 1:5){ acc[k] <- evaluate_mnb(train_dtm, train$sentiment, k, test_dtm) } plot(c(1:5), acc, main="Accuracy vs k (i.e. smoothing constant)", xlab="Value of k", ylab="Accuracy") ```