library(readr)
library(parallel)
library(bench)
library(dplyr)
library(data.table)


### FUNCIONES ####
## leer_trozo: load one slice of rows from the database stored in `archivo`.
#
# - archivo: full or partial path on disk to the database file.
# - parte: number of the slice to read; the skip arithmetic below treats it
#   as 1-based (parte = 1 skips a single line).
# - longitud: number of rows in each slice.
#
# Returns the object produced by read_csv() for those rows. Column names are
# not taken from the file (col_names = FALSE) and col_types = cols() leaves
# the column types to readr's guessing.
leer_trozo <- function(archivo, parte, longitud) {
  # Lines to jump over: every row that belongs to the preceding slices plus
  # one extra line (presumably the header row -- confirm against the file).
  filas_previas <- (parte - 1) * longitud + 1

  read_csv(
    archivo,
    col_names = FALSE,
    col_types = cols(),
    skip = filas_previas,
    n_max = longitud
  )
}

## cuenta: count how many elements of x are not NA.
cuenta <- function(x) {
  no_faltantes <- !is.na(x)
  sum(no_faltantes)
}

## funCuentaTiempo: time how long it takes to read one slice of the file and
## count the non-NA entries in each of the columns that are kept.
#
# - archivo: path to the database file (forwarded to leer_trozo).
# - parte: number of the slice to read (forwarded to leer_trozo).
# - longitud: number of rows per slice (forwarded to leer_trozo).
# - quitar: positions of the columns to drop before counting.
#
# Returns a one-row data.frame with columns t1 (start time), t2 (end time)
# and d (elapsed seconds). The per-column counts are computed only so that
# their cost is part of the measured time; they are not returned.
funCuentaTiempo <- function(archivo, parte, longitud, quitar) {
  t1 <- Sys.time()

  M <- leer_trozo(archivo, parte, longitud)
  # `quitar` is a vector supplied by the caller, not a column of M. Wrapping
  # it in all_of() states that explicitly; handing a bare external vector to
  # a selection is deprecated by tidyselect. The call is namespace-qualified
  # so it also resolves on cluster workers where dplyr is not attached.
  M <- select(M, -dplyr::all_of(quitar))
  # apply() turns M into a matrix and calls cuenta() on every column. It is
  # kept as-is so the timed workload stays the same.
  conteos <- apply(M, 2, cuenta)

  t2 <- Sys.time()
  d <- as.numeric(difftime(t2, t1), units = "secs") # elapsed time in seconds
  data.frame(t1, t2, d)
}


#### PARAMETROS ####
k <- 10       # total number of parts to read
lon <- 10000  # number of rows in each part
# Positions of the database columns that are not numeric: they hold a label
# and must be left out of the calculations.
nonumer <- c(1, 2, 54, 55, 61, 63, 156, 158, 159, 162, 168)
# Path of the file that contains the database.
archivo1 <- "~/Desktop/R ZORA Ejemplos/CLASE 1/train_data.csv"


#### CASOS ####
######  Secuencial  #######
# Baseline: process the k parts one after another in this R session. The
# value of the benchmarked expression (one timing data.frame per part) is
# read back from the `result` column below.
# seq_len() yields an empty sequence when k is 0, whereas 1:k would run
# through c(1, 0) and request parts that do not exist.
ben_sec <- bench::mark(
  lapply(seq_len(k), funCuentaTiempo,
         archivo = archivo1, longitud = lon, quitar = nonumer),
  iterations = 1
)

# Stack the per-part timing data.frames into a single table and show it.
ben_sec_df <- rbindlist(ben_sec$result[[1]])
ben_sec_df
# tiempo en segundos
######  Paralelo  #######
# Start one worker per detected core. detectCores() returns NA when the
# count is unknown, so fall back to a single worker in that case.
cl <- makeCluster(max(1L, detectCores(), na.rm = TRUE))

# tryCatch(finally = ) stops the workers even when the export or the
# benchmark fails, so no worker processes are left behind.
ben_par <- tryCatch(
  {
    # funCuentaTiempo travels with the clusterApply() call; the functions it
    # looks up by name must exist on every worker.
    clusterExport(cl, c("leer_trozo", "read_csv", "select", "cuenta", "cols"))
    # seq_len() keeps the task list empty when k is 0 (1:k would give 1, 0).
    bench::mark(
      clusterApply(cl, seq_len(k), funCuentaTiempo,
                   archivo = archivo1, longitud = lon, quitar = nonumer),
      iterations = 1
    )
  },
  finally = stopCluster(cl)
)

# Stack the per-part timing data.frames into a single table and show it.
ben_par_df <- rbindlist(ben_par$result[[1]])
ben_par_df


#####  Paralelo LB #######
# Start one worker per detected core. detectCores() returns NA when the
# count is unknown, so fall back to a single worker in that case.
cl <- makeCluster(max(1L, detectCores(), na.rm = TRUE))

# tryCatch(finally = ) stops the workers even when the export or the
# benchmark fails, so no worker processes are left behind.
ben_parLB <- tryCatch(
  {
    # funCuentaTiempo travels with the clusterApplyLB() call; the functions
    # it looks up by name must exist on every worker.
    clusterExport(cl, c("leer_trozo", "read_csv", "select", "cuenta", "cols"))
    # Same work as the plain parallel case, dispatched through the
    # load-balancing variant of clusterApply().
    # seq_len() keeps the task list empty when k is 0 (1:k would give 1, 0).
    bench::mark(
      clusterApplyLB(cl, seq_len(k), funCuentaTiempo,
                     archivo = archivo1, longitud = lon, quitar = nonumer),
      iterations = 1
    )
  },
  finally = stopCluster(cl)
)

# Stack the per-part timing data.frames into a single table and show it.
ben_parLB_df <- rbindlist(ben_parLB$result[[1]])
ben_parLB_df


# Put the total benchmark time of the three strategies side by side, one
# column per strategy.
tiempos <- cbind(ben_sec$total_time, ben_par$total_time, ben_parLB$total_time)
dimnames(tiempos) <- list(NULL, c("Secuencial", "Paralelo", "Paralelo LB"))


# Show the per-part timings of each strategy, then the totals.
ben_sec_df
ben_par_df
ben_parLB_df

tiempos