# Outlier removal by the Tukey rules on quartiles +/- 1.5 IQR
# 2017 Klodian Dhana
#
# outlierKD(dt, var)
#
# Shows a 2x2 panel (boxplot + histogram, with and without outliers) for one
# column of a data frame, reports summary figures via message(), and asks
# the user whether the outliers should be replaced with NA.
#
# Arguments:
#   dt   A data frame (or list) in which `var` is evaluated.
#   var  Unquoted column name in `dt`.
#
# Value (always invisible):
#   * answer "y"/"yes": the modified `dt`; it is also written back to the
#     global environment under the name `dt` was passed as (only possible
#     when `dt` was passed as a bare variable name).
#   * any other answer: the vector with outliers replaced by NA; nothing is
#     modified.
#
# Side effects: draws four plots on the current graphics device, emits
# messages, prompts with readline(). The graphical parameters changed for
# the panel layout are restored when the function exits.
outlierKD <- function(dt, var) {
  # Capture the unevaluated arguments up front: once `dt` is modified below
  # it is no longer a promise and substitute() would return its value.
  dt_expr <- substitute(dt)
  var_expr <- substitute(var)

  var_name <- eval(var_expr, dt)
  tot <- sum(!is.na(var_name))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)

  # par() returns the previous values; restore them on exit so the 2x2
  # layout does not leak into the caller's subsequent plots.
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)

  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)

  # Indexed assignment (rather than ifelse()) keeps the vector's attributes.
  var_name[var_name %in% outlier] <- NA

  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  na2 <- sum(is.na(var_name))
  message("Outliers identified: ", na2 - na1, " from ", tot, " observations")
  message("Proportion (%) of outliers: ", (na2 - na1) / tot * 100)
  message("Mean of the outliers: ", mo)
  m2 <- mean(var_name, na.rm = TRUE)
  message("Mean without removing outliers: ", m1)
  message("Mean if we remove outliers: ", m2)

  response <- readline(
    prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: "
  )
  # Accept the answer regardless of case or surrounding whitespace.
  confirmed <- tolower(trimws(response)) %in% c("y", "yes")

  if (!confirmed) {
    message("Nothing changed", "\n")
    return(invisible(var_name))
  }

  dt[as.character(var_expr)] <- var_name

  # NOTE: writing into the global environment is the documented behaviour of
  # this helper and is kept for existing callers. It can only work when `dt`
  # was given as a plain variable name; otherwise there is no name to assign.
  if (is.name(dt_expr)) {
    assign(as.character(dt_expr), dt, envir = .GlobalEnv)
  } else {
    warning(
      "`dt` was not passed as a variable name; the global environment was ",
      "not updated. Use the returned value instead.",
      call. = FALSE
    )
  }

  message("Outliers successfully removed", "\n")
  invisible(dt)
}