# Early Prediction of University Dropouts - A Random Forest Approach
# Andreas Behr, Marco Giese, Herve D. Teguim K., Katja Theune

# The following code is an extract of the original code, which comprises
# nearly 5,000 lines of R code.

# Load the following packages. If a package is not installed on your computer,
# use install.packages("package.name") for installation.
library(dplyr)
library(ROCR)
# The cforest function is included in the "party" package.
library(party)
library(caret)

#############################################################################
# The data

# In the file "Data Description" we briefly describe which variables are used
# in this study, including the original variable name, the new variable name
# in our dataset, the wave (we generally used the first available wave, since
# we are interested in predicting dropout as early as possible), the variable
# label and the range of values of the variables (this is explained in more
# detail in the codebook, which is available on the NEPS homepage). Since we
# are not allowed to hand out the prepared data, we provide a detailed
# description of how the dataset was prepared. We saved the pre-processed
# data in .csv format.

# Three datasets are obtained separately after pre-processing; they represent
# different episodes of study.

# Load the prestudy data:
prestudy <- read.csv2(file = "filename.csv")

# The study-related determinants are added:
studyrel <- read.csv2(file = "filename.csv")

# Merge the study-related variables with the prestudy variables and drop the
# ID column (column 1) as well as column 23:
studyrel <- merge(prestudy, studyrel, by = "ID_t", all = TRUE)
studyrel2 <- studyrel[, -c(1, 23)]

# Load the early study phase variables and merge them with the prestudy and
# study-related variables:
early_study <- read.csv2(file = "filename.csv")
# Re-attach the ID column, which is needed as the merge key:
a <- cbind(studyrel[, 1], studyrel2)
colnames(a)[1] <- "ID_t"
early_study2 <- merge(a, early_study[, -(33:36)], by = "ID_t", all = TRUE)
# Drop the ID column again:
early_study2 <- early_study2[, -1]

# The next dataset contains the merged datasets and additional information
# about panel attrition:
data.d <- read.csv2(file = "filename.csv")

##############################################################################
# The results

# The general loops are nearly the same for each model, so we give one
# example; the other calculations are mainly "copy and paste" of this code,
# whereby only the subject group and the dataset (prestudy, study-related or
# early study) have to be changed.

# This is one example, just for the Engineering students and the early study
# dataset. This code needs several hours of computation time.
early_study3 <- early_study2[which(early_study2$current_subject ==
                                     "Ingenieurwissenschaften"), -c(22, 23)]
set.seed(1909)

# The folds for the 10-fold cross-validation:
folds <- cut(seq(1, nrow(early_study3)), breaks = 10, labels = FALSE)

# Empty confusion matrix (one 2 x 2 slice per fold, 200 in total):
confMat <- array(numeric(2*2*200), dim = c(2, 2, 200))

# Empty vector for the AUC:
AUC <- numeric(200)

# Values for the ROC curve. One set of coordinates is stored per fold; the
# ROC curve is later averaged over all 200 folds (20 repetitions x 10 folds):
x <- array(numeric(2*200*(floor(nrow(early_study3) / 10) - 10)),
           c(floor(nrow(early_study3) / 10) - 10, 2, 200))
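# A small illustration, not part of the original analysis: cut() assigns the
# row positions to ten roughly equally sized, contiguous blocks, which is why
# the rows are shuffled before each replication below. For example, with 23
# observations (folds_demo is a purely illustrative object):
folds_demo <- cut(seq(1, 23), breaks = 10, labels = FALSE)
table(folds_demo)
# folds_demo
#  1  2  3  4  5  6  7  8  9 10
#  3  2  2  2  3  2  2  2  2  3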
# We compute a 20-times repeated 10-fold cross-validation:
for(h in 1:20){
  # Shuffle the rows for each replication, to ensure that the observations
  # are assigned to the folds at random:
  early_study3 <- early_study3[sample(nrow(early_study3)), ]
  for(i in 1:10){
    # Which cross-validation group is used as the test set?
    testIndexes <- which(folds == i)
    # Model specification (mtry is set to the square root of the number of
    # predictors, here 42):
    model <- cforest(formula = status ~ ., data = early_study3[-testIndexes, ],
                     control = cforest_control(ntree = 100, maxsurrogate = 5,
                                               mtry = round(sqrt(42))))
    # Make predictions for the test set using our fitted model:
    pred <- predict(model, newdata = early_study3[testIndexes, ], type = "prob")
    pred1 <- matrix(numeric(length(testIndexes)*2), ncol = 2)
    for(j in 1:length(testIndexes)) pred1[j, ] <- pred[[j]]
    # Adjust the classification threshold to the observed share of graduates:
    tr <- sum(early_study3$status == "graduate") / nrow(early_study3)
    # Make the class prediction with the threshold from above. The levels are
    # fixed via factor() -- note that assigning a reordered vector to
    # levels() would relabel, and thereby swap, the two classes:
    Klasse <- factor(ifelse(pred1[, 2] < tr, "dropout", "graduate"),
                     levels = c("dropout", "graduate"))
    # Fill the entries of the confusion matrix:
    confMat[, , i+10*(h-1)] <- table(Klasse, early_study3$status[testIndexes])
    # Calculate the area under the ROC curve (AUC):
    AUC[i+10*(h-1)] <- ROCR::performance(prediction(pred1[, 2],
                         early_study3$status[testIndexes]),
                         measure = "auc")@y.values[[1]]
    # Store the coordinates to later draw the average ROC curve. The FPR and
    # TPR values are subsampled to a common length and sorted, so that the
    # curves of all 200 folds can be averaged elementwise:
    x[, 1, i+10*(h-1)] <- sort(sample(ROCR::performance(prediction(pred1[, 2],
                            early_study3$status[testIndexes]), "tpr",
                            "fpr")@x.values[[1]],
                            floor(nrow(early_study3) / 10) - 10))
    x[, 2, i+10*(h-1)] <- sort(sample(ROCR::performance(prediction(pred1[, 2],
                            early_study3$status[testIndexes]), "tpr",
                            "fpr")@y.values[[1]],
                            floor(nrow(early_study3) / 10) - 10))
  }
}

# Average the entries to calculate a mean (smooth) ROC curve:
o1 <- apply(x, c(1, 2), mean)

# Calculate the average confusion matrix over the 20 iterations:
confMatr <- diag(2)
for(i in 1:2){
  for(j in 1:2) confMatr[i, j] <- mean(confMat[i, j, ])}
confusionMatrix(as.table(confMatr))
# (rows: predicted class, columns: observed status)
#            Dropouts Graduates
# Dropouts       12.8      27.7
# Graduates       5.1     107.5

# Final results:
# AUC:
mean(AUC)
# 0.8292763

# Accuracy:
confusionMatrix(as.table(confMatr))$overall[1]

# Sensitivity and precision:
confusionMatrix(as.table(confMatr))$byClass[c(1, 5)]

# Store the confusion matrix (the array holds one 2 x 2 matrix per subject
# group):
KM3 <- array(numeric(20), c(2, 2, 5))
KM3[, , 1] <- as.table(confMatr)

# Variable importance. Here we use 500 trees to reduce the variance.
# We do not need a train-test procedure here, so we use all observations:
imp <- varimpAUC(cforest(formula = status ~ ., data = early_study3,
                         control = cforest_unbiased(ntree = 500,
                                                    maxsurrogate = 5,
                                                    mtry = round(sqrt(81)))))
sort(imp, decreasing = TRUE)

# Save the results in the matrix IMP3 (two columns per subject group: the
# importance ranks and the relative importances):
IMP3 <- matrix(numeric(5*2*81), ncol = 10)
rownames(IMP3) <- names(imp)
colnames(IMP3) <- rep(c("Engin", "Math", "Law", "Ling", "All"), each = 2)
IMP3[, 1] <- rank(imp)
IMP3[, 2] <- imp / sum(imp)

# Draw the ROC curve:
plot(o1, main = "ROC-curves", xlab = "False Positive Rate",
     ylab = "True Positive Rate", type = "l", col = "black", lwd = 2)
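# Optionally, a diagonal reference line for a random classifier and the mean
# AUC can be added to the plot (a small addition, not part of the original
# output):
abline(a = 0, b = 1, lty = 2, col = "grey")
legend("bottomright", legend = paste("Mean AUC:", round(mean(AUC), 3)),
       bty = "n")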
##############################################################################
# Since panel attrition is a big problem for our data, we check the
# robustness of our results.

b <- which(data.d$status == "graduate" | data.d$status == "dropout")
# The AUC values of the 10 folds of one cross-validation run:
AUC <- numeric(10)

# Save the values in an array:
AUC_all <- array(numeric(3*5*20), dim = c(3, 5, 20))
dimnames(AUC_all) <- list(c("prestudy", "studyrel", "early_study"),
                          c("Ingenieurwissenschaften",
                            "Mathematik, Naturwissenschaften",
                            "Rechts-, Wirtschafts- und Sozialwissenschaften",
                            "Sprach- und Kulturwissenschaften",
                            "General"),
                          1:20)

# Watch the progress of the calculation:
pb <- txtProgressBar(min = 0, max = 3*5*20, style = 3)

# We use 20 repetitions to reduce variance:
for(k in 1:20){
  # Loop over the three datasets:
  for(m in 1:3){
    data2 <- get(paste(rownames(AUC_all)[m], "2", sep = ""))
    # Loop over the four fields of study and the general model:
    for(j in 1:5){
      if(j < 5){
        a <- which(data2$current_subject == colnames(AUC_all)[j])
        data <- data2[a, -c(22, 23)]
        # Here we use only the observations that are not final panel
        # dropouts. For a second calculation using only the panel dropouts,
        # set this to "yes":
        data <- data[which(data.d$attrition[b[a]] == "no"), ]
      } else{
        # The general model uses all 10,010 observations:
        a <- 1:10010
        data <- data2[, -c(22, 23)]
      }
      # Shuffle the order of the observations:
      data <- data[sample(nrow(data)), ]
      folds <- cut(seq(1, nrow(data)), breaks = 10, labels = FALSE)
      for(i in 1:10){
        testIndexes <- which(folds == i)
        # Model fitting:
        model <- cforest(formula = status ~ ., data = data[-testIndexes, ],
                         control = cforest_unbiased(ntree = 100,
                                                    maxsurrogate = 5,
                                                    mtry = round(sqrt(ncol(data)-1))))
        # Out-of-sample prediction with the test data:
        pred <- predict(model, newdata = data[testIndexes, ], type = "prob")
        pred1 <- matrix(numeric(length(testIndexes)*2), ncol = 2)
        for(h in 1:length(testIndexes)) pred1[h, ] <- pred[[h]]
        # Calculation of the AUC:
        AUC[i] <- ROCR::performance(prediction(pred1[, 2],
                    data$status[testIndexes]), measure = "auc")@y.values[[1]]
      }
      # Average the AUC values over the 10 CV folds to reduce variance:
      AUC_all[m, j, k] <- mean(AUC)
      setTxtProgressBar(pb, j+(m-1)*5+(k-1)*15)
    }
  }
}

# Average the AUC values over the 20 repetitions:
apply(AUC_all, c(1, 2), mean)
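# Besides the means, the spread of the AUC values across the 20 repetitions
# may be of interest (a small addition, not part of the original output):
apply(AUC_all, c(1, 2), sd)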