# Early Prediction of University Dropouts - A Random Forest Approach
# Andreas Behr, Marco Giese, Herve D. Teguim K., Katja Theune
# The following code is an extract of the original code, that contains nearly
# 5,000 lines of R-code.
# Load the following packages. If the package is not installed on your computer,
# use install.packages("package.name") for installation.
library(dplyr)
library(ROCR)
# The cforest function is included in the "party"-package.
library(party)
library(caret)
#############################################################################
# The data
# In the file “Data Description” we briefly describe which variables are used in
# this study, including the original variable name, the new variable name in our
# dataset, the wave (we generally used the first available wave, since we are
# interested in dropout prediction as early as possible), variable label
# and the range of values of the variables (this is additionally explained in
# more detail in the codebook which is available on the NEPS homepage). Since
# we are not allowed to hand out the prepared data, we provide a detailed
# description of how the dataset was prepared. We saved the pre-processed data as
# .csv format
# Three datasets are obtained separately after pre-processing and represent different episodes of study.
# Loading the prestudy data:
prestudy <- read.csv2(file = "filename.csv")
# Study-related determinants are added
studyrel <- read.csv2(file = "filename.csv")
# Merge the study-related variables with the prestudy variables
studyrel <- merge(prestudy, studyrel, by = "ID_t", all = T)
studyrel2 <- studyrel[, -c(1, 23)]
# Load the early study phase variables and merge them with the prestudy and
# study-related variables:
early_study <- read.csv2(file = "filename.csv")
a <- cbind(studyrel[,1], studyrel2)
colnames(a)[1] <- "ID_t"
early_study2 <- merge(a, early_study[, -(33:36)], by = "ID_t", all = T)
early_study2 <- early_study2[,-1]
# The next dataset contains the merged datasets and additional information
# about panel attrition:
data.d <- read.csv2(file = "filename.csv")
##############################################################################
# The results
# The general loops are for each model nearly the same, so we give
# one example, the other calculations are mainly "copy and paste" of this code,
# whereby the subject group and the data set (prestudy, study-related or
# early study) have to be changed.
# This is one example just for the Engineering students and the early study
# dataset. This code needs some hours of calculation time.
early_study3 <- early_study2[which(early_study2$current_subject == "Ingenieurwissenschaften"), -c(22,23)]
set.seed(1909)
# The folds for the 10-fold cross-validation:
folds <- cut(seq(1,nrow(early_study3)),breaks=10,labels=FALSE)
# Empty confusion matrix:
confMat <- array(numeric(2*2*200), dim = c(2, 2, 200))
# Empty vector for the AUC:
AUC <- numeric(200)
# Values for the ROC-curve. The ROC-curve will be averaged over the 20 iterations.
x <- array(numeric(2*200*(floor(nrow(early_study3) / 10)-10)),
c(floor(nrow(early_study3) / 10)-10, 2, 200))
# We compute a 20-fold repeated 10-fold cross-validation:
for(h in 1:20){
# Randomly changing the rows for each replication, to ensure that we assign
# the observations to the groups randomly:
early_study3 <- early_study3[sample(nrow(early_study3)),]
for(i in 1:10){
# Which cross-validation group is used as test-set?
testIndexes <- which(folds==i)
# Model specification:
model <- cforest(formula = status ~ ., data = early_study3[-testIndexes, ],
control = cforest_control(ntree = 100, maxsurrogate = 5,
mtry = round(sqrt(42))))
# Make predictions for the test-set using our fitted model:
pred <- predict(model, newdata = early_study3[testIndexes,], type = "prob")
pred1 <- matrix(numeric(length(testIndexes)*2), ncol = 2)
for(j in 1:length(testIndexes))
pred1[j,] <- pred[[j]]
# Adjusting the threshold:
tr <- sum(early_study3$status == "graduate") / nrow(early_study3)
# Make the class prediction with the threshold from above:
Klasse <- as.factor(ifelse(pred1[,2] < tr, "dropout", "graduate"))
levels(Klasse) <- c("graduate", "dropout")
# Filling the entries of the confusion matrix:
confMat[,,i+10*(h-1)] <- table(Klasse, early_study3$status[testIndexes])
# Calculating the Area under the ROC-curve (AUC):
AUC[i+10*(h-1)] <- ROCR::performance(prediction(pred1[,2], early_study3$status[testIndexes]),
measure = "auc")@y.values[[1]]
# Filling the entries to later draw the average ROC-curve
x[, 1, i+10*(h-1)] <- sort(sample(ROCR::performance(prediction(pred1[,2],
early_study3$status[testIndexes]),
"tpr", "fpr")@x.values[[1]], floor(nrow(early_study3) / 10)-10))
x[, 2, i+10*(h-1)] <- sort(sample(ROCR::performance(prediction(pred1[,2],
early_study3$status[testIndexes]),
"tpr", "fpr")@y.values[[1]], floor(nrow(early_study3) / 10)-10))
}
}
# Averaging the entries, to calculate a mean (smooth) ROC-curve:
o1 <- apply(x, c(1, 2), mean)
# Calculating the average confusion matrix over the 20 iterations:
confMatr <- diag(2)
for(i in 1:2){
for(j in 1:2)
confMatr[i, j] <- mean(confMat[i, j,])}
confusionMatrix(as.table(confMatr))
# Dropouts Graduates
# Dropouts 12.8 27.7
# Graduates 5.1 107.5
# Final results:
# AUC:
mean(AUC) # 0.8292763
# Accuracy:
confusionMatrix(as.table(confMatr))$overall[1]
# Sensitivity and Precision
confusionMatrix(as.table(confMatr))$byClass[c(1, 5)]
# Confusion matrix:
KM3 <- array(numeric(20), c(2, 2, 5))
KM3[,,1] <- as.table(confMatr)
# Variable importance. Here we use 500 trees to reduce the variance.
# We do not need a train-test procedure here, so we use all observations:
imp <- varimpAUC(cforest(formula = status ~ ., data = early_study3,
control = cforest_unbiased(ntree = 500, maxsurrogate = 5,
mtry = round(sqrt(81)))))
sort(imp, decreasing = T)
# Save the results in the matrix IMP3
IMP3 <- matrix(numeric(5*2*81), ncol = 10)
rownames(IMP3) <- names(imp)
colnames(IMP3) <- rep(c("Engin", "Math", "Law", "Ling", "All"), each = 2)
IMP3[,1] <- rank(imp)
IMP3[,2] <- imp / sum(imp)
# Drawing the ROC-curve:
plot(o1, main = "ROC-curves", xlab = "False Positive Rate",
ylab = "True Positive Rate",
type = "l", col = "black", lwd = 2)
##############################################################################
# Since panel attrition is a big problem for our data, we are checking our
# results for robustness.
b <- which(data.d$status == "graduate" | data.d$status == "dropout")
AUC <- numeric(10)
# Save the values in an array:
AUC_all <- array(numeric(3*5*20), dim = c(3, 5, 20))
dimnames(AUC_all) <- list(c("prestudy", "studyrel", "early_study"),
c("Ingenieurwissenschaften", "Mathematik, Naturwissenschaften",
"Rechts-, Wirtschafts- und Sozialwissenschaften",
"Sprach- und Kulturwissenschaften", "General"), 1:20)
# Watch the progress of the calculation:
pb <- txtProgressBar(min = 0, max = 3*5*20, style = 3)
# We use 20 repetitions, to reduce variance:
for(k in 1:20){
# Loop over the three datasets
for(m in 1:3){
data2 <- get(paste(rownames(AUC_all)[m], "2", sep = ""))
# Loop over the four fields and the general model
for(j in 1:5){
if(j < 5){
a <- which(data2$current_subject == colnames(AUC_all)[j])
data <- data2[a, -c(22,23)]
# Here we just use the observations, that are no final panel dropouts.
# For a second calculation, just using the panel dropouts, set this to "yes".
data <- data[which(data.d$attrition[b[a]] == "no"), ]
}
else{
a <- 1:10010
data <- data2[, -c(22,23)]
}
# Sampling the order of observations:
data <- data[sample(nrow(data)), ]
folds <- cut(seq(1,nrow(data)), breaks=10, labels=F)
for(i in 1:10){
testIndexes <- which(folds==i)
# Model fitting:
model <- cforest(formula = status ~ ., data = data[-testIndexes, ],
control = cforest_unbiased(ntree = 100, maxsurrogate = 5,
mtry = round(sqrt(ncol(data)-1))))
# Out-of-sample prediction with the test data:
pred <- predict(model, newdata = data[testIndexes,], type = "prob")
pred1 <- matrix(numeric(length(testIndexes)*2), ncol = 2)
for(h in 1:length(testIndexes))
pred1[h,] <- pred[[h]]
# Calculation of the AUC:
AUC[i] <- ROCR::performance(prediction(pred1[,2], data$status[testIndexes]),
measure = "auc")@y.values[[1]]
}
# Averaging AUC values over the 10 CV-loops to reduce variance:
AUC_all[m, j, k] <- mean(AUC)
setTxtProgressBar(pb, j+(m-1)*5+(k-1)*15)
}
}
}
# Averaging the AUC values over the 20 repetitions:
apply(AUC_all, c(1,2), mean)