Student Dropout Prediction

Created by Ramses Alexander Coraspe Valdez

Created on August 5, 2020

Installing libraries

In [32]:
# install.packages("genalg")
# install.packages("tidyverse")
# install.packages("cluster")
# install.packages("factoextra")
# install.packages("corrplot")
# install.packages("party")
# install.packages("neuralnet")

Importing libraries

In [33]:
library(genalg)
library(ggplot2)
library(tidyverse)
library(cluster)    
library(factoextra) 
library(neuralnet)

Building the dataset

In [34]:
# Percentage of absences per student over columns r1:r2.
#
# Args:
#   data_r: list of per-student matrices whose cells are attendance
#           counts; a cell equal to 0 means an absence.
#   r1, r2: first and last column (e.g. months of a semester) to use.
#   total_sessions: denominator turning the absence count into a
#           percentage (192 in the original data set — TODO confirm).
#
# Returns a numeric vector with one rounded percentage per student.
# Fixes: the original looped over the global `asistencias.totales`
# instead of the `data_r` argument and hard-coded 1000 students.
get_data_assistance <- function(data_r, r1, r2, total_sessions = 192) {
  vapply(data_r, function(m) {
    round(sum(m[, r1:r2] == 0) * 100 / total_sessions)
  }, numeric(1))
}

# Library-usage score per student over columns r1:r2.
#
# A usage threshold is derived from the whole cohort (rounded total usage
# divided by the number of students, then by the window width); each
# student's score is the number of cells at or above that threshold.
#
# Fixes: hard-coded cohort size (1000) and window width (6) are now
# derived from the inputs; the accumulation loops are vectorized.
get_data_Biblio <- function(data_r, r1, r2) {
  totals <- vapply(data_r, function(m) sum(m[, r1:r2]), numeric(1))
  threshold <- (round(sum(totals)) / length(data_r)) / (r2 - r1 + 1)
  vapply(data_r, function(m) sum(m[, r1:r2] >= threshold), integer(1))
}
# Learning-platform usage score per student over columns r1:r2.
# Same scheme as get_data_Biblio: a cohort-wide threshold, then a count
# of cells at or above it per student.
#
# Fixes: hard-coded cohort size (1000) and window width (6) are now
# derived from the inputs; the accumulation loops are vectorized.
get_data_plataforma <- function(data_r, r1, r2) {
  totals <- vapply(data_r, function(m) sum(m[, r1:r2]), numeric(1))
  threshold <- (round(sum(totals)) / length(data_r)) / (r2 - r1 + 1)
  vapply(data_r, function(m) sum(m[, r1:r2] >= threshold), integer(1))
}
# Book-reservation score per student over columns r1:r2.
# Same scheme as get_data_Biblio: a cohort-wide threshold, then a count
# of cells at or above it per student.
#
# Fixes: hard-coded cohort size (1000) and window width (6) are now
# derived from the inputs; the accumulation loops are vectorized.
get_data_libros <- function(data_r, r1, r2) {
  totals <- vapply(data_r, function(m) sum(m[, r1:r2]), numeric(1))
  threshold <- (round(sum(totals)) / length(data_r)) / (r2 - r1 + 1)
  vapply(data_r, function(m) sum(m[, r1:r2] >= threshold), integer(1))
}
# Payment-status flag per student.
#
# Averages the first two columns of each student's payment matrix and
# thresholds at 1.875: below -> 0 (payment alarm, more than one delay),
# at or above -> 1 (at most one delay, no alarm).
#
# Returns a one-column matrix with row names "1".."n", matching the
# original layout. Fixes: hard-coded 1000 students and the O(n^2)
# row-by-row rbind() growth.
get_data_pagos <- function(data_r) {
  means <- vapply(data_r, function(m) mean(m[, 1:2]), numeric(1))
  status <- matrix(ifelse(means < 1.875, 0, 1), ncol = 1)
  rownames(status) <- seq_along(data_r)
  status
}

# Rounded mean exam score per student over columns r1:r2.
#
# Returns a one-column matrix with row names "1".."n", matching the
# original layout. Fixes: hard-coded 1000 students and the O(n^2)
# row-by-row rbind() growth.
get_data_examenes <- function(data_r, r1, r2){
  scores <- vapply(data_r, function(m) round(mean(m[, r1:r2])), numeric(1))
  out <- matrix(scores, ncol = 1)
  rownames(out) <- seq_along(data_r)
  out
}
# Rounded mean coursework score per student over columns r1:r2.
#
# Returns a one-column matrix with row names "1".."n", matching the
# original layout. Fixes: hard-coded 1000 students and the O(n^2)
# row-by-row rbind() growth.
get_data_trabajos <- function(data_r, r1, r2){
  scores <- vapply(data_r, function(m) round(mean(m[, r1:r2])), numeric(1))
  out <- matrix(scores, ncol = 1)
  rownames(out) <- seq_along(data_r)
  out
}
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
In [35]:
# Assemble the integrated data set: one row per student, one engineered
# feature pair (semester 1 / semester 2) per source file. Each load()
# brings a list/data object into the global environment — assumes each
# .R file defines the object named below (TODO confirm file contents).

# Attendance: percentage of absences per semester (columns 1-6 and 7-12).
load("AsistenciasTotales.R")
f_as_1 <-  get_data_assistance(asistencias.totales, 1, 6)
f_as_2 <-  get_data_assistance(asistencias.totales, 7, 12)

# Student profile: recode genero to 0/1 (factor level 2 becomes 0) and
# coerce ordinal columns to numeric so they can be clustered/scaled.
load("perfilAlumnos.R")
perfil.alumnos$genero <- as.numeric(perfil.alumnos$genero)
perfil.alumnos[perfil.alumnos$genero==2,]$genero <- 0
perfil.alumnos$evalucion.socioeconomica <- as.numeric(perfil.alumnos$evalucion.socioeconomica)
perfil.alumnos$edad.ingreso <- as.numeric(perfil.alumnos$edad.ingreso)
data_set <- cbind(perfil.alumnos, f_as_1, f_as_2)

# Mean exam score per semester.
load("ResultadosExamenes.R")
f_examenes_1 <- get_data_examenes(resultados.examenes.totales, 1,6) 
f_examenes_2 <- get_data_examenes(resultados.examenes.totales, 7,12)
data_set <- cbind(data_set, f_examenes_1, f_examenes_2)

# Mean coursework score per semester.
load("ResultadoTrabajos.R")
f_trabajos_1 <- get_data_trabajos(resultados.trabajos.totales, 1, 6)
f_trabajos_2 <- get_data_trabajos(resultados.trabajos.totales, 7, 12)
data_set <- cbind(data_set, f_trabajos_1, f_trabajos_2)

# Library-usage score per semester.
load("UsoBiblioteca.R")
f_bibl_1 <-  get_data_Biblio(uso.biblioteca.totales, 1, 6)
f_bibl_2 <-  get_data_Biblio(uso.biblioteca.totales, 7, 12)
data_set <- cbind(data_set, f_bibl_1, f_bibl_2)

# Learning-platform usage score per semester.
load("UsoPlataforma.R")
f_plat_1 <- get_data_plataforma(uso.plataforma.totales, 1, 6)
f_plat_2 <- get_data_plataforma(uso.plataforma.totales, 7, 12)
data_set <- cbind(data_set, f_plat_1, f_plat_2)

# Book-reservation score per semester.
load("ApartadoDeLibros.R")
f_libros_1 <- get_data_libros(separacion.libros.totales, 1, 6)
f_libros_2 <- get_data_libros(separacion.libros.totales, 7, 12)
data_set <- cbind(data_set, f_libros_1, f_libros_2)

# Scholarship indicator (used as-is).
load("Becas.R")
data_set <- cbind(data_set, distribucion.becas)

# Payment-status flag (0 = payment alarm, 1 = no alarm).
load("HistorialPagos.R")
f_pagos_status <-  get_data_pagos(registro.pagos)
data_set <- cbind(data_set, f_pagos_status)

# Career-change indicator; final integrated frame.
load("CambioCarrera.R")
datos.integrados.R <- cbind(data_set, cambio.carrera)
In [36]:
# Persist the integrated data set and re-read it; the CSV round-trip
# flattens any matrix columns into plain numeric columns. read.csv adds
# an index column "X", which is dropped.
write.csv(datos.integrados.R,"datos.integrados.csv", row.names = TRUE)
datos.integrados <- read.csv("datos.integrados.csv")
rownames(datos.integrados) <- 1:1000
datos.integrados$X <- NULL
In [37]:
# Random ~90/10 split of the 1000 students: ind == 0 -> training pool,
# ind == 1 -> held-out test set. Fixed seed keeps the split reproducible.
set.seed(3)
ind <- sample(x=c(0,1),size=nrow(datos.integrados),replace=TRUE,prob = c(0.9,0.1))
Training.set <- datos.integrados[ind==0,]
Test.set <- datos.integrados[ind==1,]
# Keep an un-normalized copy of the test set for the final report.
Test.set.aux <- Test.set
In [38]:
head(Training.set,20)
A data.frame: 20 × 22
generoadmision.letrasadmision.numerospromedio.preparatoriaedad.ingresoevalucion.socioeconomicanota.conductaf_as_1f_as_2f_examenes_1f_trabajos_2f_bibl_1f_bibl_2f_plat_1f_plat_2f_libros_1f_libros_2distribucion.becasf_pagos_statuscambio.carrera
<int><dbl><dbl><dbl><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
1060.0937335.1874670.2811918416 7 81411202020010
2059.0787433.1574767.236211741510 91413212121000
3053.1433521.2866960.000001541312 81312101010000
4157.0041629.0083261.0124816414 3 51514434343010
5061.4727337.9454574.418181841610 91312111111010
6061.9489738.8979475.846911841611 31413323232000
7053.9596222.9192460.0000015413 1 31516565656000
8058.1816231.3632464.5448617415 2111513424242010
9151.8666418.7332760.0000014412 1 71213010101010
10058.7176132.4352266.1528217415 1 31514434343010
11165.5089046.0178086.5266920418 5 51213010101010
12063.7789142.5578281.3367219417 5121312101010000
13058.8088332.6176666.4265017415 4 51611515151010
14064.9372244.8744584.8116719417 9 31312111111010
15063.7069542.4139081.1208519417 9 41613424242010
16060.4467435.8934771.3402118416 3 51412434343010
17055.2252825.4505660.0000016414 9 41212000000010
18059.0242533.0485067.0727417415 2 31513444444000
19064.6276144.2552183.8828219417 8161212000000010
20162.4148939.8297977.2446818316 6 31513444444000
In [39]:
head(Test.set,100)
A data.frame: 100 × 22
generoadmision.letrasadmision.numerospromedio.preparatoriaedad.ingresoevalucion.socioeconomicanota.conductaf_as_1f_as_2f_examenes_1f_trabajos_2f_bibl_1f_bibl_2f_plat_1f_plat_2f_libros_1f_libros_2distribucion.becasf_pagos_statuscambio.carrera
<int><dbl><dbl><dbl><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
28055.6392126.27841 60.000001641411221212000000010
54070.6888456.37767100.0000022420 1 31516555555000
56063.9317142.86342 81.7951419417 6 61412212121000
73154.7577724.51554 60.0000015413 1 61415343434010
74058.9074832.81496 66.7224517415 6 81512424242010
85062.2012139.40242 76.6036318416 4121511404040010
90059.6302234.26044 68.8906617415 6 51214232323010
115164.6214744.24293 83.8644019317 1 31414343434010
161163.2658341.53166 79.7974919317 8 51213121212010
198053.2474321.49485 60.0000015413 1 31714656565000
216153.2577421.51549 60.0000015313 1 31615555555000
227060.4865635.97312 71.4596718416 1 31513535353010
248158.2106031.42120 64.6318017215 4 31412323232110
260154.9354824.87096 60.0000015313 1 61413232323010
262062.3887039.77740 77.1661118416 6 71414333333010
265064.9017744.80354 84.7053019417 4 41512424242010
273157.5942330.18845 62.7826817415 8 41213121212010
274061.1136837.22735 73.3410318416 4 31314343434010
288058.7133832.42675 66.1401317415 9121212000000000
297165.7789946.55797 87.3369620218 1 81513434343111
303056.8465728.69315 60.539721641412 41513424242000
310154.1197723.23953 60.000001531322101411303030000
316069.9821754.96434 99.9465221419 4 81314141414000
319073.2224461.44488100.0000023420 4 41516464646000
327058.8406332.68125 66.5218817415 4 31514454545010
329060.1115535.22311 70.3346618416 3 31215151515010
330160.2223435.44467 70.6670118116 1 31412434343101
336158.2787431.55748 64.8362217315 6 31214252525010
353051.9745718.94915 60.0000014412 4121214040404010
360170.0604155.12082100.0000022320 1 31616666666000
732149.9573814.9147760.0000013311 4 71614535353010
737155.5398326.0796760.0000016114 1 31513444444111
741058.3742731.7485365.1228017415 1 31314353535000
759157.9698230.9396363.9094517415 1101214030303010
796052.1996119.3992260.0000014412 1 31516464646000
798048.2947211.5894460.0000013411 9 81312202020000
801049.3746613.7493160.0000013411 4111211111111010
806156.3676427.7352960.0000016414 7 81412212121000
813063.7292042.4583981.1875919417 1 41412313131010
817163.7824642.5649181.3473719217 1 31311313131110
819056.3334227.6668460.0000016414 7141413222222010
826059.8954134.7908269.6862317415 7 61213121212010
833063.9266542.8533081.7799519417 4 31314353535010
842062.2715939.5431876.8147718416 4 31512434343000
844057.2345729.4691461.7037016414 1 31615656565000
857155.7818426.5636960.0000016214 4 71212010101111
858165.1246445.2492785.3739120418 9131313121212000
867165.5873946.1747986.7621820318 1 41413232323000
899057.9174330.8348663.752281741514161212000000010
910063.4682941.9365880.4048719417 3 31315252525010
917165.7761046.5522087.3282920318 3 31412333333000
924051.8114018.6228160.0000014412 4 31414242424010
929066.4224347.8448789.2673020418 4 51313232323010
937163.6213842.2427580.8641319217 4 31415252525110
949064.4338543.8676983.3015419417 6 41313323232000
958157.9234330.8468663.7702917115 3 31512434343101
963069.3891353.7782698.1673821419 7 51411202020010
973057.1096729.2193461.3290116414 6 91312101010010
981162.7080440.4160878.124121931716 91312101010010
987062.0814939.1629876.2444718416 6 41612515151010
In [40]:
# Quick visual check of the raw feature scales before normalization.
boxplot(Training.set, col = "lightblue")
# Alternative normalizations tried during exploration (kept for reference):
# Training.set <- apply(Training.set , MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
# Training.set <- as.data.frame(scale(Training.set))
# boxplot(Training.set,col = "lightblue")

Checking for correlations between variables

In [41]:
# NOTE(review): sourcing a remote script over plain http executes
# unverified code — consider vendoring rquery_cormat.r into the repo.
source("http://www.sthda.com/upload/rquery_cormat.r")

# Flattened pairwise correlation table to spot redundant features.
col<- colorRampPalette(c("blue", "white", "red"))(20)
rquery.cormat(Training.set, type="flatten", col=col)
$r
A data.frame: 231 × 4
rowcolumncorp
<fct><fct><dbl><dbl>
genero distribucion.becas 0.55000 1.0e-72
genero cambio.carrera 0.41000 1.2e-38
distribucion.becascambio.carrera 0.750001.8e-164
genero f_pagos_status -0.01500 6.6e-01
distribucion.becasf_pagos_status -0.03400 3.0e-01
cambio.carrera f_pagos_status -0.00720 8.3e-01
genero f_as_1 0.06700 4.6e-02
distribucion.becasf_as_1 0.04900 1.4e-01
cambio.carrera f_as_1 0.03300 3.2e-01
f_pagos_status f_as_1 0.06100 6.6e-02
genero f_as_2 0.08000 1.7e-02
distribucion.becasf_as_2 0.04600 1.7e-01
cambio.carrera f_as_2 -0.00039 9.9e-01
f_pagos_status f_as_2 0.12000 3.2e-04
f_as_1 f_as_2 0.46000 4.7e-48
genero f_libros_1 -0.05600 9.4e-02
distribucion.becasf_libros_1 -0.01500 6.5e-01
cambio.carrera f_libros_1 0.00710 8.3e-01
f_pagos_status f_libros_1 -0.32000 6.2e-23
f_as_1 f_libros_1 -0.46000 8.9e-48
f_as_2 f_libros_1 -0.52000 1.4e-64
genero f_bibl_1 -0.05600 9.4e-02
distribucion.becasf_bibl_1 -0.01500 6.5e-01
cambio.carrera f_bibl_1 0.00710 8.3e-01
f_pagos_status f_bibl_1 -0.32000 6.2e-23
f_as_1 f_bibl_1 -0.46000 8.9e-48
f_as_2 f_bibl_1 -0.52000 1.4e-64
f_libros_1 f_bibl_1 1.00000 0.0e+00
genero f_plat_1 -0.05600 9.4e-02
distribucion.becasf_plat_1 -0.01500 6.5e-01
f_libros_2 edad.ingreso -0.00280.93
f_bibl_2 edad.ingreso -0.00280.93
f_plat_2 edad.ingreso -0.00280.93
f_examenes_2 edad.ingreso 0.00930.78
f_trabajos_2 edad.ingreso 0.01100.74
evalucion.socioeconomicaedad.ingreso -0.00670.84
promedio.preparatoria edad.ingreso 0.94000.00
admision.letras edad.ingreso 0.99000.00
admision.numeros edad.ingreso 0.99000.00
genero nota.conducta 0.04200.21
distribucion.becas nota.conducta 0.01300.69
cambio.carrera nota.conducta-0.00410.90
f_pagos_status nota.conducta 0.05400.10
f_as_1 nota.conducta-0.01100.74
f_as_2 nota.conducta 0.01400.67
f_libros_1 nota.conducta-0.03600.28
f_bibl_1 nota.conducta-0.03600.28
f_plat_1 nota.conducta-0.03600.28
f_examenes_1 nota.conducta-0.03300.33
f_trabajos_1 nota.conducta-0.03300.32
f_libros_2 nota.conducta-0.00470.89
f_bibl_2 nota.conducta-0.00470.89
f_plat_2 nota.conducta-0.00470.89
f_examenes_2 nota.conducta 0.00820.81
f_trabajos_2 nota.conducta 0.00970.77
evalucion.socioeconomicanota.conducta-0.00850.80
promedio.preparatoria nota.conducta 0.94000.00
admision.letras nota.conducta 0.99000.00
admision.numeros nota.conducta 0.99000.00
edad.ingreso nota.conducta 1.00000.00
$p
NULL
$sym
NULL

Removing highly correlated features

In [42]:
# Drop the features that the correlation table showed to be (near-)
# perfectly correlated with retained columns.
Training.set[c("nota.conducta",
               "f_plat_1", "f_plat_2",
               "f_libros_1", "f_libros_2",
               "f_trabajos_1", "f_trabajos_2")] <- NULL

Labeling the students using Clustering Analysis with k-means

In [43]:
# K-means clustering for k = 2..12. The original repeated the same
# four-line chunk twelve times; one loop produces exactly the same
# objects (kplotK and plotK in the global environment) and re-seeds
# before every kmeans() call so each run draws the same random starts
# as the original code did.
pal_color <- "simpsons"

for (k in 2:12) {
  set.seed(123)
  km <- kmeans(Training.set, centers = k, iter.max = 25, nstart = 1)
  assign(paste0("kplot", k), km)
  p <- fviz_cluster(km, data = Training.set, ellipse.type = "convex",
                    palette = pal_color, ggtheme = theme_minimal())
  assign(paste0("plot", k), p)
  print(p)  # the notebook auto-printed each plotK; print() keeps that
}
In [44]:
# Elbow plot (total within-cluster sum of squares) to guide the choice of k.
set.seed(123)
fviz_nbclust(Training.set, kmeans, method = "wss")
In [45]:
# Cluster sizes and centroids for the chosen solution (k = 12).
kplot12$size
kplot12$centers
  1. 61
  2. 79
  3. 142
  4. 43
  5. 40
  6. 123
  7. 65
  8. 68
  9. 32
  10. 115
  11. 102
  12. 30
A matrix: 12 × 15 of type dbl
generoadmision.letrasadmision.numerospromedio.preparatoriaedad.ingresoevalucion.socioeconomicaf_as_1f_as_2f_examenes_1f_examenes_2f_bibl_1f_bibl_2distribucion.becasf_pagos_statuscambio.carrera
10.459016458.0602231.1204464.2275216.754103.29508212.491803 9.78688512.7868912.737701.1967211.3278690.245901640.83606560.14754098
20.443038061.1127237.2254473.3381517.987343.32911410.53164610.10126612.8227812.746841.1898731.3924050.227848100.78481010.08860759
30.323943762.6477140.2954177.9431218.563383.619718 2.852113 4.25352113.9154913.584512.9788733.5000000.105633800.69718310.05633803
40.418604754.9543324.9086660.0916415.465123.58139510.372093 5.53488413.1627912.813951.6279071.6046510.116279070.76744190.09302326
50.450000050.2028215.4056460.0000013.575003.550000 3.750000 5.22500013.7000013.350002.6250003.0750000.100000000.67500000.07500000
60.414634159.4633333.9266668.3899917.260163.365854 3.349593 4.40650413.7561013.626022.7723583.2357720.211382110.71544720.13821138
70.276923153.7967822.5935660.0000015.030773.646154 2.723077 4.16923114.0307713.476923.1076923.2307690.076923080.64615380.04615385
80.470588264.8006244.6012584.4018719.411763.51470611.01470610.67647112.7794112.558821.0147061.1029410.117647060.70588240.08823529
90.437500070.5322356.0644598.4007521.812503.500000 6.718750 6.40625013.1875013.000001.8750002.2187500.125000000.81250000.06250000
100.373913056.9140828.8281761.4639416.313043.391304 2.600000 4.83478313.8608713.713042.9565223.4086960.217391300.70434780.13043478
110.450980465.9617346.9234687.8851919.882353.401961 2.539216 4.21568613.7156913.784312.7745103.3921570.205882350.73529410.13725490
120.500000053.4001321.8002560.0000014.933333.13333311.20000013.66666712.5666712.700000.8000001.0666670.266666670.60000000.20000000

Insights about the chosen K (K=12)

The students will be separated in two groups:

Students at risk of dropping out

Students who do not drop out

Lets clarify some features of this analysis:

  1. The variable "f_as_1" is the percentage of absences of the student in semester 1, and "f_as_2" is the percentage of absences in semester 2. If the value is very high, the student has been absent many times in that semester, which is an indicator of possible dropout; clusters 1, 2, 8 and 12 have the highest values for these variables.

  2. The variable "f_examenes_1" is the average of the student's exam scores in semester 1, and "f_examenes_2" is the average of the student's exam scores in semester 2. In the previous table it can be seen that both are lowest for clusters 1, 2, 8 and 12; these are very clear indicators of a possible risk of dropout.

  3. The variable "f_bibl_1" is the student's library-usage factor in semester 1, and "f_bibl_2" is the library-usage factor in semester 2. Both variables reflect how frequently a student uses the university's educational resources and their interest in studying and researching. A very low value represents a possible dropout case; clusters 1, 2, 8 and 12 have the lowest values for these variables.

In [46]:
# Label each training row from its k-means cluster: clusters 1, 2, 8 and
# 12 showed high absence, low exam scores and low library use in the
# centroid table, so their members are flagged as dropout risks (1);
# members of every other cluster get 0.
# Replaces the temporary cluster column and twelve per-cluster
# assignments with a single vectorized membership test.
Training.set$dropout <- ifelse(kplot12$cluster %in% c(1, 2, 8, 12), 1, 0)
In [47]:
# Class balance after labeling: dropout-risk count, then non-dropout count.
sum(Training.set$dropout==1)
sum(Training.set$dropout==0)
238
662

Data Normalization

In [48]:
# Normalize all columns: z-score (scale) followed by min-max to [0, 1].
test_rows <- rownames(Test.set)

Training.set <- as.data.frame(scale(Training.set))
# NOTE(review): Test.set is standardized/rescaled with its own statistics
# rather than the training set's — confirm this is intended, since it can
# shift the test distribution relative to training.
Test.set <- as.data.frame(scale(Test.set))

# Min-max rescale a numeric vector to [0, 1].
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

Training.set <- as.data.frame(lapply(Training.set, normalize))
Test.set <- as.data.frame(lapply(Test.set, normalize)) 

# lapply() dropped the row names; restore the original student ids.
rownames(Test.set) <- test_rows

Splitting the 900 students in two groups, Training (700) and Evaluation (200)

In [49]:
# Shuffle the ~900 labeled rows with a fixed seed, then split them into
# a 700-row training set and a 200-row evaluation set.
set.seed(12345)

rows <- sample(nrow(Training.set))
Complete.set <- Training.set[rows, ]

Training.set   <- tail(Complete.set, 700)
Evaluation.set <- head(Complete.set, 200)

# Sanity check of the split sizes.
nrow(Training.set)
nrow(Evaluation.set)
700
200

Creating a model using ANN to predict possible dropout students

Training the model with the 700 students of the Training set

In [50]:
# Feed-forward network: 15 input features -> hidden layers of 10, 5 and
# 3 units -> one output predicting the dropout label. With
# linear.output = FALSE the logistic activation is also applied to the
# output node (classification rather than regression).
nn=neuralnet(dropout~genero+admision.letras+admision.numeros+promedio.preparatoria+edad.ingreso+evalucion.socioeconomica+f_as_1+f_as_2+f_examenes_1+f_examenes_2+f_bibl_1+f_bibl_2+distribucion.becas+f_pagos_status+cambio.carrera, 
             data=Training.set, 
             hidden=c(10,5,3), 
             act.fct = "logistic", 
             linear.output = FALSE,
             stepmax=10^5,threshold = 0.01)
In [51]:
plot(nn)

Testing the model with the 200 students of the Evaluation set

In [52]:
# Score the evaluation set: keep only the model's input columns, run the
# network, and pair the predictions with the true labels.
temp_test <- subset(Evaluation.set, select = c("genero","admision.letras","admision.numeros","promedio.preparatoria","edad.ingreso","evalucion.socioeconomica","f_as_1","f_as_2","f_examenes_1","f_examenes_2","f_bibl_1","f_bibl_2","distribucion.becas","f_pagos_status","cambio.carrera"))
nn.results <- compute(nn, temp_test)
results <- data.frame(actual = Evaluation.set$dropout, prediction = nn.results$net.result)

94% accuracy when predicting on the evaluation set

In [53]:
# Round predictions to hard 0/1 labels, build the confusion matrix and
# compute overall accuracy.
# Fixes: drops attach() (which masked roundedresultsdf's columns and
# produced the "objects are masked" warnings) and no longer shadows
# base::diag() with a local variable named `diag`.
roundedresults <- sapply(results, round, digits = 0)
roundedresultsdf <- data.frame(roundedresults)
confusion_table <- table(actual = roundedresultsdf$actual,
                         prediction = roundedresultsdf$prediction)
confusion_table
accuracy <- sum(diag(confusion_table)) / sum(confusion_table)
accuracy
The following objects are masked from roundedresultsdf (pos = 3):

    actual, prediction


      prediction
actual   0   1
     0 145   4
     1   8  43
0.94

Testing the model with the 100 students of the Test set

In [54]:
# Score the held-out test set and keep the row names (original student
# ids) of every student the network flags as a dropout risk (prediction
# rounds to 1).
temp_test <- subset(Test.set, select = c("genero","admision.letras","admision.numeros","promedio.preparatoria","edad.ingreso","evalucion.socioeconomica","f_as_1","f_as_2","f_examenes_1","f_examenes_2","f_bibl_1","f_bibl_2","distribucion.becas","f_pagos_status","cambio.carrera"))
nn.results <- compute(nn, temp_test)
results <- data.frame(row= rownames(Test.set), prediction = round(nn.results$net.result))
students_prediction_rows <- as.vector(results[results$prediction==1,]$row)

Showing the students most likely to drop out

In [138]:
# Recover the original (un-normalized) feature rows of the flagged
# students and give them a sequential id used by the genetic algorithm.
Test.set.aux <- subset(Test.set.aux, select = c("genero","admision.letras","admision.numeros","promedio.preparatoria","edad.ingreso","evalucion.socioeconomica","f_as_1","f_as_2","f_examenes_1","f_examenes_2","f_bibl_1","f_bibl_2","distribucion.becas","f_pagos_status","cambio.carrera"))
dropout_students <- Test.set.aux[students_prediction_rows,]
# seq_along() replaces seq(1:length(...)): same values here, but the
# idiomatic and zero-length-safe form.
dropout_students$id_alumno <- seq_along(students_prediction_rows)
dropout_students
A data.frame: 16 × 16
generoadmision.letrasadmision.numerospromedio.preparatoriaedad.ingresoevalucion.socioeconomicaf_as_1f_as_2f_examenes_1f_examenes_2f_bibl_1f_bibl_2distribucion.becasf_pagos_statuscambio.carreraid_alumno
<int><dbl><dbl><dbl><int><int><int><int><int><int><int><int><int><int><int><int>
28055.6392126.2784160.000001641122121200010 1
161163.2658341.5316679.79749193 8 5121312010 2
288058.7133832.4267566.14013174 912121200000 3
310154.1197723.2395360.000001532210141130000 4
451165.8198446.6396887.459512032114131413010 5
484064.1934543.3869082.580351941414131210010 6
499155.4527725.9055360.00000163 912121302010 7
569061.8359538.6719075.50785184 9 6121200010 8
698063.2687041.5374079.80610194 513141322010 9
715163.8370842.6741581.5112319311 614153501010
726056.9950728.9901360.98520164 81014132201011
819056.3334227.6668460.00000164 71414132201012
826059.8954134.7908269.68623174 7 612131201013
858165.1246445.2492785.37391204 91313121200014
899057.9174330.8348663.75228174141612120001015
981162.7080440.4160878.1241219316 913121001016

Using a Genetic Algorithm to help prevent student dropout

In [ ]:
# Total budget available for support measures: 10000 USD.
budget.limit <- 10000 

# Catalog of support measures and their unit cost (USD); the order of
# budget.features matches student.features position by position.
items <- data.frame(                    

student.features = c('beca.estudiantil', 'vales.transporte',
                     'consulta.psicologica', 'asesor.individual',
                     'curso.remedial', 'visita.empresa',
                     'platica.motivacional', 'viaje.recreativo',
                     'mentoria', 'comedor.gratuito'),

budget.features = c(500, 100, 
                   300, 200, 
                   1000,30,
                   50, 150,
                   200, 250)                   
)

# Chromosome layout: one gene per (student, support feature) pair.
n_students <- length(students_prediction_rows)
n_features <- nrow(items)

# Cost of one student's gene chunk: the sum of the costs of the selected
# (gene == 1) support features.
#
# Args:
#   g:  0/1 gene vector for one student (only the first nf entries used).
#   b:  per-feature cost vector (items$budget.features).
#   nf: number of features per student.
#
# Vectorized replacement for the original accumulation loop; seq_len()
# also makes nf == 0 return 0 instead of erroring on 1:0.
get_budget_chromosome <- function(g, b, nf){
    idx <- seq_len(nf)
    sum(b[idx][g[idx] == 1])
}

# Penalty for student i given their gene chunk g (selected supports).
#
# Compares the supports encoded in the genes against the student's
# profile (looked up in the global `dropout_students` by id_alumno) and
# adds a fixed penalty for every rule that fires. Returns -1 when no
# rule fires, rewarding fully consistent assignments.
#
# Fixes: `=` assignments to `flag` replaced with `<-`, and the dead
# empty `else { }` branch removed. Rule logic is unchanged.
get_punishment <- function(i, g){

    punish.value <- 0

    # Genes of interest within the student's chunk
    # (positions follow items$student.features).
    beca <- g[1]
    asesor.individual <- g[4]
    curso.remedial <- g[5]          # read but unused by the rules below
    platica.motivacional <-g[7]
    mentoria <- g[9]                # read but unused by the rules below
    comedor.gratuito <- g[10]       # read but unused by the rules below

    # Actual data of the student.
    std <- dropout_students[dropout_students$id_alumno==i,]
    eval.economica <- as.integer(std$evalucion.socioeconomica)
    prom.prepa <- as.double(std$promedio.preparatoria)
    edad <- as.integer(std$edad.ingreso)
    cambio.carrera <- as.integer(std$cambio.carrera)
    f_pagos_status <- as.integer(std$f_pagos_status)
    beca_estudiante <- as.integer(std$distribucion.becas)

    flag <- FALSE  # tracks whether any rule fired

    # Don't grant a scholarship to a student who already has one.
    if(beca == 1 && beca_estudiante == 1){
        punish.value <- punish.value + 50
        flag <- TRUE
    }
    # Young students should receive the motivational talk.
    if(edad < 22 && platica.motivacional == 0){
        punish.value <- punish.value + 20
        flag <- TRUE
    }
    # Career changers should receive the motivational talk.
    if(cambio.carrera == 1 && platica.motivacional == 0){
        punish.value <- punish.value + 10
        flag <- TRUE
    }
    # Payment alarm + worst socioeconomic level + no scholarship.
    if(f_pagos_status==0 && eval.economica == 4 && beca_estudiante == 0){
        punish.value <- punish.value + 50
        flag <- TRUE
    }
    # Good high-school average but needy and without a scholarship.
    if(prom.prepa > 70  && eval.economica == 4 && beca_estudiante == 0){
        punish.value <- punish.value + 60
        flag <- TRUE
    }
    if(prom.prepa <= 70  && eval.economica == 4 && beca_estudiante == 0){
        punish.value <- punish.value + 20
        flag <- TRUE
    }
    # Needy-ish young students should get an individual advisor.
    if(eval.economica == 3 && edad < 22 && asesor.individual == 0){
        punish.value <- punish.value + 20
        flag <- TRUE
    }

    # No rule fired: reward with -1 (lowers the chromosome's total).
    if(!flag) -1 else punish.value

}

# GA fitness for one chromosome `x`.
#
# The chromosome is the concatenation of one n_features-long 0/1 chunk
# per flagged student; each chunk encodes which support items that
# student receives. The score is the accumulated punishment over all
# students (get_punishment), with the budget as a hard constraint.
#
# NOTE(review): genalg's rbga.bin minimizes evalFunc, so lower totals
# are better; over-budget chromosomes return 0, which only dominates
# solutions with negative totals — confirm this interplay is intended.
# punish.value starts at -1, presumably as a baseline offset — verify.
fitness.generic <- function(x) {


    current.budget <- 0
    punish.value <- -1

    iter <- 1
    for(i in 1:n_students) {
        
        # Slice out the genes belonging to student i and accumulate
        # both the cost and the punishment of that assignment.
        student_genes <- x[iter:(n_features * i)]
        current.budget <- current.budget + get_budget_chromosome(student_genes,items$budget.features,n_features)    
        punish.value <- punish.value + get_punishment(i, student_genes)
        iter <- iter + n_features

    }

    # Hard budget constraint: over-budget solutions are scored 0.
    if(current.budget > budget.limit){
        return(0)
    }
    else{
        return(punish.value)
    }
}



# NOTE(review): the bare `?rbga.bin` opens the help page as a side
# effect of running the script — likely leftover from interactive work.
?rbga.bin
# Binary GA over one 0/1 gene per (student, support feature) pair;
# 200 generations, 1% mutation chance, elitism of 50 individuals.
ga.one <- rbga.bin(size=(n_students*n_features),
                   iters=200,
                   mutationChance=0.01,
                   elitism = 50,
                   evalFunc = fitness.generic,
                   verbose = T
)
In [134]:
# Report the best chromosome found: per student, print the student's row
# name and the selected support items, and accumulate the total cost.
Finalbudget <- 0
# Pick the first population member whose evaluation equals the best score.
# NOTE(review): this matches ga.one$evaluations (final population) against
# min(ga.one$best) (per-generation best) — verify the best individual is
# guaranteed to be present in the final population (elitism suggests so).
best <- ga.one$population[ga.one$evaluations == min(ga.one$best),][1,]

iter <- 1
for(i in 1:n_students) {
    
    # Genes of student i within the flat chromosome.
    best_genes <- best[iter:(n_features * i)]
    print(rownames(dropout_students[dropout_students$id_alumno==i,]))
    print(best_genes)
    best.items <- items$student.features[best_genes == 1]
    print(best.items)
    # Dot product of the 0/1 genes with the per-item costs.
    Finalbudget <- Finalbudget + best_genes %*% items$budget.features
    iter <- iter + n_features
}

Finalbudget
[1] "28"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "161"
 [1] 0 0 0 1 0 0 1 0 0 0
[1] asesor.individual    platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "288"
 [1] 0 0 0 0 0 0 1 1 0 0
[1] platica.motivacional viaje.recreativo    
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "310"
 [1] 0 0 0 1 0 1 1 0 0 0
[1] asesor.individual    visita.empresa       platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "451"
 [1] 0 0 0 1 0 0 1 0 0 0
[1] asesor.individual    platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "484"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "499"
 [1] 0 1 0 1 0 0 1 0 0 0
[1] vales.transporte     asesor.individual    platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "569"
 [1] 0 0 0 0 0 1 1 0 0 0
[1] visita.empresa       platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "698"
 [1] 0 0 0 1 0 1 1 0 0 0
[1] asesor.individual    visita.empresa       platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "715"
 [1] 0 0 0 1 1 0 1 0 1 0
[1] asesor.individual    curso.remedial       platica.motivacional
[4] mentoria            
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "726"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "819"
 [1] 0 0 0 1 1 0 1 0 0 0
[1] asesor.individual    curso.remedial       platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "826"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "858"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "899"
 [1] 0 0 0 0 0 0 1 0 0 0
[1] platica.motivacional
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
[1] "981"
 [1] 0 1 0 1 0 0 1 0 1 0
[1] vales.transporte     asesor.individual    platica.motivacional
[4] mentoria            
10 Levels: asesor.individual beca.estudiantil ... visita.empresa
A matrix: 1 × 1 of type dbl
5240
In [131]:
plot(ga.one)