data mining project code
TRANSCRIPT
Final Project - Derek Atwood, Emdadul Haque
April 25, 2016
# ---- Setup: packages, RNG seed and input data ----

# Install stringi only when it is not already available, so that re-running
# the script does not reinstall the package every time.
if (!requireNamespace('stringi', quietly = TRUE)) {
  install.packages('stringi')
}

# NOTE(review): ggplot() is called later without an explicit library(ggplot2);
# presumably it is attached as a dependency of caret -- confirm.
library(caret)
library(MASS)
library(ElemStatLearn)
library(pls)
library(glmnet)
library(randomForest)
library(usdm)
library(choroplethr)
library(choroplethrMaps)
library(gridExtra)

# Fix the RNG seed before any randomised step (data partition, CV, forests).
set.seed(123)

# Data dictionary describing the variables.
dictionary <- read.csv('dictionary.csv')
# Presumably defines `scorecard.2009`, which the rest of the script uses --
# verify against the contents of the .RData file.
load('2009.RData')
# Treat missing values and remove variables with >15% missing data.
# Cells holding the literal string 'PrivacySuppressed' are recoded to NA
# first, so that they count as missing below.
scorecard.2009[scorecard.2009 == 'PrivacySuppressed'] <- NA

# Percentage (0-100) of NA entries in the vector `x`.
data.miss <- function(x) {
  sum(is.na(x)) / length(x) * 100
}

# One missing-percentage per column, named after the column.
missing <- vapply(scorecard.2009, data.miss, FUN.VALUE = numeric(1))

# Keep only the columns with less than 15% missing values.
scorecard.2009.reduced <- scorecard.2009[, missing < 15]
# Remove index variables.
# Restrict the dictionary to the variables that survived the missing-data
# filter (%in% never yields NA, so the logical mask can index directly).
dictionary <- dictionary[
  dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced),
]

# Drop the index columns by position (columns 1-5 and 15).
scorecard.2009.reduced2 <- scorecard.2009.reduced[, -c(1:5, 15)]
rm(scorecard.2009.reduced)
# Convert appropriate variables to numeric.
# Columns 161-274 are converted via their character representation, so that
# a factor yields its printed value rather than its internal level code.
# Values that cannot be parsed as numbers become NA (with a warning).
scorecard.numeric <- as.data.frame(
  vapply(
    scorecard.2009.reduced2[, 161:274],
    function(x) as.numeric(as.character(x)),
    FUN.VALUE = numeric(nrow(scorecard.2009.reduced2))
  )
)

# Re-assemble: untouched columns 1-160 followed by the converted columns.
scorecard.2009.reduced3 <- data.frame(
  scorecard.2009.reduced2[, 1:160],
  scorecard.numeric
)
rm(scorecard.2009.reduced2)
rm(scorecard.numeric)
# Create ratio variable and remove redundant variables concerning debt and
# earnings.
# RATIO is the response: md_earn_wne_p6 divided by GRAD_DEBT_MDN.
scorecard.2009.reduced3$RATIO <- scorecard.2009.reduced3$md_earn_wne_p6 /
  scorecard.2009.reduced3$GRAD_DEBT_MDN

# The two components of RATIO must not remain available as predictors.
scorecard.2009.reduced3$md_earn_wne_p6 <- NULL
scorecard.2009.reduced3$GRAD_DEBT_MDN <- NULL

# Keep the dictionary in step with the remaining columns.
dictionary <- dictionary[
  which(dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced3)),
]

# Row positions of dictionary entries whose description mentions debt or
# earnings.
# NOTE(review): these dictionary row positions are used as column positions
# of the data frame below, which assumes the dictionary rows are in the same
# order as the data columns -- confirm.
debt.earning.list <- c(
  grep('debt', dictionary$NAME.OF.DATA.ELEMENT),
  grep('earnings', dictionary$NAME.OF.DATA.ELEMENT)
)

# Negative indexing with an empty vector would select zero columns, so only
# drop columns when there is something to drop.
if (length(debt.earning.list) > 0) {
  scorecard.2009.reduced4 <-
    scorecard.2009.reduced3[, -debt.earning.list, drop = FALSE]
} else {
  scorecard.2009.reduced4 <- scorecard.2009.reduced3
}
rm(scorecard.2009.reduced3)

dictionary <- dictionary[
  which(dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced4)),
]
# Remove institution name and variables with fewer unique values than levels.
scorecard.2009.reduced4[['INSTNM']] <- NULL

# Columns 10, 26, 27 and 28 are dropped by position.
scorecard.2009.reduced5 <- scorecard.2009.reduced4[, -c(10, 26, 27, 28)]

# Rebuild CONTROL as a factor from its character values.
scorecard.2009.reduced5$CONTROL <- factor(
  as.character(scorecard.2009.reduced5$CONTROL)
)
rm(scorecard.2009.reduced4)
# Impute missing values with median of non-missing values.
# The response is set aside first so that it is never imputed.
RATIO <- scorecard.2009.reduced5$RATIO
scorecard.2009.reduced5$RATIO <- NULL

# Replace the NAs of a numeric vector by the median of its observed values;
# anything that is not numeric is returned unchanged.
impute.median <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}

# NOTE(review): sapply() simplifies the per-column results into one matrix,
# so non-numeric columns do not keep their original type here. The later
# as.factor() conversions and the state relabelling appear to rely on this --
# confirm before replacing sapply() with a type-preserving alternative.
scorecard.imputed <- data.frame(
  sapply(scorecard.2009.reduced5, impute.median)
)
# Convert appropriate variables to factors (columns 5, 7, 8, 9 and 150).
scorecard.imputed[c(5, 7, 8, 9, 150)] <- lapply(
  scorecard.imputed[c(5, 7, 8, 9, 150)],
  as.factor
)
rm(scorecard.2009.reduced5)

# Re-attach the response so that na.omit() drops the same rows from the
# predictors and from the response, then split the two apart again.
scorecard.imputed$RATIO <- RATIO
scorecard.imputed <- na.omit(scorecard.imputed)
RATIO <- scorecard.imputed$RATIO
scorecard.imputed$RATIO <- NULL
# Calculate multicollinearity and remove necessary variables.
# The vifcor() call is left commented out; the variables to remove are listed
# explicitly instead.
#correlated <- vifcor(scorecard.imputed)
correlated.list <- c(
  'RPY_3YR_RT_SUPP',
  'LO_INC_RPY_3YR_RT_SUPP',
  'PELL_RPY_3YR_RT_SUPP',
  'SEPAR_DT_MDN',
  'FEMALE_RPY_3YR_RT_SUPP',
  'RPY_3YR_RT',
  'PAR_ED_PCT_1STGEN',
  'NOTFIRSTGEN_RPY_3YR_RT_SUPP',
  'CIP23BACHL',
  'FIRSTGEN_RPY_3YR_RT_SUPP',
  'APPL_SCH_PCT_GE3',
  'CIP27BACHL',
  'CIP54BACHL',
  'CIP48CERT2'
)

# Keep every column whose name is not in the removal list, then put the
# response back as the last column.
scorecard.reduced <- scorecard.imputed[
  , !(names(scorecard.imputed) %in% correlated.list), drop = FALSE
]
scorecard.reduced$RATIO <- RATIO
rm(scorecard.imputed)
# Plot response variable with and without outliers.
# The index column is derived from the data instead of a hard-coded row
# count, so the plots still work when the number of observations changes.
ratioplot <- data.frame(
  'x' = seq_along(scorecard.reduced$RATIO),
  'y' = scorecard.reduced$RATIO
)
# Column names are referenced inside aes() (not ratioplot$x), so the
# aesthetics are looked up in the data passed to ggplot().
ratio1 <- ggplot(ratioplot) +
  geom_point(aes(x = x, y = y)) +
  labs(x = 'Index', y = 'Earnings To Debt Ratio')

# Same plot restricted to ratios below 10, re-indexed from 1.
ratioplot.outliers <- ratioplot[which(ratioplot$y < 10), , drop = FALSE]
ratioplot.outliers$x <- seq_len(nrow(ratioplot.outliers))
rownames(ratioplot.outliers) <- NULL
ratio2 <- ggplot(ratioplot.outliers) +
  geom_point(aes(x = x, y = y)) +
  labs(x = 'Index', y = 'Earnings To Debt Ratio')

# Show both plots side by side.
grid.arrange(ratio1, ratio2, ncol = 2)
# Remove observations with unreasonably high earnings to debt ratio (>10).
scorecard.lowratio <- scorecard.reduced[scorecard.reduced$RATIO < 10, ]

# 70/30 train/test split on the response.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned; list = FALSE makes createDataPartition() return the row indices
# in a form usable for direct row subsetting.
index <- createDataPartition(scorecard.lowratio$RATIO, p = 0.7, list = FALSE)
training <- scorecard.lowratio[index, ]
testing <- scorecard.lowratio[-index, ]
# Lasso model.
# Design matrix and response for the training set; the first column of the
# model matrix (the intercept) is dropped.
x <- model.matrix(RATIO ~ ., training)[, -1]
y <- training$RATIO

# Fit the lasso path (alpha = 1) and choose lambda by cross-validation.
lasso.fit <- glmnet(x, y, alpha = 1, family = 'gaussian')
cv.out <- cv.glmnet(x, y, alpha = 1)
bestlam <- cv.out$lambda.min

# Same construction for the held-out data.
x2 <- model.matrix(RATIO ~ ., testing)[, -1]
y2 <- testing$RATIO

# Test-set predictions at the selected lambda, followed by
# 1 - MSE / var(response) on the test set.
lasso.pred <- predict(lasso.fit, s = bestlam, newx = x2)
1 - mean((lasso.pred - y2)^2) / var(y2)
## [1] 0.6409248
# Random Forest with various numbers of predictors per tree.
# Candidate mtry values derived from p.
# NOTE(review): p is hard-coded to 223, presumably the number of predictors
# in `training` -- confirm it still matches the data.
p <- 223
p2 <- p / 2
psq <- sqrt(p)
p4 <- p / 4

# The four fits are kept in this order because each one consumes random
# numbers; reordering them would change the results for the fixed seed.
# random.p3 leaves mtry at the randomForest default (presumably p/3 for
# regression, hence the name -- confirm).
random.p3 <- randomForest(RATIO ~ ., data = training, ntree = 500)
random.p2 <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = p2)
random.p4 <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = p4)
random.psq <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = psq)
# Comparison of train mean squared error for various numbers of predictors.
# One row per number of trees (1-500), one column per forest, holding the
# `mse` component of each fitted model.
df.plot <- data.frame(
  'trees' = seq_len(500),
  'p3' = random.p3$mse,
  'p2' = random.p2$mse,
  'psq' = random.psq$mse,
  'p4' = random.p4$mse
)

# Each curve is given a literal colour name and scale_colour_identity() draws
# it in that colour: red = p2, blue = p3, green = p4, cyan = psq.
# NOTE(review): the legend labels are matched to the colours by position,
# which presumably relies on the scale ordering the colour names
# alphabetically (blue, cyan, green, red) -- verify if colours are changed.
ggplot(df.plot, aes(x = trees)) +
  geom_path(aes(x = trees, y = p2, colour = 'red')) +
  geom_path(aes(x = trees, y = p3, colour = 'blue')) +
  geom_path(aes(x = trees, y = p4, colour = 'green')) +
  geom_path(aes(x = trees, y = psq, colour = 'cyan')) +
  labs(x = 'Number Of Trees', y = 'Train MSE') +
  scale_colour_identity(
    guide = 'legend',
    name = 'variables per tree',
    labels = c('p/3', 'sqrt(p)', 'p/4', 'p/2')
  )
# Final random forest model uses one half of the predictors per tree
# (mtry = p2) and 200 trees.
random.final <- randomForest(
  RATIO ~ .,
  data = training,
  ntree = 200,
  mtry = p2
)

# Predictions on the held-out data, followed by 1 - MSE / var(response) on
# the test set.
final.preds <- predict(random.final, newdata = testing)
1 - mean((final.preds - testing$RATIO)^2) / var(testing$RATIO)
## [1] 0.7630928
# Predictions were averaged by state.
# Choropleth maps for predicted and actual test data.

# Recover the state abbreviation of every test observation.
# Relabelling factor(testing$STABBR) by position is unsafe: factor() drops
# the values that do not occur in the test set, so every later state would
# receive the wrong label. Each value is therefore translated individually.
# NOTE(review): testing$STABBR presumably holds integer codes that index
# levels(scorecard.2009$STABBR); values that already are abbreviations are
# used as they are -- confirm.
state.levels <- levels(scorecard.2009$STABBR)
state.raw <- as.character(testing$STABBR)
if (!all(state.raw %in% state.levels)) {
  state.raw <- state.levels[as.integer(state.raw)]
}
state.abbs <- factor(state.raw, levels = state.levels)
# Fail loudly rather than silently drawing a mislabelled map.
stopifnot(!anyNA(state.abbs))

# Actual test-set ratios: mean per state.
testing.df <- data.frame('region' = state.abbs, 'value' = testing$RATIO)
testing.df <- data.frame(aggregate(value ~ region, data = testing.df, FUN = mean))
names(testing.df) <- c('region', 'value')
# Translate abbreviations into full state names; abbreviations not present in
# state.abb become NA and are dropped. Names are lower-cased before being
# passed to state_choropleth().
testing.df$region <- state.name[match(testing.df$region, state.abb)]
testing.df <- na.omit(testing.df)
testing.df$region <- tolower(testing.df$region)
choro1 <- state_choropleth(
  testing.df,
  title = 'Median Earnings To Debt Ratio By State Actual',
  legend = 'Earnings to Debt',
  num_colors = 3
)

# Predicted test-set ratios: same aggregation applied to the forest output.
pred.df <- data.frame('region' = state.abbs, 'value' = final.preds)
pred.df <- data.frame(aggregate(value ~ region, data = pred.df, FUN = mean))
names(pred.df) <- c('region', 'value')
pred.df$region <- state.name[match(pred.df$region, state.abb)]
pred.df <- na.omit(pred.df)
pred.df$region <- tolower(pred.df$region)
choro2 <- state_choropleth(
  pred.df,
  title = 'Median Earnings To Debt Ratio By State Predicted',
  legend = 'Earnings to Debt',
  num_colors = 3
)

# Show both maps side by side.
grid.arrange(choro1, choro2, ncol = 2)