Data Mining Project Code


Final Project
Derek Atwood, Emdadul Haque

April 25, 2016

install.packages('stringi')

library(caret)
library(MASS)
library(ElemStatLearn)
library(pls)
library(glmnet)
library(randomForest)
library(usdm)
library(choroplethr)
library(choroplethrMaps)
library(gridExtra)
library(ggplot2)   #attached by caret in any case, but loaded explicitly since ggplot() is used below

set.seed(123)
dictionary <- read.csv('dictionary.csv')
load('2009.RData')

#Treat missing values and remove variables with >15% missing data
scorecard.2009[scorecard.2009 == 'PrivacySuppressed'] <- NA
data.miss <- function(x) {
  sum(is.na(x)) / length(x) * 100
}
missing <- apply(scorecard.2009, 2, data.miss)
scorecard.2009.reduced <- scorecard.2009[, missing < 15]
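As a quick illustration (toy values, not part of the original run), data.miss reports the percentage of missing entries in a vector:

data.miss(c(1, NA, NA, 4))   #returns 50: two of the four values are NA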

#Remove index variables
dictionary <- dictionary[which(dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced)), ]
scorecard.2009.reduced2 <- scorecard.2009.reduced[, -c(1:5, 15)]
rm(scorecard.2009.reduced)

#Convert appropriate variables to numeric
scorecard.numeric <- as.data.frame(sapply(scorecard.2009.reduced2[, 161:274],
                                          function(x) as.numeric(as.character(x))))
scorecard.2009.reduced3 <- data.frame(scorecard.2009.reduced2[, 1:160], scorecard.numeric)
rm(scorecard.2009.reduced2)
rm(scorecard.numeric)
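An optional sanity check, not in the original script, confirms the conversion by tabulating column classes:

table(sapply(scorecard.2009.reduced3, class))   #counts of factor vs. numeric columns after the conversion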

#Create ratio variable and remove redundant variables concerning debt and earnings
scorecard.2009.reduced3$RATIO <- scorecard.2009.reduced3$md_earn_wne_p6 /
  scorecard.2009.reduced3$GRAD_DEBT_MDN
scorecard.2009.reduced3$md_earn_wne_p6 <- NULL
scorecard.2009.reduced3$GRAD_DEBT_MDN <- NULL
dictionary <- dictionary[which(dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced3)), ]
debt.earning.list <- c(grep('debt', dictionary$NAME.OF.DATA.ELEMENT),
                       grep('earnings', dictionary$NAME.OF.DATA.ELEMENT))
scorecard.2009.reduced4 <- scorecard.2009.reduced3[, -debt.earning.list]
rm(scorecard.2009.reduced3)
dictionary <- dictionary[which(dictionary$VARIABLE.NAME %in% names(scorecard.2009.reduced4)), ]

#Remove institution name and variables with fewer unique values than levels
scorecard.2009.reduced4$INSTNM <- NULL
scorecard.2009.reduced5 <- scorecard.2009.reduced4[, -c(10, 26, 27, 28)]
scorecard.2009.reduced5$CONTROL <- as.factor(as.character(scorecard.2009.reduced5$CONTROL))
rm(scorecard.2009.reduced4)

#Impute missing values with median of non-missing values
RATIO <- scorecard.2009.reduced5$RATIO
scorecard.2009.reduced5$RATIO <- NULL
impute.median <- function(x) {
  if (is.numeric(x)) {
    replace(x, is.na(x), median(x, na.rm = TRUE))
  } else {
    x
  }
}
scorecard.imputed <- data.frame(sapply(scorecard.2009.reduced5, impute.median))
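A toy example (hypothetical values, not part of the original run) shows how impute.median behaves; non-numeric columns pass through unchanged:

impute.median(c(4, NA, 10))   #returns 4 7 10: the NA is replaced by the median of 4 and 10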

#Convert appropriate variables to factors
scorecard.imputed[, 5] <- as.factor(scorecard.imputed[, 5])
scorecard.imputed[, 7] <- as.factor(scorecard.imputed[, 7])
scorecard.imputed[, 8] <- as.factor(scorecard.imputed[, 8])
scorecard.imputed[, 9] <- as.factor(scorecard.imputed[, 9])
scorecard.imputed[, 150] <- as.factor(scorecard.imputed[, 150])
rm(scorecard.2009.reduced5)
scorecard.imputed$RATIO <- RATIO
scorecard.imputed <- na.omit(scorecard.imputed)
RATIO <- scorecard.imputed$RATIO
scorecard.imputed$RATIO <- NULL

#Calculate multicollinearity and remove necessary variables
#correlated <- vifcor(scorecard.imputed)
correlated.list <- c('RPY_3YR_RT_SUPP', 'LO_INC_RPY_3YR_RT_SUPP',
                     'PELL_RPY_3YR_RT_SUPP', 'SEPAR_DT_MDN', 'FEMALE_RPY_3YR_RT_SUPP',
                     'RPY_3YR_RT', 'PAR_ED_PCT_1STGEN', 'NOTFIRSTGEN_RPY_3YR_RT_SUPP',
                     'CIP23BACHL', 'FIRSTGEN_RPY_3YR_RT_SUPP', 'APPL_SCH_PCT_GE3',
                     'CIP27BACHL', 'CIP54BACHL', 'CIP48CERT2')
scorecard.reduced <- scorecard.imputed[!names(scorecard.imputed) %in% correlated.list]
scorecard.reduced$RATIO <- RATIO
rm(scorecard.imputed)
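The hard-coded correlated.list presumably reflects an earlier run of usdm's vifcor (the call is commented out above). A hedged sketch of how it might be regenerated, run before scorecard.imputed is removed, restricted to numeric columns (numeric.cols is an illustrative name), and using the package's default threshold:

numeric.cols <- sapply(scorecard.imputed, is.numeric)
correlated <- vifcor(scorecard.imputed[, numeric.cols], th = 0.9)   #can be slow on ~200 columns
correlated@excluded   #variable names flagged as collinear by vifcor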

#Plot response variable with and without outliers
ratioplot <- data.frame('x' = 1:5154, 'y' = scorecard.reduced$RATIO)
ratio1 <- ggplot(ratioplot) +
  geom_point(aes(x = ratioplot$x, y = ratioplot$y)) +
  labs(x = 'Index', y = 'Earnings To Debt Ratio')
ratioplot.outliers <- data.frame('x' = 1:5130,
                                 'y' = scorecard.reduced$RATIO[scorecard.reduced$RATIO < 10])
ratio2 <- ggplot(ratioplot.outliers) +
  geom_point(aes(x = ratioplot.outliers$x, y = ratioplot.outliers$y)) +
  labs(x = 'Index', y = 'Earnings To Debt Ratio')
grid.arrange(ratio1, ratio2, ncol = 2)

#Remove observations with unreasonably high earnings-to-debt ratio (RATIO >= 10)
scorecard.lowratio <- scorecard.reduced[scorecard.reduced$RATIO < 10, ]
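An optional check, not in the original script: the index plots above imply 24 institutions exceed the cutoff (5154 versus 5130 points), which can be verified directly:

nrow(scorecard.reduced) - nrow(scorecard.lowratio)   #expected to be 24, per the plot data frames above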


#Split the data into 70% training and 30% testing sets
index <- createDataPartition(scorecard.lowratio$RATIO, p = 0.7, list = F)
training <- scorecard.lowratio[index, ]
testing <- scorecard.lowratio[-index, ]
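Worth noting, though not stated in the original: for a numeric outcome, createDataPartition samples within quantile groups, so the two sets should have similar RATIO distributions. An optional check:

summary(training$RATIO)   #distribution of the response in the training set
summary(testing$RATIO)    #should look similar in the test set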

#Lasso model
x <- model.matrix(RATIO ~ ., training)[, -1]
y <- training$RATIO
lasso.fit <- glmnet(x, y, alpha = 1, family = 'gaussian')
cv.out <- cv.glmnet(x, y, alpha = 1)
bestlam <- cv.out$lambda.min
x2 <- model.matrix(RATIO ~ ., testing)[, -1]
y2 <- testing$RATIO
lasso.pred <- predict(lasso.fit, s = bestlam, newx = x2)
1 - mean((lasso.pred - testing$RATIO)^2) / var(testing$RATIO)

## [1] 0.6409248
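The last line of the lasso block reports test-set R-squared, 1 - MSE/Var(y). A small helper, added here for convenience rather than taken from the original script, makes the metric reusable for the models below:

test.r2 <- function(pred, actual) {
  1 - mean((pred - actual)^2) / var(actual)   #proportion of test-set variance explained
}
test.r2(lasso.pred, testing$RATIO)   #should match the value printed above (about 0.64)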

#Random Forest with various numbers of predictors per tree
p <- 223; p2 <- p/2; psq <- sqrt(p); p4 <- p/4
random.p3 <- randomForest(RATIO ~ ., data = training, ntree = 500)
random.p2 <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = p2)
random.p4 <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = p4)
random.psq <- randomForest(RATIO ~ ., data = training, ntree = 500, mtry = psq)
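One note, a randomForest default rather than something stated in the original: for regression the default mtry is floor(p/3), so random.p3 uses roughly a third of the predictors per split while the explicit fits cover p/2, p/4, and sqrt(p).

floor(p / 3)   #74; the default mtry, assuming p matches the model's predictor count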

#Comparison of train mean squared error for various numbers of predictors
#(the mse component of a randomForest fit is the out-of-bag MSE after each additional tree)
df.plot <- data.frame('trees' = 1:500, 'p3' = random.p3$mse, 'p2' = random.p2$mse,
                      'psq' = random.psq$mse, 'p4' = random.p4$mse)
ggplot(df.plot, aes(x = trees)) +
  geom_path(aes(x = trees, y = p2, colour = 'red')) +
  geom_path(aes(x = trees, y = p3, colour = 'blue')) +
  geom_path(aes(x = trees, y = p4, colour = 'green')) +
  geom_path(aes(x = trees, y = psq, colour = 'cyan')) +
  labs(x = 'Number Of Trees', y = 'Train MSE') +
  scale_colour_identity(guide = 'legend', name = 'variables per tree',
                        labels = c('p/3', 'sqrt(p)', 'p/4', 'p/2'))


#Final random forest model uses one half of the predictors per tree
random.final <- randomForest(RATIO ~ ., data = training, ntree = 200, mtry = p2)
final.preds <- predict(random.final, newdata = testing)
1 - mean((final.preds - testing$RATIO)^2) / var(testing$RATIO)

## [1] 0.7630928

#Predictions were averaged by state
#Choropleth maps for predicted and actual test data
state.abbs <- factor(testing$STABBR)
levels(state.abbs) <- levels(scorecard.2009$STABBR)
testing.df <- data.frame('region' = state.abbs, 'value' = testing$RATIO)
testing.df <- data.frame(aggregate(testing.df$value ~ testing.df$region, FUN = mean))
names(testing.df) <- c('region', 'value')
testing.df$region <- state.name[match(testing.df$region, state.abb)]
testing.df <- na.omit(testing.df)
testing.df$region <- tolower(testing.df$region)
choro1 <- state_choropleth(testing.df, title = 'Median Earnings To Debt Ratio By State Actual',
                           legend = 'Earnings to Debt', num_colors = 3)
pred.df <- data.frame('region' = state.abbs, 'value' = final.preds)
pred.df <- data.frame(aggregate(pred.df$value ~ pred.df$region, FUN = mean))
names(pred.df) <- c('region', 'value')
pred.df$region <- state.name[match(pred.df$region, state.abb)]
pred.df <- na.omit(pred.df)
pred.df$region <- tolower(pred.df$region)
choro2 <- state_choropleth(pred.df, title = 'Median Earnings To Debt Ratio By State Predicted',
                           legend = 'Earnings to Debt', num_colors = 3)
grid.arrange(choro1, choro2, ncol = 2)