python resampling

71

Upload: pydata

Post on 08-Jan-2017

128 views

Category:

Data & Analytics


1 download

TRANSCRIPT

Page 1: Python resampling
Page 2: Python resampling
Page 3: Python resampling
Page 4: Python resampling
Page 5: Python resampling
Page 6: Python resampling

In[2]: fromsklearn.datasetsimportmake_classificationfromsklearn.decompositionimportPCAimportmatplotlib.pyplotaspltfromsklearn.linear_modelimportLogisticRegressionfromsklearn.metricsimportclassification_reportfromsklearn.grid_searchimportGridSearchCVfromsklearn.cross_validationimportKFold,train_test_splitimportnumpyasnpfromcollectionsimportCounterfromimblearn.under_samplingimportRandomUnderSampler,NearMiss,CondensedNearestNeighbourfromimblearn.under_samplingimportEditedNearestNeighbours,RepeatedEditedNearestNeighbours,TomekLinksfromimblearn.over_samplingimportRandomOverSampler,SMOTEfromimblearn.combineimportSMOTEENN,SMOTETomekfromimblearn.ensembleimportBalanceCascade,EasyEnsemblefromsklearn.ensembleimportAdaBoostClassifierimportwarningsfromitableimportPrettyTable,TableStyle,CellStyleimportpandasaspdwarnings.filterwarnings('ignore')%pylabinlinepylab.rcParams['figure.figsize']=(12,6)plt.style.use('fivethirtyeight')

Populatingtheinteractivenamespacefromnumpyandmatplotlib

Page 7: Python resampling

In[4]: #GeneratedatawithtwoclassesX,y=make_classification(class_sep=1.2,weights=[0.1,0.9],n_informative=3,n_redundant=1,n_features=5,n_clusters_per_class=1,n_samples=10000,flip_y=0,random_state=10)pca=PCA(n_components=2)X=pca.fit_transform(X)

y=y.astype('str')y[y=='1']='L'y[y=='0']='S'

X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7,random_state=0)

X_1,X_2=X_train[y_train=='S'],X_train[y_train=='L']

Page 8: Python resampling

In[5]: #Scatterplotofthedataplt.scatter(zip(*X_1)[0],zip(*X_1)[1],color='#1abc9c')plt.scatter(zip(*X_2)[0],zip(*X_2)[1],color='#e67e22')

x_coords=zip(*X_1)[0]+zip(*X_2)[0]y_coords=zip(*X_1)[1]+zip(*X_2)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("OriginalDataset")plt.show()

Page 9: Python resampling

In[6]: #FitaLogisticRegressionmodelclf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train,y_train)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

Page 10: Python resampling

In[7]: plt.scatter(zip(*X_1)[0],zip(*X_1)[1],color='#1abc9c')plt.scatter(zip(*X_2)[0],zip(*X_2)[1],color='#e67e22')

x_coords=zip(*X_1)[0]+zip(*X_2)[0]y_coords=zip(*X_1)[1]+zip(*X_2)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("OriginalDataset-FittedLogisticRegression")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 11: Python resampling

In[8]: printclassification_report(y_test,clf.predict(X_test))

precisionrecallf1-scoresupport

L0.900.980.942680S0.390.120.19320

avg/total0.850.890.863000

Page 12: Python resampling
Page 13: Python resampling

In[10]: #LogisticRegressionwithbalancedclassweightsclf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2'],'class_weight':['balanced']}

cv=KFold(X_train.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train,y_train)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

Page 14: Python resampling

In[11]: x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

plt.scatter(zip(*X_1)[0],zip(*X_1)[1],color='#1abc9c')plt.scatter(zip(*X_2)[0],zip(*X_2)[1],color='#e67e22')

x_coords=zip(*X_1)[0]+zip(*X_2)[0]y_coords=zip(*X_1)[1]+zip(*X_2)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("OriginalDataset-FittedLogisticRegression")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 15: Python resampling

In[13]:

In[14]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.980.790.882680S0.340.890.49320

avg/total0.920.800.843000

Out[14]:

Page 16: Python resampling
Page 17: Python resampling

In[15]: #Randomundersamplingofmajorityclassus=RandomUnderSampler(ratio=0.5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})

Page 18: Python resampling

In[16]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("RandomUndersamplingofmajorityclass")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 19: Python resampling

In[18]:

In[19]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.970.830.902680S0.360.820.50320

avg/total0.910.830.853000

Out[19]:

Page 20: Python resampling
Page 21: Python resampling

In[20]: #NearMiss1us=NearMiss(ratio=0.5,size_ngh=3,version=1,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})

Page 22: Python resampling

In[21]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("NearMiss1")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 23: Python resampling

In[23]:

In[24]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.920.950.942680S0.440.320.37320

avg/total0.870.880.883000

Out[24]:

Page 24: Python resampling
Page 25: Python resampling

In[25]: #NearMiss2us=NearMiss(ratio=0.5,size_ngh=3,version=2,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':1360,'S':680})

Page 26: Python resampling

In[26]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("NearMiss2")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 27: Python resampling

In[28]:

In[29]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.950.890.922680S0.390.600.47320

avg/total0.890.860.873000

Out[29]:

Page 28: Python resampling
Page 29: Python resampling

In[30]: #NearMiss3us=NearMiss(ratio=0.5,size_ngh=3,ver3_samp_ngh=3,version=3,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':964,'S':680})

Page 30: Python resampling

In[31]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("NearMiss3")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 31: Python resampling

In[33]:

In[34]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.910.970.942680S0.430.200.28320

avg/total0.860.890.873000

Out[34]:

Page 32: Python resampling
Page 33: Python resampling

In[35]: #CondensedNearestNeighborus=CondensedNearestNeighbour(random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':882,'S':680})

Page 34: Python resampling

In[36]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("CondensedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 35: Python resampling

In[38]:

In[39]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.930.940.932680S0.440.390.42320

avg/total0.880.880.883000

Out[39]:

Page 36: Python resampling
Page 37: Python resampling

In[40]: #EditedNearestNeighbor(ENN)us=EditedNearestNeighbours(size_ngh=5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':5120,'S':680})

Page 38: Python resampling

In[41]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("EditedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 39: Python resampling

In[43]:

In[44]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.950.900.922680S0.420.590.49320

avg/total0.890.870.883000

Out[44]:

Page 40: Python resampling

In[45]: #RepeatedEditedNearestNeighborus=RepeatedEditedNearestNeighbours(size_ngh=5,random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':4796,'S':680})

Page 41: Python resampling

In[46]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("RepeatedEditedNearestNeighbor")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 42: Python resampling

In[48]:

In[49]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.970.840.902680S0.380.800.51320

avg/total0.910.840.863000

Out[49]:

Page 43: Python resampling
Page 44: Python resampling

In[50]: #Tomeklinkremovalus=TomekLinks(random_state=1)X_train_res,y_train_res=us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6051,'S':680})

Page 45: Python resampling

In[51]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("TomekLinkRemoval")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 46: Python resampling

In[53]:

In[54]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.910.970.942680S0.440.210.29320

avg/total0.860.890.873000

Out[54]:

Page 47: Python resampling
Page 48: Python resampling

In[55]: #Randomoversamplingofminorityclassos=RandomOverSampler(ratio=0.5,random_state=1)X_train_res,y_train_res=os.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6320,'S':3160})

Page 49: Python resampling

In[56]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("RandomOversamplingofminorityclass")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 50: Python resampling

In[58]:

In[59]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.970.850.912680S0.380.760.51320

avg/total0.910.840.863000

Out[59]:

Page 51: Python resampling
Page 52: Python resampling

In[60]: #SyntheticMinorityOversamplingTechnique(SMOTE)os=SMOTE(ratio=0.5,k=5,random_state=1)X_train_res,y_train_res=os.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6320,'S':3160})

Page 53: Python resampling

In[61]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("SMOTE")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 54: Python resampling

In[63]:

In[64]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.970.850.912680S0.380.770.51320

avg/total0.910.840.863000

Out[64]:

Page 55: Python resampling
Page 56: Python resampling

In[65]: #SMOTE+Tomeklinkremovalos_us=SMOTETomek(ratio=0.5,k=5,random_state=1)X_train_res,y_train_res=os_us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':6050,'S':3160})

Page 57: Python resampling

In[66]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("SMOTE+TomekLinkRemoval")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 58: Python resampling

In[68]:

In[69]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.970.840.902680S0.380.800.51320

avg/total0.910.840.863000

Out[69]:

Page 59: Python resampling

In[70]: #SMOTE+ENNos_us=SMOTEENN(ratio=0.5,k=5,size_ngh=5,random_state=1)X_train_res,y_train_res=os_us.fit_sample(X_train,y_train)

print"Distributionofclasslabelsbeforeresampling{}".format(Counter(y_train))print"Distributionofclasslabelsafterresampling{}".format(Counter(y_train_res))

clf_base=LogisticRegression()grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')

clf.fit(X_train_res,y_train_res)

coef=clf.best_estimator_.coef_intercept=clf.best_estimator_.intercept_

x1=np.linspace(-8,10,100)x2=-(coef[0][0]*x1+intercept[0])/coef[0][1]

X_1_res=X_train_res[y_train_res=='S']X_2_res=X_train_res[y_train_res=='L']

DistributionofclasslabelsbeforeresamplingCounter({'L':6320,'S':680})DistributionofclasslabelsafterresamplingCounter({'L':4894,'S':3160})

Page 60: Python resampling

In[71]: plt.scatter(zip(*X_1_res)[0],zip(*X_1_res)[1],color='#1abc9c')plt.scatter(zip(*X_2_res)[0],zip(*X_2_res)[1],color='#e67e22')

x_coords=zip(*X_1_res)[0]+zip(*X_2_res)[0]y_coords=zip(*X_1_res)[1]+zip(*X_2_res)[1]plt.axis([min(x_coords),max(x_coords),min(y_coords),max(y_coords)])

plt.title("SMOTE+ENN")plt.plot(x1,x2,color='#414e8a',linewidth=2)plt.show()

Page 61: Python resampling

In[73]:

In[74]:

printclassification_report(y_test,clf.predict(X_test))

precision_recall_comparison

precisionrecallf1-scoresupport

L0.990.780.872680S0.330.920.49320

avg/total0.920.790.833000

Out[74]:

Page 62: Python resampling
Page 63: Python resampling

In[75]: #EasyEnsembleens=EasyEnsemble()X_train_res,y_train_res=ens.fit_sample(X_train,y_train)

y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):clf_base=AdaBoostClassifier()grid={'n_estimators':[10,50,100]}

cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'

Page 64: Python resampling

In[77]:

In[78]:

printclassification_report(y_test,y_pred)

precision_recall_comparison

precisionrecallf1-scoresupport

L0.980.800.882680S0.350.900.50320

avg/total0.920.810.843000

Out[78]:

Page 65: Python resampling
Page 66: Python resampling

In[79]: #BalanceCascade

#NotethisisnotatrueimplementationofBalancaCacadesincethelibrary#doesnotreturnthefittedclassifiersusedingeneratingthesubsetsens=BalanceCascade(classifier='adaboost',random_state=1)X_train_res,y_train_res=ens.fit_sample(X_train,y_train)

y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):#imblearnusesAdaBoostClassiiferwithdefaulthyperparamstoselectsubsets#avariantimplementationwithgridsearchforeachsubsampleisshowninthenextcellclf=AdaBoostClassifier(random_state=1)clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'

Page 67: Python resampling

In[81]:

In[82]:

printclassification_report(y_test,y_pred)

precision_recall_comparison

precisionrecallf1-scoresupport

L0.990.810.892680S0.360.910.51320

avg/total0.920.820.853000

Out[82]:

Page 68: Python resampling

In[83]: #NotethisisnotatrueimplementationofBalancaCacadesincethelibrary#doesnotreturnthefittedclassifiersusedingeneratingthesubsetsens=BalanceCascade(classifier='adaboost',random_state=1)X_train_res,y_train_res=ens.fit_sample(X_train,y_train)

y_pred_proba=np.zeros(len(y_test))foridxinrange(len(y_train_res)):clf_base=AdaBoostClassifier(random_state=1)grid={'n_estimators':[10,50,100]}cv=KFold(X_train_res.shape[0],n_folds=5,shuffle=True,random_state=0)clf=GridSearchCV(clf_base,grid,cv=cv,n_jobs=8,scoring='f1_macro')clf.fit(X_train_res[idx],y_train_res[idx])y_pred_proba+=zip(*clf.predict_proba(X_test))[0]y_pred_proba=y_pred_proba/len(y_train_res)y_pred=(y_pred_proba>0.5).astype(int)y_pred=y_pred.astype('str')y_pred[y_pred=='1']='L'y_pred[y_pred=='0']='S'

Page 69: Python resampling

In[85]:

In[86]:

printclassification_report(y_test,y_pred)

precision_recall_comparison

precisionrecallf1-scoresupport

L0.990.800.882680S0.350.900.50320

avg/total0.920.810.843000

Out[86]:

Page 70: Python resampling
Page 71: Python resampling