Thursday, July 12, 2018

Random Forest Classification on the Fertility Dataset (Treating the Class-Imbalance Problem with Upsampling)

In [216]:
import pandas as pd
full=pd.read_csv("C:/Users/E002891/Desktop/DayWiseTracker/Programming Concepts/Data Science/DataSets/fertility.csv",sep=";")
full.head(5)
Out[216]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 0.69 0 1 1 0 0.8 0 0.88 N
1 -0.33 0.94 1 0 1 0 0.8 1 0.31 O
2 -0.33 0.50 1 0 0 0 1.0 -1 0.50 N
3 -0.33 0.75 0 1 1 0 1.0 -1 0.38 N
4 -0.33 0.67 1 1 0 0 0.8 -1 0.50 O
In [148]:
full.describe()
Out[148]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting
count 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000
mean -0.078900 0.669000 0.870000 0.440000 0.510000 0.190000 0.832000 -0.350000 0.406800
std 0.796725 0.121319 0.337998 0.498888 0.502418 0.580752 0.167501 0.808728 0.186395
min -1.000000 0.500000 0.000000 0.000000 0.000000 -1.000000 0.200000 -1.000000 0.060000
25% -1.000000 0.560000 1.000000 0.000000 0.000000 0.000000 0.800000 -1.000000 0.250000
50% -0.330000 0.670000 1.000000 0.000000 1.000000 0.000000 0.800000 -1.000000 0.380000
75% 1.000000 0.750000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.500000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [168]:
# No column has missing values. (This cell's output reflects a re-run after the
# dummy encoding below, which is why the dummy column names already appear.)
full.isna().any()
Out[168]:
Output                                  False
Season_-0.33                            False
Season_0.33                             False
Season_1.0                              False
Childish diseases_1                     False
Accident or serious trauma_1            False
Surgical intervention_1                 False
High fevers_0                           False
High fevers_1                           False
Frequency of alcohol_0.4                False
Frequency of alcohol_0.6                False
Frequency of alcohol_0.8                False
Frequency of alcohol_1.0                False
Smoking habit_0                         False
Smoking habit_1                         False
Number of hours spent sitting_(3, 6]    False
Number of hours spent sitting_(6, 9]    False
Age_(70, 80]                            False
Age_(40, 60]                            False
Age_(80,100]                            False
dtype: bool

Binning Age

In [217]:
# 10-year age bins; ages are stored as fractions, so scale by 100 first.
mybins=range((full.Age*100).astype("int").min()-10,(full.Age*100).astype("int").max()+10,10)
full.Age=pd.cut((full.Age*100).astype("int"),bins=mybins)
full.head(3)
Out[217]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 (60, 70] 0 1 1 0 0.8 0 0.88 N
1 -0.33 (90, 100] 1 0 1 0 0.8 1 0.31 O
2 -0.33 (40, 50] 1 0 0 0 1.0 -1 0.50 N
In [218]:
bins1=range((full["Number of hours spent sitting"]*10).astype("int").min(),(full["Number of hours spent sitting"]*10).astype("int").max(),3)
full["Number of hours spent sitting"]=pd.cut((full["Number of hours spent sitting"]*10).astype("int"),bins1)
# full["Number of hours spent sitting"].value_counts()
In [219]:
full.Output=full.Output.factorize()[0]
full.head(3)
Out[219]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 (60, 70] 0 1 1 0 0.8 0 (6, 9] 0
1 -0.33 (90, 100] 1 0 1 0 0.8 1 (0, 3] 1
2 -0.33 (40, 50] 1 0 0 0 1.0 -1 (3, 6] 0
In [220]:
full.Output.value_counts()
Out[220]:
0    88
1    12
Name: Output, dtype: int64
Only 12 of the 100 samples are positive, i.e. a strong class imbalance.

Fitting models without treating categorical variables

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
log=LogisticRegression(penalty="l2",C=1)
log.fit(full.iloc[:,0:9],full["Output"])
classification_report(full["Output"],log.predict(full.iloc[:,0:9]))
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf=RandomForestClassifier()
rf.fit(full.iloc[:,0:9],full["Output"])
classification_report(full["Output"],rf.predict(full.iloc[:,0:9]))
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
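Both fits fail because scikit-learn needs a purely numeric feature matrix, and the binned columns now hold pandas Interval objects. A quick way to spot the offending columns (hypothetical cell, not part of the original run):
In [ ]:
# Interval/category dtypes cannot be coerced to float by scikit-learn;
# they have to be encoded first, which pd.get_dummies does below.
print(full.dtypes)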
In [94]:
full1=full  # NOTE: this creates an alias of the same DataFrame, not a copy; use full.copy() for a true backup

Binning and Converting to dummies

In [221]:
# Merge the (80, 90] and (90, 100] age bins into a single (80,100] bin.
for i in range(len(full)):
    if((str(full.loc[i,"Age"])=='(80, 90]') | (str(full.loc[i,"Age"])=='(90, 100]')):
        full.loc[i,"NewAge"]='(80,100]'
    else:
        full.loc[i,"NewAge"]=full.loc[i,"Age"]
            
In [222]:
# Merge the (40, 50] and (50, 60] bins into a single (40, 60] bin.
for i in range(len(full)):
    if((str(full.loc[i,"NewAge"])=='(40, 50]') | (str(full.loc[i,"NewAge"])=='(50, 60]')):
        full.loc[i,"NewAge1"]='(40, 60]'
    else:
        full.loc[i,"NewAge1"]=full.loc[i,"NewAge"]
In [ ]:
# Copy the merged labels back into Age (NewAge1 now holds the final labels).
for i in range(len(full)):
    full.loc[i,'Age']=full.loc[i,'NewAge1']
In [225]:
full.drop(columns=["Age","NewAge"],inplace=True)
In [227]:
full["Age"]=full["NewAge1"]
In [229]:
full.drop(columns=["NewAge1"],inplace=True)
In [230]:
full.head(3)
Out[230]:
Season Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output Age
0 -0.33 0 1 1 0 0.8 0 (6, 9] 0 (60, 70]
1 -0.33 1 0 1 0 0.8 1 (0, 3] 1 (80,100]
2 -0.33 1 0 0 0 1.0 -1 (3, 6] 0 (40, 60]
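For reference, the same bin merging can be done without row-by-row loops. A minimal vectorized sketch using a replacement map over the string labels (my alternative, assuming the bin labels shown above):
In [ ]:
# Map the fine-grained age bins onto the coarser labels in one vectorized step.
age_map={'(40, 50]':'(40, 60]','(50, 60]':'(40, 60]',
         '(80, 90]':'(80,100]','(90, 100]':'(80,100]'}
full["Age"]=full["Age"].astype(str).replace(age_map)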
In [134]:
full=full1  # intended to restore the original frame, but full1 is an alias of full, so this is a no-op
In [231]:
full=pd.get_dummies(data=full,columns=["Season","Childish diseases","Accident or serious trauma","Surgical intervention","High fevers","Frequency of alcohol","Smoking habit","Number of hours spent sitting","Age"],drop_first=True)
In [232]:
full.head(3)
Out[232]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
1 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
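drop_first=True drops one dummy per categorical column so the remaining indicators are not perfectly collinear. A toy illustration (not from the original run):
In [ ]:
# 'a' becomes the implicit baseline level; only 'b' and 'c' get columns.
print(pd.get_dummies(pd.Series(['a','b','c']),drop_first=True))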
Merging rare 'Frequency of alcohol' levels
In [145]:
import numpy as np
full2=full1  # again an alias of the same DataFrame, not a copy
# Collapse the rare 0.2 and 0.4 levels into 1.0 so no level is nearly empty.
full2["Frequency of alcohol"]=np.where(((full2["Frequency of alcohol"]==0.2) | (full2["Frequency of alcohol"]==0.4)),1.0,full2["Frequency of alcohol"])
In [146]:
full2["Frequency of alcohol"].value_counts()
Out[146]:
1.0    42
0.8    39
0.6    19
Name: Frequency of alcohol, dtype: int64
In [147]:
full.head(3)
Out[147]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
1 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
PCA
In [233]:
from sklearn.decomposition import PCA
X=full.iloc[:,1:len(full.columns)]
y=full["Output"]
pca=PCA(n_components=2)
X_new=pca.fit_transform(X)
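Before relying on a 2-component projection of 19 dummy columns, it is worth checking how much variance those two components actually retain. A quick check (hypothetical cell, not part of the original run):
In [ ]:
# Fraction of total variance captured by each retained component.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())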
In [234]:
y.value_counts()
Out[234]:
0    88
1    12
Name: Output, dtype: int64
Cross Validation and model building
In [235]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
kfold=RepeatedKFold(n_splits=20,n_repeats=5)
recall_scores=cross_val_score(estimator=log,X=X,y=y,n_jobs=-1,cv=kfold,verbose=True,scoring="recall")
recall_scores
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.1s finished
Out[235]:
array([0. , 0. , 0.5, 0. , 1. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0.5, 0. ,
       0.5, 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 1. , 0. , 0. , 0. , 0. , 0. , 1. , 0.5, 0. , 0. , 0. , 0. ,
       0.5, 0. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5,
       0. , 0.5, 0. , 0. , 0. , 0.5, 0. , 1. , 0. ])
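Most of these recall values are 0 because with n_splits=20 each test fold holds only 5 rows, and with just 12 positives among 100 rows many folds contain no positive case at all. Stratified folds preserve the class ratio in every split; a sketch of the same evaluation with RepeatedStratifiedKFold (my substitution, not part of the original run):
In [ ]:
from sklearn.model_selection import RepeatedStratifiedKFold
skfold=RepeatedStratifiedKFold(n_splits=5,n_repeats=5,random_state=123)
recall_scores=cross_val_score(estimator=log,X=X,y=y,cv=skfold,scoring="recall",n_jobs=-1)
print(recall_scores.mean())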
In [207]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
kfold=KFold(n_splits=3,random_state=123)  # note: random_state has no effect here because shuffle defaults to False
# f1Scores=cross_val_score(X=X,y=y,estimator=log,cv=kfold,scoring="f1",n_jobs=-1)
preds=cross_val_predict(X=X,y=y,estimator=log,cv=kfold,n_jobs=-1,method="predict_proba")
In [208]:
pos_preds=preds[:,1]  # predicted probability of the positive class
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y,pos_preds))
0.43276515151515155
An AUC below 0.5 means the model ranks cases worse than chance, so this logistic model finds no usable signal here.
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
rf=RandomForestClassifier(n_estimators=1000)
kfold=KFold(n_splits=3,random_state=123)  # again, random_state only matters when shuffle=True
preds=cross_val_predict(X=X,y=y,estimator=rf,cv=kfold,n_jobs=-1,method="predict_proba")
In [215]:
f1Scores=cross_val_score(X=X,y=y,estimator=rf,cv=kfold,scoring="f1",n_jobs=-1)
f1Scores
Out[215]:
array([0.22222222, 0.        , 0.        ])
Even the Random Forest barely recovers the minority class; with only 12 positives split across unshuffled folds, f1 collapses, which motivates treating the imbalance directly.
Treating the class-imbalance problem: upsampling
In [240]:
from sklearn.utils import resample
full_majority=full.loc[full.Output==0,:]
full_minority=full.loc[full.Output==1,:]
full_minority_new = resample(full_minority,n_samples=88,replace=True,random_state=123)
full_minority_new.Output.value_counts()
Out[240]:
1    88
Name: Output, dtype: int64
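The same upsampling can also be done with the imbalanced-learn package, if it is installed; a sketch using its current API (my alternative, not part of the original run):
In [ ]:
from imblearn.over_sampling import RandomOverSampler
ros=RandomOverSampler(random_state=123)
X_res,y_res=ros.fit_resample(full.iloc[:,1:],full["Output"])
# X_res/y_res now hold a 50:50 class mix.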
In [243]:
full_new=pd.concat([full_majority,full_minority_new])
len(full_new)
Out[243]:
176
In [245]:
full_new.head(3)
Out[245]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
3 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0
In [250]:
X=full_new.iloc[:,1:]   # every column except Output
y=full_new.iloc[:,0]    # Output
In [262]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_new=pca.fit_transform(X)
print(X_new[1:5])
[[ 0.23542653  0.02168116]
 [ 0.08682597  0.06155696]
 [ 0.11590495 -1.05857954]
 [ 0.50206143 -0.33793794]]
In [253]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

clf=RandomForestClassifier()
kfold = KFold(n_splits=3)
preds = cross_val_predict(X=X_new,y=y,cv=kfold,n_jobs=-1,estimator=clf)
preds
Out[253]:
array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
      dtype=int64)
In [258]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print(confusion_matrix(y,preds))
print(classification_report(y,preds))
[[68 20]
 [ 4 84]]
             precision    recall  f1-score   support

          0       0.94      0.77      0.85        88
          1       0.81      0.95      0.88        88

avg / total       0.88      0.86      0.86       176

In [261]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
rclf=RandomForestClassifier(n_estimators=1000)
rKfold=RepeatedKFold(n_splits=5,n_repeats=5)
f1Scores = cross_val_score(X=X_new,y=y,cv=rKfold,n_jobs=-1,estimator=rclf,scoring="f1")
print(f1Scores)
print("avarage f1:",f1Scores.mean())
[0.94444444 0.90909091 0.97560976 0.92682927 0.97142857 0.96774194
 0.97297297 0.97297297 0.93023256 0.97142857 0.88888889 0.91666667
 0.94736842 0.97297297 1.         0.875      0.97435897 0.88888889
 0.97297297 0.97674419 0.91891892 0.92307692 1.         0.97142857
 0.96      ]
average f1: 0.9492015338250006
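One caveat on these numbers: the minority class was upsampled before the folds were split, so exact duplicates of minority rows can sit in both the train and the test side of a fold, which inflates the cross-validated f1. Resampling only inside each training fold avoids the leak. A minimal sketch, assuming X_orig and y_orig hold the one-hot features and labels from before the resample step (both names are mine, not from the original notebook):
In [ ]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=123)
fold_f1=[]
for train_idx,test_idx in skf.split(X_orig,y_orig):
    X_tr,y_tr=X_orig.iloc[train_idx],y_orig.iloc[train_idx]
    # Upsample the minority class inside the training fold only.
    minority_up=resample(X_tr[y_tr==1],replace=True,
                         n_samples=int((y_tr==0).sum()),random_state=123)
    X_bal=pd.concat([X_tr[y_tr==0],minority_up])
    y_bal=pd.Series([0]*int((y_tr==0).sum())+[1]*len(minority_up),index=X_bal.index)
    clf=RandomForestClassifier(n_estimators=1000,random_state=123)
    clf.fit(X_bal,y_bal)
    fold_f1.append(f1_score(y_orig.iloc[test_idx],clf.predict(X_orig.iloc[test_idx])))
print("leak-free average f1:",np.mean(fold_f1))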
