Thursday, July 12, 2018

Random Forest Classification on the Fertility Dataset (Treating the Class-Imbalance Problem with Upsampling)

In [216]:
import pandas as pd
full=pd.read_csv("C:/Users/E002891/Desktop/DayWiseTracker/Programming Concepts/Data Science/DataSets/fertility.csv",sep=";")
full.head(5)
Out[216]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 0.69 0 1 1 0 0.8 0 0.88 N
1 -0.33 0.94 1 0 1 0 0.8 1 0.31 O
2 -0.33 0.50 1 0 0 0 1.0 -1 0.50 N
3 -0.33 0.75 0 1 1 0 1.0 -1 0.38 N
4 -0.33 0.67 1 1 0 0 0.8 -1 0.50 O
In [148]:
full.describe()
Out[148]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting
count 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000 100.000000
mean -0.078900 0.669000 0.870000 0.440000 0.510000 0.190000 0.832000 -0.350000 0.406800
std 0.796725 0.121319 0.337998 0.498888 0.502418 0.580752 0.167501 0.808728 0.186395
min -1.000000 0.500000 0.000000 0.000000 0.000000 -1.000000 0.200000 -1.000000 0.060000
25% -1.000000 0.560000 1.000000 0.000000 0.000000 0.000000 0.800000 -1.000000 0.250000
50% -0.330000 0.670000 1.000000 0.000000 1.000000 0.000000 0.800000 -1.000000 0.380000
75% 1.000000 0.750000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.500000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [168]:
# No column has missing values. (This cell's output reflects a re-run after the
# dummy encoding below, which is why the dummy column names already appear.)
full.isna().any()
Out[168]:
Output                                  False
Season_-0.33                            False
Season_0.33                             False
Season_1.0                              False
Childish diseases_1                     False
Accident or serious trauma_1            False
Surgical intervention_1                 False
High fevers_0                           False
High fevers_1                           False
Frequency of alcohol_0.4                False
Frequency of alcohol_0.6                False
Frequency of alcohol_0.8                False
Frequency of alcohol_1.0                False
Smoking habit_0                         False
Smoking habit_1                         False
Number of hours spent sitting_(3, 6]    False
Number of hours spent sitting_(6, 9]    False
Age_(70, 80]                            False
Age_(40, 60]                            False
Age_(80,100]                            False
dtype: bool

Binning Age

In [217]:
# 10-year age bins; ages are stored as fractions, so scale by 100 first.
mybins=range((full.Age*100).astype("int").min()-10,(full.Age*100).astype("int").max()+10,10)
full.Age=pd.cut((full.Age*100).astype("int"),bins=mybins)
full.head(3)
Out[217]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 (60, 70] 0 1 1 0 0.8 0 0.88 N
1 -0.33 (90, 100] 1 0 1 0 0.8 1 0.31 O
2 -0.33 (40, 50] 1 0 0 0 1.0 -1 0.50 N
In [218]:
bins1=range((full["Number of hours spent sitting"]*10).astype("int").min(),(full["Number of hours spent sitting"]*10).astype("int").max(),3)
full["Number of hours spent sitting"]=pd.cut((full["Number of hours spent sitting"]*10).astype("int"),bins1)
# full["Number of hours spent sitting"].value_counts()
In [219]:
full.Output=full.Output.factorize()[0]
full.head(3)
Out[219]:
Season Age Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output
0 -0.33 (60, 70] 0 1 1 0 0.8 0 (6, 9] 0
1 -0.33 (90, 100] 1 0 1 0 0.8 1 (0, 3] 1
2 -0.33 (40, 50] 1 0 0 0 1.0 -1 (3, 6] 0
In [220]:
full.Output.value_counts()
Out[220]:
0    88
1    12
Name: Output, dtype: int64
Only 12 of the 100 samples are positive, i.e. a strong class imbalance.

Fitting models without treating categorical variables

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
log=LogisticRegression(penalty="l2",C=1)
log.fit(full.iloc[:,0:9],full["Output"])
classification_report(full["Output"],log.predict(full.iloc[:,0:9]))
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf=RandomForestClassifier()
rf.fit(full.iloc[:,0:9],full["Output"])
classification_report(full["Output"],rf.predict(full.iloc[:,0:9]))
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
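Both fits fail because scikit-learn needs a purely numeric feature matrix, and the binned columns now hold pandas Interval objects. A quick way to spot the offending columns (hypothetical cell, not part of the original run):
In [ ]:
# Interval/category dtypes cannot be coerced to float by scikit-learn;
# they have to be encoded first, which pd.get_dummies does below.
print(full.dtypes)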
In [94]:
full1=full  # NOTE: this creates an alias of the same DataFrame, not a copy; use full.copy() for a true backup

Binning and Converting to dummies

In [221]:
# Merge the (80, 90] and (90, 100] age bins into a single (80,100] bin.
for i in range(len(full)):
    if((str(full.loc[i,"Age"])=='(80, 90]') | (str(full.loc[i,"Age"])=='(90, 100]')):
        full.loc[i,"NewAge"]='(80,100]'
    else:
        full.loc[i,"NewAge"]=full.loc[i,"Age"]
            
In [222]:
# Merge the (40, 50] and (50, 60] bins into a single (40, 60] bin.
for i in range(len(full)):
    if((str(full.loc[i,"NewAge"])=='(40, 50]') | (str(full.loc[i,"NewAge"])=='(50, 60]')):
        full.loc[i,"NewAge1"]='(40, 60]'
    else:
        full.loc[i,"NewAge1"]=full.loc[i,"NewAge"]
In [ ]:
# Copy the merged labels back into Age (NewAge1 now holds the final labels).
for i in range(len(full)):
    full.loc[i,'Age']=full.loc[i,'NewAge1']
In [225]:
full.drop(columns=["Age","NewAge"],inplace=True)
In [227]:
full["Age"]=full["NewAge1"]
In [229]:
full.drop(columns=["NewAge1"],inplace=True)
In [230]:
full.head(3)
Out[230]:
Season Childish diseases Accident or serious trauma Surgical intervention High fevers Frequency of alcohol Smoking habit Number of hours spent sitting Output Age
0 -0.33 0 1 1 0 0.8 0 (6, 9] 0 (60, 70]
1 -0.33 1 0 1 0 0.8 1 (0, 3] 1 (80,100]
2 -0.33 1 0 0 0 1.0 -1 (3, 6] 0 (40, 60]
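For reference, the same bin merging can be done without row-by-row loops. A minimal vectorized sketch using a replacement map over the string labels (my alternative, assuming the bin labels shown above):
In [ ]:
# Map the fine-grained age bins onto the coarser labels in one vectorized step.
age_map={'(40, 50]':'(40, 60]','(50, 60]':'(40, 60]',
         '(80, 90]':'(80,100]','(90, 100]':'(80,100]'}
full["Age"]=full["Age"].astype(str).replace(age_map)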
In [134]:
full=full1  # intended to restore the original frame, but full1 is an alias of full, so this is a no-op
In [231]:
full=pd.get_dummies(data=full,columns=["Season","Childish diseases","Accident or serious trauma","Surgical intervention","High fevers","Frequency of alcohol","Smoking habit","Number of hours spent sitting","Age"],drop_first=True)
In [232]:
full.head(3)
Out[232]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
1 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
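drop_first=True drops one dummy per categorical column so the remaining indicators are not perfectly collinear. A toy illustration (not from the original run):
In [ ]:
# 'a' becomes the implicit baseline level; only 'b' and 'c' get columns.
print(pd.get_dummies(pd.Series(['a','b','c']),drop_first=True))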
Merging rare 'Frequency of alcohol' levels
In [145]:
import numpy as np
full2=full1  # again an alias of the same DataFrame, not a copy
# Collapse the rare 0.2 and 0.4 levels into 1.0 so no level is nearly empty.
full2["Frequency of alcohol"]=np.where(((full2["Frequency of alcohol"]==0.2) | (full2["Frequency of alcohol"]==0.4)),1.0,full2["Frequency of alcohol"])
In [146]:
full2["Frequency of alcohol"].value_counts()
Out[146]:
1.0    42
0.8    39
0.6    19
Name: Frequency of alcohol, dtype: int64
In [147]:
full.head(3)
Out[147]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
1 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
PCA
In [233]:
from sklearn.decomposition import PCA
X=full.iloc[:,1:len(full.columns)]
y=full["Output"]
pca=PCA(n_components=2)
X_new=pca.fit_transform(X)
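Before relying on a 2-component projection of 19 dummy columns, it is worth checking how much variance those two components actually retain. A quick check (hypothetical cell, not part of the original run):
In [ ]:
# Fraction of total variance captured by each retained component.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())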
In [234]:
y.value_counts()
Out[234]:
0    88
1    12
Name: Output, dtype: int64
Cross Validation and model building
In [235]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
kfold=RepeatedKFold(n_splits=20,n_repeats=5)
recall_scores=cross_val_score(estimator=log,X=X,y=y,n_jobs=-1,cv=kfold,verbose=True,scoring="recall")
recall_scores
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.1s finished
Out[235]:
array([0. , 0. , 0.5, 0. , 1. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0.5, 0. ,
       0.5, 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 1. , 0. , 0. , 0. , 0. , 0. , 1. , 0.5, 0. , 0. , 0. , 0. ,
       0.5, 0. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5,
       0. , 0.5, 0. , 0. , 0. , 0.5, 0. , 1. , 0. ])
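Most of these recall values are 0 because with n_splits=20 each test fold holds only 5 rows, and with just 12 positives among 100 rows many folds contain no positive case at all. Stratified folds preserve the class ratio in every split; a sketch of the same evaluation with RepeatedStratifiedKFold (my substitution, not part of the original run):
In [ ]:
from sklearn.model_selection import RepeatedStratifiedKFold
skfold=RepeatedStratifiedKFold(n_splits=5,n_repeats=5,random_state=123)
recall_scores=cross_val_score(estimator=log,X=X,y=y,cv=skfold,scoring="recall",n_jobs=-1)
print(recall_scores.mean())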
In [207]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
kfold=KFold(n_splits=3,random_state=123)  # note: random_state has no effect here because shuffle defaults to False
# f1Scores=cross_val_score(X=X,y=y,estimator=log,cv=kfold,scoring="f1",n_jobs=-1)
preds=cross_val_predict(X=X,y=y,estimator=log,cv=kfold,n_jobs=-1,method="predict_proba")
In [208]:
pos_preds=preds[:,1]  # predicted probability of the positive class
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y,pos_preds))
0.43276515151515155
An AUC below 0.5 means the model ranks cases worse than chance, so this logistic model finds no usable signal here.
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
rf=RandomForestClassifier(n_estimators=1000)
kfold=KFold(n_splits=3,random_state=123)  # again, random_state only matters when shuffle=True
preds=cross_val_predict(X=X,y=y,estimator=rf,cv=kfold,n_jobs=-1,method="predict_proba")
In [215]:
f1Scores=cross_val_score(X=X,y=y,estimator=rf,cv=kfold,scoring="f1",n_jobs=-1)
f1Scores
Out[215]:
array([0.22222222, 0.        , 0.        ])
Even the Random Forest barely recovers the minority class; with only 12 positives split across unshuffled folds, f1 collapses, which motivates treating the imbalance directly.
Treating the class-imbalance problem: upsampling
In [240]:
from sklearn.utils import resample
full_majority=full.loc[full.Output==0,:]
full_minority=full.loc[full.Output==1,:]
full_minority_new = resample(full_minority,n_samples=88,replace=True,random_state=123)
full_minority_new.Output.value_counts()
Out[240]:
1    88
Name: Output, dtype: int64
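The same upsampling can also be done with the imbalanced-learn package, if it is installed; a sketch using its current API (my alternative, not part of the original run):
In [ ]:
from imblearn.over_sampling import RandomOverSampler
ros=RandomOverSampler(random_state=123)
X_res,y_res=ros.fit_resample(full.iloc[:,1:],full["Output"])
# X_res/y_res now hold a 50:50 class mix.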
In [243]:
full_new=pd.concat([full_majority,full_minority_new])
len(full_new)
Out[243]:
176
In [245]:
full_new.head(3)
Out[245]:
Output Season_-0.33 Season_0.33 Season_1.0 Childish diseases_1 Accident or serious trauma_1 Surgical intervention_1 High fevers_0 High fevers_1 Frequency of alcohol_0.4 Frequency of alcohol_0.6 Frequency of alcohol_0.8 Frequency of alcohol_1.0 Smoking habit_0 Smoking habit_1 Number of hours spent sitting_(3, 6] Number of hours spent sitting_(6, 9] Age_(70, 80] Age_(40, 60] Age_(80,100]
0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
2 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
3 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0
In [250]:
X=full_new.iloc[:,1:]   # every column except Output
y=full_new.iloc[:,0]    # Output
In [262]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_new=pca.fit_transform(X)
print(X_new[1:5])
[[ 0.23542653  0.02168116]
 [ 0.08682597  0.06155696]
 [ 0.11590495 -1.05857954]
 [ 0.50206143 -0.33793794]]
In [253]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

clf=RandomForestClassifier()
kfold = KFold(n_splits=3)
preds = cross_val_predict(X=X_new,y=y,cv=kfold,n_jobs=-1,estimator=clf)
preds
Out[253]:
array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
      dtype=int64)
In [258]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print(confusion_matrix(y,preds))
print(classification_report(y,preds))
[[68 20]
 [ 4 84]]
             precision    recall  f1-score   support

          0       0.94      0.77      0.85        88
          1       0.81      0.95      0.88        88

avg / total       0.88      0.86      0.86       176

In [261]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
rclf=RandomForestClassifier(n_estimators=1000)
rKfold=RepeatedKFold(n_splits=5,n_repeats=5)
f1Scores = cross_val_score(X=X_new,y=y,cv=rKfold,n_jobs=-1,estimator=rclf,scoring="f1")
print(f1Scores)
print("avarage f1:",f1Scores.mean())
[0.94444444 0.90909091 0.97560976 0.92682927 0.97142857 0.96774194
 0.97297297 0.97297297 0.93023256 0.97142857 0.88888889 0.91666667
 0.94736842 0.97297297 1.         0.875      0.97435897 0.88888889
 0.97297297 0.97674419 0.91891892 0.92307692 1.         0.97142857
 0.96      ]
average f1: 0.9492015338250006
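One caveat on these numbers: the minority class was upsampled before the folds were split, so exact duplicates of minority rows can sit in both the train and the test side of a fold, which inflates the cross-validated f1. Resampling only inside each training fold avoids the leak. A minimal sketch, assuming X_orig and y_orig hold the one-hot features and labels from before the resample step (both names are mine, not from the original notebook):
In [ ]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=123)
fold_f1=[]
for train_idx,test_idx in skf.split(X_orig,y_orig):
    X_tr,y_tr=X_orig.iloc[train_idx],y_orig.iloc[train_idx]
    # Upsample the minority class inside the training fold only.
    minority_up=resample(X_tr[y_tr==1],replace=True,
                         n_samples=int((y_tr==0).sum()),random_state=123)
    X_bal=pd.concat([X_tr[y_tr==0],minority_up])
    y_bal=pd.Series([0]*int((y_tr==0).sum())+[1]*len(minority_up),index=X_bal.index)
    clf=RandomForestClassifier(n_estimators=1000,random_state=123)
    clf.fit(X_bal,y_bal)
    fold_f1.append(f1_score(y_orig.iloc[test_idx],clf.predict(X_orig.iloc[test_idx])))
print("leak-free average f1:",np.mean(fold_f1))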
