In [216]:
import pandas as pd
# Load the semicolon-delimited fertility dataset and preview its first rows.
# NOTE(review): the location is an absolute, machine-specific path; the
# notebook only runs where this exact file exists.
csv_path="C:/Users/E002891/Desktop/DayWiseTracker/Programming Concepts/Data Science/DataSets/fertility.csv"
full=pd.read_csv(csv_path,sep=";")
full.head(n=5)
Out[216]:
In [148]:
# Summary statistics of the loaded frame (pandas describes numeric columns by default).
full.describe()
Out[148]:
In [168]:
# One boolean per column: True when that column contains at least one missing value.
full.isna().any()
Out[168]:
Binning Age
In [217]:
# Bin Age into 10-unit-wide intervals on a x100 integer scale.
# NOTE(review): astype("int") truncates the scaled value rather than rounding
# it — confirm that is intended for values sitting next to a bin edge.
age_scaled=(full.Age*100).astype("int")
# Pad the observed range by one bin width on each side; the upper padding is
# needed because range() excludes its stop value.
mybins=range(age_scaled.min()-10,age_scaled.max()+10,10)
# Overwrites Age with the interval each row falls into.
full["Age"]=pd.cut(age_scaled,bins=mybins)
full.head(3)
Out[217]:
In [218]:
# Bin "Number of hours spent sitting" into 3-unit-wide intervals on a x10
# integer scale, overwriting the column with the interval of each row.
sitting_col="Number of hours spent sitting"
sitting_scaled=(full[sitting_col]*10).astype("int")
# range() excludes its stop value, so the stop is padded by one bin width;
# otherwise the last edge can fall below the observed maximum and those rows
# would be binned as NaN.
bins1=range(sitting_scaled.min(),sitting_scaled.max()+3,3)
# pd.cut intervals are open on the left by default, which would also turn rows
# equal to the minimum into NaN; include_lowest=True keeps them in the first bin.
full[sitting_col]=pd.cut(sitting_scaled,bins1,include_lowest=True)
In [219]:
# Replace the Output labels with integer codes; factorize numbers the distinct
# labels 0, 1, ... in order of first appearance.
output_codes,_output_labels=pd.factorize(full["Output"])
full["Output"]=output_codes
full.head(3)
Out[219]:
In [220]:
# Row count per Output code, to see how balanced the two classes are.
full.Output.value_counts()
Out[220]:
Fitting models without treating categorical variables
In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Deliberate failure demo: fit directly on the first nine columns without
# encoding them. The binned columns hold pandas Interval objects, which the
# estimator cannot convert to floats.
log=LogisticRegression(penalty="l2",C=1)
try:
    log.fit(full.iloc[:,0:9],full["Output"])
    print(classification_report(full["Output"],log.predict(full.iloc[:,0:9])))
except (TypeError,ValueError) as err:
    # Show the expected error instead of aborting a Restart & Run All.
    print("Error:",err)
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Deliberate failure demo, this time with a random forest: the first nine
# columns are passed un-encoded, and the binned ones hold pandas Interval
# objects that cannot be converted to floats.
# NOTE(review): the estimator is a random forest although the name is `log`;
# the name is kept unchanged here.
log=RandomForestClassifier()
try:
    log.fit(full.iloc[:,0:9],full["Output"])
    print(classification_report(full["Output"],log.predict(full.iloc[:,0:9])))
except (TypeError,ValueError) as err:
    # Show the expected error instead of aborting a Restart & Run All.
    print("Error:",err)
#Error: float() argument must be a string or a number, not 'pandas._libs.interval.Interval'
In [94]:
# Keep a second handle on the frame under the name full1.
# NOTE(review): this is a plain name binding, not full.copy() — both names refer
# to the same DataFrame object, so in-place edits made through one are visible
# through the other.
full1=full
Binning and Converting to dummies
In [221]:
# Merge the two oldest age bins, (80, 90] and (90, 100], into a single label
# stored in a new "NewAge" column; every other row keeps its existing Age bin.
# Done column-wise instead of row by row: no per-row .loc writes, no assumption
# that the index is 0..n-1, and no reliance on the separate full1 handle.
# Age is read by name rather than by column position 1 (presumably the same
# column — TODO confirm).
age_bins=full["Age"].astype(object)
# Compare on the text form of each bin, e.g. '(80, 90]'.
in_oldest_bins=age_bins.map(str).isin(['(80, 90]','(90, 100]'])
full["NewAge"]=age_bins.where(~in_oldest_bins,'(80,100]')
In [222]:
# Merge the two youngest age bins, (40, 50] and (50, 60], into a single label
# stored in a new "NewAge1" column; every other row keeps its "NewAge" value.
# Done column-wise instead of row by row, so it does not assume that the index
# is 0..n-1 and avoids per-row .loc writes.
new_age=full["NewAge"].astype(object)
# Compare on the text form of each value, e.g. '(40, 50]'.
in_youngest_bins=new_age.map(str).isin(['(40, 50]','(50, 60]'])
full["NewAge1"]=new_age.where(~in_youngest_bins,'(40-60]')
In [ ]:
# Copy the merged labels from "NewAge1" into "Age", rewriting '(40-60]' as
# '(40, 60]' on the way.
# The column is replaced wholesale rather than written row by row: Age is
# produced by pd.cut earlier in the notebook, so it may be categorical here, and
# pandas rejects element-wise writes of labels that are not among a categorical
# column's categories.
merged_age=full["NewAge1"].astype(object)
full["Age"]=merged_age.where(merged_age.map(str)!='(40-60]','(40, 60]')
In [225]:
# Remove the binned Age column and the intermediate NewAge column in place;
# NewAge1 stays. drop() raises by default when a listed column is missing,
# so this cell cannot simply be run twice.
full.drop(columns=["Age","NewAge"],inplace=True)
In [227]:
# Re-create Age from the merged labels held in NewAge1.
full["Age"]=full["NewAge1"]
In [229]:
# NewAge1 has been copied into Age, so drop the helper column in place.
full.drop(columns=["NewAge1"],inplace=True)
In [230]:
# Preview the frame after the Age columns were rebuilt.
full.head(3)
Out[230]:
In [134]:
# Point `full` back at the object held in full1.
# NOTE(review): full1 was bound with `full1=full` (no copy) and the edits made
# since then were in place, so in a top-to-bottom run this appears to change
# nothing rather than restore an earlier state — confirm the intent.
full=full1
In [231]:
# One-hot encode the listed predictors. drop_first=True leaves out the first
# level of each variable, so k levels become k-1 indicator columns.
dummy_columns=[
    "Season",
    "Childish diseases",
    "Accident or serious trauma",
    "Surgical intervention",
    "High fevers",
    "Frequency of alcohol",
    "Smoking habit",
    "Number of hours spent sitting",
    "Age",
]
full=pd.get_dummies(full,columns=dummy_columns,drop_first=True)
In [232]:
# Preview the one-hot encoded frame.
full.head(3)
Out[232]:
Converting Strings to numbers
In [145]:
import numpy as np
# full2 is another name for the object held in full1 (no copy is made), so the
# recoding below is visible through both names.
full2=full1
alcohol=full2["Frequency of alcohol"]
# Rows coded 0.2 or 0.4 are recoded to 1.0; every other value is kept.
is_low_code=(alcohol==0.2) | (alcohol==0.4)
full2["Frequency of alcohol"]=np.where(is_low_code,1.0,alcohol)
In [146]:
# Row count per alcohol-frequency code after the recoding.
full2["Frequency of alcohol"].value_counts()
Out[146]:
In [147]:
# Preview `full` (note: the recoding above was applied through full2/full1, not through this name).
full.head(3)
Out[147]:
PCA
In [233]:
from sklearn.decomposition import PCA
# Features: every column after the first. Target: the "Output" column.
# NOTE(review): this assumes "Output" is the first column of `full`; otherwise
# the target would also appear among the features — confirm.
X=full.iloc[:,1:]
y=full["Output"]
# Project the features onto two principal components.
pca=PCA(n_components=2)
X_new=pca.fit_transform(X)
In [234]:
# Class balance of the target used for modelling.
y.value_counts()
Out[234]:
Cross Validation and model building
In [235]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
# Class-weighted logistic regression scored by recall over repeated K-fold splits.
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
# Seeded so the repeated splits, and therefore the scores, are reproducible.
kfold=RepeatedKFold(n_splits=20,n_repeats=5,random_state=123)
# scoring="recall": these are recall values, one per split (20 splits x 5 repeats).
recall_scores=cross_val_score(estimator=log,X=X,y=y,n_jobs=-1,cv=kfold,verbose=True,scoring="recall")
# The original (misleading) name stays bound for any later cell that refers to it.
f1_scores=recall_scores
recall_scores
Out[235]:
In [207]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
# Out-of-fold class probabilities from a class-weighted logistic regression.
log=LogisticRegression(penalty="l2",C=1, class_weight="balanced")
# random_state only takes effect when shuffling is enabled; current scikit-learn
# raises ValueError for KFold(random_state=...) with shuffle left at False.
kfold=KFold(n_splits=3,shuffle=True,random_state=123)
# method="predict_proba" returns one probability column per class for every row.
preds=cross_val_predict(X=X,y=y,estimator=log,cv=kfold,n_jobs=-1,method="predict_proba")
In [208]:
from sklearn.metrics import roc_auc_score
# Take column 1 of each out-of-fold probability row (presumably the probability
# of class 1 — confirm against the estimator's classes_ order).
pos_preds=[proba_row[1] for proba_row in preds]
# Area under the ROC curve of those probabilities against the true labels.
print(roc_auc_score(y,pos_preds))
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
# Out-of-fold class probabilities from a 1000-tree random forest.
# NOTE(review): the estimator is a random forest although the name is `log`;
# the name is kept because the following cell reuses `log` and `kfold`.
# The forest is seeded so repeated runs give the same predictions.
log=RandomForestClassifier(n_estimators=1000,random_state=123)
# random_state only takes effect when shuffling is enabled; current scikit-learn
# raises ValueError for KFold(random_state=...) with shuffle left at False.
kfold=KFold(n_splits=3,shuffle=True,random_state=123)
preds=cross_val_predict(X=X,y=y,estimator=log,cv=kfold,n_jobs=-1,method="predict_proba")
In [215]:
# F1 score per fold, using the `log` estimator and `kfold` splitter already in scope.
f1Scores=cross_val_score(
    estimator=log,
    X=X,
    y=y,
    scoring="f1",
    cv=kfold,
    n_jobs=-1,
)
f1Scores
Out[215]:
Treating class imbalance problem: Upsampling
In [240]:
from sklearn.utils import resample
# Split the rows by class code; the names treat Output==0 as the majority class.
full_majority=full.loc[full.Output==0,:]
full_minority=full.loc[full.Output==1,:]
# Upsample the minority class with replacement until it matches the majority
# size (the original hard-coded 88, presumably that same count — confirm).
full_minority_new = resample(full_minority,n_samples=len(full_majority),replace=True,random_state=123)
# Sanity check: every resampled row should carry the minority code.
full_minority_new.Output.value_counts()
Out[240]:
In [243]:
# Stack the majority rows on top of the upsampled minority rows.
# The result is ordered by class: all majority rows first, then the minority rows.
full_new=pd.concat((full_majority,full_minority_new))
# Number of rows in the combined frame.
full_new.shape[0]
Out[243]:
In [245]:
# Preview the upsampled frame.
full_new.head(3)
Out[245]:
In [250]:
# Features: every column after the first. The original sliced up to
# len(full_new), which is the ROW count and only worked because iloc clips
# out-of-range slice ends; an open-ended slice states the intent directly.
X=full_new.iloc[:,1:]
# Target: the first column.
# NOTE(review): assumes "Output" is the first column of full_new — confirm.
y=full_new.iloc[:,0]
In [262]:
from sklearn.decomposition import PCA
# Project the upsampled feature matrix onto two principal components.
pca = PCA(n_components=2)
X_new=pca.fit_transform(X)
# Show rows 1 through 4 of the projection (row 0 is not included in the slice).
print(X_new[1:5])
In [253]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
# Out-of-fold class predictions from a random forest on the PCA projection.
# The forest is seeded so repeated runs give the same predictions.
clf=RandomForestClassifier(random_state=123)
# Without shuffling, KFold cuts the rows into consecutive blocks. The upsampled
# frame is a concatenation of all majority rows followed by all minority rows,
# so unshuffled folds would each be dominated by a single class.
# NOTE(review): the minority rows were duplicated by resampling BEFORE
# splitting, so copies of the same row can land in both the training and the
# held-out part of a fold; scores from this frame are likely optimistic.
kfold = KFold(n_splits=3,shuffle=True,random_state=123)
preds = cross_val_predict(X=X_new,y=y,cv=kfold,n_jobs=-1,estimator=clf)
preds
Out[253]:
In [258]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Print the confusion matrix first, then the per-class precision/recall/F1 table,
# both comparing the true labels with the out-of-fold predictions.
for report in (confusion_matrix(y,preds),classification_report(y,preds)):
    print(report)
In [261]:
# Imported here so the cell does not depend on an earlier cell having brought
# RandomForestClassifier into scope.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
# Both the forest and the splitter are seeded so the scores are reproducible.
rclf=RandomForestClassifier(n_estimators=1000,random_state=123)
rKfold=RepeatedKFold(n_splits=5,n_repeats=5,random_state=123)
# One F1 score per split (5 splits x 5 repeats) on the PCA projection.
f1Scores = cross_val_score(X=X_new,y=y,cv=rKfold,n_jobs=-1,estimator=rclf,scoring="f1")
print(f1Scores)
print("average f1:",f1Scores.mean())
No comments:
Post a Comment