In [270]:
import pandas as pd
full = pd.read_csv("C:/Users/***/DataSets/titanic.csv")
X = full.iloc[:, [0] + list(range(2, 14))]  # every column except "survived"
y = full.iloc[:, 1]                         # the "survived" target
X.head()
Out[270]:
Preprocessing stage
Imputation
In [271]:
full.isna().sum()  # missing-value count per column
Out[271]:
In [272]:
full["age"] = full["age"].fillna(full["age"].mean())  # mean-impute missing ages
In [273]:
full["fare"] = full["fare"].fillna(full["fare"].mean())  # mean-impute missing fares
In [274]:
full["HasCabin"]="F"
for i in range(len(full.cabin)):
if pd.isna(full.loc[i,"cabin"]):
full.loc[i,"HasCabin"]=0
else:
full.loc[i,"HasCabin"]=1
In [275]:
full=full.drop(columns="cabin")
In [276]:
full["embarked"].fillna(full.embarked.mode()[0],inplace=True)
In [277]:
full=full.drop(columns=["boat","body","home_dest"])
In [278]:
full=full.drop(columns=["ticket"])
Binning
In [279]:
# Decade-wide age bins; extend the top edge past the maximum age so no row falls outside the bins
mybins = range(0, int(full.age.max()) + 11, 10)
full["Age_Bucket"] = pd.cut(full.age, bins=mybins)
In [280]:
full.drop(columns="age",inplace=True)
Convert strings to numbers
In [281]:
full=pd.get_dummies(data=full,columns=["pclass"],drop_first=True)
In [282]:
full=pd.get_dummies(data=full,columns=["HasCabin"],drop_first=True)
In [283]:
# Cap sibling/spouse counts: anything above 2 becomes the single level "3+"
full["sibsp"] = full["sibsp"].astype(object)
full.loc[full["sibsp"] > 2, "sibsp"] = "3+"
In [284]:
# Same capping for parent/child counts
full["parch"] = full["parch"].astype(object)
full.loc[full["parch"] > 2, "parch"] = "3+"
In [285]:
print(full.parch.value_counts())
print(full.sibsp.value_counts())
In [286]:
full=pd.get_dummies(data=full,columns=["sibsp","parch"],drop_first=True)
In [287]:
full=pd.get_dummies(data=full,columns=["sex","embarked"],drop_first=True)
In [288]:
full.Age_Bucket = pd.factorize(full.Age_Bucket)[0]  # integer codes, assigned in order of first appearance
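Note that factorize numbers the buckets in the order they first appear, not in age order. Because pd.cut returns an ordered categorical, its built-in codes preserve the bin order; a one-line alternative to the factorize call:
full["Age_Bucket"] = full["Age_Bucket"].cat.codes  # 0 for the lowest bucket, 1 for the next, ...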
In [289]:
full.drop(columns="name",inplace=True)
In [310]:
full.head()
Out[310]:
Outlier Management
In [308]:
import numpy as np
np.sqrt(full.fare).hist()  # preview: the square root compresses fare's long right tail
Out[308]:
In [ ]:
full.fare = np.sqrt(full.fare)  # apply the square-root transform for real
In [338]:
from matplotlib import pyplot as pt
pt.boxplot(full.fare,showfliers=True)
Out[338]:
In [347]:
full.loc[full.fare > 9, "fare"] = 9  # cap extreme fares (on the square-root scale) at 9, per the boxplot
In [349]:
from matplotlib import pyplot as pt
pt.boxplot(full.fare,showfliers=True)
Out[349]:
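The cap of 9 is read off the boxplot by eye; a quantile-based cap is a less manual alternative (the 99th percentile here is an arbitrary choice):
cap = full.fare.quantile(0.99)
full.loc[full.fare > cap, "fare"] = cap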
Scaling
In [350]:
from sklearn.preprocessing import MinMaxScaler
minmax=MinMaxScaler()
full_scaled = minmax.fit_transform(full)
full_scaled=pd.DataFrame(full_scaled, columns=full.columns, index=full.index)
full_scaled
Out[350]:
In [352]:
full_scaled.head()
Out[352]:
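MinMaxScaler maps each column to [0, 1]; when features should instead be centered with unit variance (which PCA often prefers), StandardScaler is the drop-in alternative:
from sklearn.preprocessing import StandardScaler
full_std = pd.DataFrame(StandardScaler().fit_transform(full),
                        columns=full.columns, index=full.index)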
PCA
In [360]:
from sklearn.decomposition import PCA
X = full_scaled.iloc[:, 1:]  # all columns except the target in column 0
y = full_scaled.iloc[:, 0]   # the scaled "survived" column (still 0/1 after MinMax)
pca=PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced)
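Two components are an assumption; checking how much variance they keep tells you whether that is enough:
print(pca.explained_variance_ratio_)        # variance captured per component
print(pca.explained_variance_ratio_.sum())  # total retained by the 2 components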
Cross Validation & Model Building
In [372]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
log = LogisticRegression(penalty="l2", C=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=123)  # random_state requires shuffle=True
accScores = cross_val_score(estimator=log, X=X, y=y, cv=kfold, scoring="accuracy", n_jobs=-1)
accScores
Out[372]:
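Ten per-fold accuracies are easier to compare as a single summary; mean and spread:
print(f"accuracy: {accScores.mean():.3f} +/- {accScores.std():.3f}")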