Monday, July 9, 2018

Logistic Regression On Titanic Using Python (Including PCA and Cross Validation)

In [270]:
import pandas as pd
full=pd.read_csv("C:/Users/***/DataSets/titanic.csv")
X=full.drop(columns="survived")   # every column except the target
y=full["survived"]
X.head()
Out[270]:
pclass name sex age sibsp parch ticket fare cabin embarked boat body home_dest
0 1 Allen Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis MO
1 1 Allison Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal PQ / Chesterville ON
2 1 Allison Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal PQ / Chesterville ON
3 1 Allison Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal PQ / Chesterville ON
4 1 Allison Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal PQ / Chesterville ON

Preprocessing Stage

Imputation
In [271]:
full.isna().sum()
Out[271]:
pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home_dest     564
dtype: int64
In [272]:
full.age.fillna(full.age.mean(),inplace=True)
In [273]:
full.fare.fillna(full.fare.mean(),inplace=True)
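The mean is simple but sensitive to skew, and fare has a long right tail (it gets a square-root transform later for exactly that reason). A minimal median-based alternative sketch (it would replace the two fillna calls above):

# Median is less affected by skew and outliers than the mean.
full.age.fillna(full.age.median(),inplace=True)
full.fare.fillna(full.fare.median(),inplace=True)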
In [274]:
full["HasCabin"]="F"
for i in range(len(full.cabin)):
    if pd.isna(full.loc[i,"cabin"]):
        full.loc[i,"HasCabin"]=0
    else:
        full.loc[i,"HasCabin"]=1
In [275]:
full=full.drop(columns="cabin")
In [276]:
full["embarked"].fillna(full.embarked.mode()[0],inplace=True)
In [277]:
full=full.drop(columns=["boat","body","home_dest"])
In [278]:
full=full.drop(columns=["ticket"])
Binning
In [279]:
# 10-year age buckets, from 0 up to the maximum observed age.
mybins=range(0,full.age.astype("int").max()+1,10)
full["Age_Bucket"]=pd.cut(full.age,bins=mybins)
In [280]:
full.drop(columns="age",inplace=True)
Convert Strings to Numbers
In [281]:
full=pd.get_dummies(data=full,columns=["pclass"],drop_first=True)
In [282]:
full=pd.get_dummies(data=full,columns=["HasCabin"],drop_first=True)
In [283]:
# Collapse rare large values: 3 or more siblings/spouses become one "3+" level.
# .loc avoids the chained-assignment SettingWithCopyWarning a row-by-row loop raises.
full.loc[full.sibsp>2,"sibsp"]="3+"
In [284]:
# Same collapsing for parent/child counts.
full.loc[full.parch>2,"parch"]="3+"
In [285]:
print(full.parch.value_counts())
print(full.sibsp.value_counts())
0     1002
1      170
2      113
3+      24
Name: parch, dtype: int64
0     891
1     319
3+     57
2      42
Name: sibsp, dtype: int64
In [286]:
full=pd.get_dummies(data=full,columns=["sibsp","parch"],drop_first=True)
In [287]:
full=pd.get_dummies(data=full,columns=["sex","embarked"],drop_first=True)
In [288]:
# Encode the buckets as integers (codes follow order of appearance, not bin order).
full.Age_Bucket=pd.factorize(full.Age_Bucket)[0]
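Because pd.factorize numbers categories by first appearance, the resulting codes do not follow age order. If ordinal codes are preferred, a sketch of an alternative (it would replace the factorize line above):

# cat.codes keeps the natural (0,10] < (10,20] < ... ordering of the bins.
full.Age_Bucket=full.Age_Bucket.cat.codes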
In [289]:
full.drop(columns="name",inplace=True)
In [310]:
full.head()
Out[310]:
survived fare Age_Bucket pclass_2 pclass_3 HasCabin_1 sibsp_1 sibsp_2 sibsp_3+ parch_1 parch_2 parch_3+ sex_male embarked_Q embarked_S
0 1 14.537452 0 0 0 1 0 0 0 0 0 0 0 0 1
1 1 12.310565 1 0 0 1 1 0 0 0 1 0 1 0 1
2 0 12.310565 1 0 0 1 1 0 0 0 1 0 0 0 1
3 0 12.310565 0 0 0 1 1 0 0 0 1 0 1 0 1
4 0 12.310565 0 0 0 1 1 0 0 0 1 0 0 0 1
Note: the head() above was re-run after the square-root fare transform below (its cell number, [310], is out of sequence), which is why fare already shows values like 14.54 = sqrt(211.34).

Outlier Management
In [308]:
import numpy as np
np.sqrt(full.fare).hist()
Out[308]:
[histogram of the square-rooted fares]
In [ ]:
full.fare=np.sqrt(full.fare)
In [338]:
from matplotlib import pyplot as pt
pt.boxplot(full.fare,showfliers=True)
Out[338]:
[boxplot of fare, with outliers shown above the upper whisker]
In [347]:
# Cap (winsorize) sqrt-fares above 9 to pull in the extreme right tail.
full.loc[full.fare>9,"fare"]=9
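The cap of 9 was read off the boxplot by eye; a data-driven alternative caps at a high quantile instead (a sketch; the 0.99 quantile is an arbitrary choice):

# Winsorize at the 99th percentile rather than a hand-picked threshold.
cap=full.fare.quantile(0.99)
full.loc[full.fare>cap,"fare"]=cap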
In [349]:
from matplotlib import pyplot as pt
pt.boxplot(full.fare,showfliers=True)
Out[349]:
[boxplot of fare after capping at 9]
Scaling
In [350]:
from sklearn.preprocessing import MinMaxScaler
minmax=MinMaxScaler()
# Scale every column to [0,1]; survived is already 0/1 so it is unchanged.
full_scaled=minmax.fit_transform(full)
full_scaled=pd.DataFrame(full_scaled, columns=full.columns, index=full.index)
full_scaled
Out[350]:
survived fare Age_Bucket pclass_2 pclass_3 HasCabin_1 sibsp_1 sibsp_2 sibsp_3+ parch_1 parch_2 parch_3+ sex_male embarked_Q embarked_S
0 1.0 1.000000 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 1.0 1.000000 0.142857 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0
2 0.0 1.000000 0.142857 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
3 0.0 1.000000 0.000000 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0
4 0.0 1.000000 0.000000 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
5 1.0 0.572519 0.285714 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
6 1.0 0.981044 0.428571 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
7 0.0 0.000000 0.571429 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
8 1.0 0.797211 0.714286 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
9 0.0 0.781769 0.857143 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
10 0.0 1.000000 0.285714 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
11 1.0 1.000000 1.000000 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
12 1.0 0.924962 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13 1.0 0.986639 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
14 1.0 0.608581 0.857143 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
15 0.0 0.565740 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
16 0.0 1.000000 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0
17 1.0 1.000000 0.285714 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
18 1.0 0.970501 0.571429 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
19 0.0 0.963800 0.571429 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
20 1.0 0.805492 0.571429 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
21 1.0 0.805492 0.285714 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
22 1.0 0.608581 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
23 1.0 1.000000 0.285714 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
24 1.0 1.000000 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
25 0.0 0.566558 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
26 1.0 1.000000 0.000000 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
27 1.0 1.000000 1.000000 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
28 1.0 1.000000 0.571429 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
29 1.0 0.572519 0.000000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1279 0.0 0.311393 1.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1280 0.0 0.312216 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1281 0.0 0.333333 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1282 0.0 0.315250 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1283 0.0 0.305303 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1284 0.0 0.315250 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1285 0.0 0.342467 0.571429 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1286 1.0 0.298746 0.571429 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1287 0.0 0.309320 0.714286 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1288 0.0 0.283187 1.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1289 0.0 0.283187 0.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1290 1.0 0.293972 0.285714 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1291 0.0 0.327966 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1292 0.0 0.305303 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1293 0.0 0.315250 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1294 0.0 0.445831 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1295 0.0 0.299176 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1296 0.0 0.327024 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1297 0.0 0.299176 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1298 0.0 0.342467 0.571429 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1299 0.0 0.422430 0.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1300 1.0 0.422430 1.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1301 0.0 0.298660 0.285714 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1302 0.0 0.298660 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1303 0.0 0.422490 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1304 0.0 0.422430 1.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1305 0.0 0.422430 0.000000 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1306 0.0 0.298660 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1307 0.0 0.298660 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1308 0.0 0.311805 0.000000 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
1309 rows × 15 columns
PCA
In [360]:
from sklearn.decomposition import PCA
X=full_scaled.iloc[:,1:]   # all scaled feature columns
y=full_scaled.iloc[:,0]    # survived
pca=PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced)
[[ 0.82998575  0.22127871]
 [ 0.88750666 -0.05226833]
 [ 1.1548041   0.38539563]
 ...
 [-0.71827988  0.34391668]
 [-0.71827988  0.34391668]
 [-0.73291256 -0.29839093]]
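Two components were kept here without checking how much variance they retain, which is worth verifying before trusting the reduced features. A quick sketch:

# Fraction of total variance captured by each component, and in total.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())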

Cross Validation & Model Building

In [372]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
log=LogisticRegression(penalty="l2",C=1)
kfold=KFold(n_splits=10,random_state=123)  # random_state is inert unless shuffle=True
# Accuracy (not F1) over 10 folds, scored on the full feature set X rather than X_reduced.
accScores=cross_val_score(X=X,y=y,estimator=log,cv=kfold,scoring="accuracy",n_jobs=-1)
accScores
Out[372]:
array([0.79389313, 0.75572519, 0.79389313, 0.8778626 , 0.80916031,
       0.73282443, 0.77099237, 0.70992366, 0.77099237, 0.83846154])
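Two caveats on the run above: it scores X, the full scaled feature set, so the PCA reduction is never actually used; and the scaler was fitted on all 1309 rows before cross-validation, which leaks information from the held-out folds into preprocessing. A leakage-free sketch that refits scaling, PCA, and the classifier inside each fold via a Pipeline (shuffle=True is an added assumption, so the fold assignments and scores will differ from the run above):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,KFold

# Unscaled features and target; all preprocessing now happens inside CV.
X_raw=full.drop(columns="survived")
y_raw=full["survived"]

pipe=Pipeline([
    ("scale",MinMaxScaler()),      # fitted on each training fold only
    ("pca",PCA(n_components=2)),   # the reduction actually feeds the model here
    ("clf",LogisticRegression(penalty="l2",C=1)),
])
kfold=KFold(n_splits=10,shuffle=True,random_state=123)
scores=cross_val_score(pipe,X_raw,y_raw,cv=kfold,scoring="accuracy",n_jobs=-1)
print(scores.mean(),scores.std())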
