# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print (not just build) each path so the available input files show in the log;
        # the original bare expression discarded the result.
        print(os.path.join(dirname, filename))
  1. Selecting relevant features
  2. Handling missing values
  3. Data type conversion
  4. Data normalization (if necessary)
# Load the Titanic train/test splits shipped with the Kaggle competition input.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Peek at the first rows of the training split.
train_df.head()
(image: rendered preview of the first rows of the training data — omitted)

Removing Irrelevant Attributes

Context Information Provided

# Removing attributes judged irrelevant to survival before further processing.
irrelv_attrs = ['PassengerId', 'Name', 'Ticket', 'Fare']

# Keep every training column that is not on the drop list.
relv_attrs = [col for col in train_df.columns.values if col not in irrelv_attrs]
train_df_tmp = train_df[relv_attrs]

# Same filtering for the test set (it has no Survived column, so the kept
# columns are recomputed for this frame).
relv_attrs = [col for col in test_df.columns.values if col not in irrelv_attrs]
test_df_tmp = test_df[relv_attrs]

Handling Missing Values

# Which attributes contain missing values across the combined train+test data?
# (.any() column-wise replaces the original .apply(lambda x: x.any(), axis=0)
# — identical result, one call.)
# Survived shows True only because the test split carries no Survived labels.
pd.concat([train_df_tmp, test_df_tmp]).isnull().any()
# Output:
# Survived     True
# Pclass      False
# Sex         False
# Age          True
# SibSp       False
# Parch       False
# Cabin        True
# Embarked     True
# dtype: bool

Age

# Average age per (Pclass, Survived, Sex) group — used to judge how Age varies
# before choosing imputation values for the missing ages.
grouped_train_df = train_df[['Survived', 'Sex', 'Age', 'Pclass']] \
    .groupby(['Pclass', 'Survived', 'Sex'])['Age'].mean()

# Double unstack moves (Survived, Sex) into columns so Pclass stays on the x-axis.
grouped_train_df.unstack().unstack().plot(kind='bar', figsize=(15, 8),
                                          title="Fig 1: Age vs Pclass based upon gender & survived",
                                          xlabel="Passenger class",
                                          ylabel="Average age",
                                          colormap='nipy_spectral')
# Output (figure omitted):
# <AxesSubplot:title={'center':'Fig 1: Age vs Pclass based upon gender & survived'},
#  xlabel='Passenger class', ylabel='Average age'>
# Filling out missing values for Age: impute with the mean age of the
# passenger's (Pclass, Sex) group, learned from the training split only.
calc_NA_vals = (train_df_tmp[['Sex', 'Age', 'Pclass']]
                .groupby(['Pclass', 'Sex'])['Age'].mean()
                .reset_index()
                .rename(columns={"Age": "Missing_Age"}))
print("Values used to replace missing age")
print(calc_NA_vals)

# Broadcast each row's group mean onto it via a left merge.
train_df_tmp = train_df_tmp.merge(calc_NA_vals, how='left', on=['Sex', 'Pclass'])
test_df_tmp = test_df_tmp.merge(calc_NA_vals, how='left', on=['Sex', 'Pclass'])

# Assignment instead of chained fillna(..., inplace=True): the chained inplace
# form is deprecated in pandas 2.x and will stop working under copy-on-write.
train_df_tmp['Age'] = train_df_tmp['Age'].fillna(train_df_tmp['Missing_Age'])
test_df_tmp['Age'] = test_df_tmp['Age'].fillna(test_df_tmp['Missing_Age'])

# Drop the helper column now that Age is fully imputed; the original left it
# dangling in both frames.
train_df_tmp = train_df_tmp.drop(columns=['Missing_Age'])
test_df_tmp = test_df_tmp.drop(columns=['Missing_Age'])
# Output:
# Values used to replace missing age
#    Pclass     Sex  Missing_Age
# 0       1  female    34.611765
# 1       1    male    41.281386
# 2       2  female    28.722973
# 3       2    male    30.740707
# 4       3  female    21.750000
# 5       3    male    26.507589

Embarked

# Passenger count per embarkation port — 'S' dominates, which motivates using
# it as the fill value for the missing Embarked entries below.
grouped_emb_train_df = train_df[['PassengerId', 'Embarked']].groupby(['Embarked'])['PassengerId'].count()

grouped_emb_train_df.plot(kind='bar', figsize=(15, 8),
                          title="Fig 2: Count vs Embarked",
                          xlabel="Embarked location",
                          ylabel="Total count",
                          colormap='seismic')
# Output (figure omitted):
# <AxesSubplot:title={'center':'Fig 2: Count vs Embarked'},
#  xlabel='Embarked location', ylabel='Total count'>
# Filling the NA values for Embarked with 'S', the most frequent port (Fig 2).
# Assignment instead of chained fillna(..., inplace=True): the chained inplace
# form is deprecated in pandas 2.x.
train_df_tmp['Embarked'] = train_df_tmp['Embarked'].fillna('S')
test_df_tmp['Embarked'] = test_df_tmp['Embarked'].fillna('S')

Cabin

# Fill all NA by 'Others' and count values — most cabins are unrecorded.
train_df['Cabin'].fillna('Others').value_counts()
# Output:
# Others         687
# B96 B98          4
# C23 C25 C27      4
# G6               4
# F33              3
#                ...
# E77              1
# D9               1
# B94              1
# A7               1
# C70              1
# Name: Cabin, Length: 148, dtype: int64
# Plan for Cabin — derive two features:
#   Cabin_fl  (cabin type)  : first letter of the cabin, i.e. the deck
#   Cabin_cnt (cabin count) : number of cabins listed on the ticket
# These are the unique first letters for the data points that have a cabin;
# a letter outside this set (Z) is used to fill the null values.
(pd.concat([train_df, test_df]))[~(pd.concat([train_df, test_df]))["Cabin"].isnull()]['Cabin'].str[0].unique()
# Output:
# array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)


# Transforming Cabin into the two attributes; one helper replaces the original
# copy-pasted train/test code paths.
def _add_cabin_features(df):
    """Return df with Cabin_fl (deck letter, 'Z' when missing) and
    Cabin_cnt (number of cabins, 0 when missing) appended."""
    # .copy() avoids SettingWithCopyWarning when adding columns to the slice
    known = df[~df["Cabin"].isnull()][["Cabin"]].copy()
    known["Cabin_fl"] = known["Cabin"].str[0]  # first letter = deck
    known["Cabin_cnt"] = known["Cabin"].str.strip().str.split(' ').apply(len)
    # Join back on the index; rows without a cabin get NaN in both new columns.
    # (Suffixes kept from the original code, though the names never collide.)
    df = df.join(known[["Cabin_fl", "Cabin_cnt"]], lsuffix='_left', rsuffix='_right')
    # Assignment instead of chained inplace fillna (deprecated in pandas 2.x).
    df['Cabin_cnt'] = df['Cabin_cnt'].fillna(0)
    df['Cabin_fl'] = df['Cabin_fl'].fillna('Z')
    return df


train_df_tmp = _add_cabin_features(train_df_tmp)
test_df_tmp = _add_cabin_features(test_df_tmp)

# Preview the engineered training frame after imputation and Cabin features.
train_df_tmp.head()
# (rendered table omitted)

Data Type Conversion

# Sex categorical to numerical, plus one-hot encoding of Pclass, Embarked and
# Cabin_fl; one helper replaces the original copy-pasted train/test code paths.
def _encode_categoricals(df):
    """Return df with Sex mapped to 0/1 and dummy columns appended for
    Pclass, Embarked and Cabin_fl."""
    df = df.copy()
    # map instead of replace(..., inplace=True): the chained inplace form and
    # replace's silent downcasting are deprecated in pandas 2.x. Sex holds
    # only 'male'/'female' at this point, so map introduces no NaNs.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    for col in ('Pclass', 'Embarked', 'Cabin_fl'):
        df = df.join(pd.get_dummies(df[col], prefix=col))
    return df


train_df_tmp = _encode_categoricals(train_df_tmp)

# For test dataframe
test_df_tmp = _encode_categoricals(test_df_tmp)

Context Information Not Provided

Embedded Method

from sklearn.linear_model import LassoCV

# NOTE(review): LassoCV is a regressor; here it is used only as an embedded
# feature-selection method on the 0/1 Survived target, not as the final model.
model = LassoCV(random_state=0)

# Only numerical attributes are selected to fit into the model
cols_req = ['Sex', 'Age', 'SibSp', 'Parch', 'Cabin_cnt', 'Pclass_1',
            'Pclass_2', 'Pclass_3', 'Cabin_fl_A', 'Cabin_fl_B', 'Cabin_fl_C',
            'Cabin_fl_D', 'Cabin_fl_E', 'Cabin_fl_F', 'Cabin_fl_G',
            'Cabin_fl_T', 'Cabin_fl_Z', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

model.fit(train_df_tmp[cols_req], train_df_tmp.Survived)
# Output: LassoCV(random_state=0)

# Pair each fitted coefficient with its feature name and rank them.
coef = pd.Series(model.coef_, index=train_df_tmp[cols_req].columns)
imp_coef = coef.sort_values()

# filtering out all the attributes that have 0 coefficient
imp_coef[abs(imp_coef) > 0].plot(kind="barh", title="Fig 3: ", figsize=(15, 8))
# Output (figure omitted):
# <AxesSubplot:title={'center':'Fig 3: '}>

Correlation Method

# Correlation heatmap over the candidate features, including the target.
cols_req = ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin_cnt', 'Pclass_1',
            'Pclass_2', 'Pclass_3', 'Cabin_fl_A', 'Cabin_fl_B', 'Cabin_fl_C',
            'Cabin_fl_D', 'Cabin_fl_E', 'Cabin_fl_F', 'Cabin_fl_G',
            'Cabin_fl_T', 'Cabin_fl_Z', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

cor = train_df_tmp[cols_req].corr()

fig, ax = plt.subplots(figsize=(18, 12))
im = ax.imshow(cor, cmap='Wistia')

ax.set_title("Fig 4: Correlation among the required variables")

# One tick per feature on both axes, labelled with the feature names.
ax.set_xticks(np.arange(len(cols_req)))
ax.set_yticks(np.arange(len(cols_req)))

ax.set_xticklabels(cols_req)
ax.set_yticklabels(cols_req)

plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Annotate every cell with its rounded correlation value.
for i in range(len(cols_req)):
    for j in range(len(cols_req)):
        ax.text(j, i, round(cor.iloc[i, j], 2),
                ha="center", va="center", color="w")

fig.tight_layout()
plt.show()
# (figure omitted)
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement on newer versions.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# All used features
features = ['Sex', 'Age', 'Parch', 'SibSp', 'Pclass_1', 'Pclass_3',
            'Embarked_C', 'Embarked_Q', 'Embarked_S']
# Populate the test set with any feature column it lacks (a dummy level absent
# from the test split would otherwise make its columns differ from train's).
for x in features:
    if x not in test_df_tmp.columns.values:
        test_df_tmp[x] = 0

X_train, y_train = train_df_tmp[features].to_numpy(), train_df_tmp["Survived"]

# Standardize before the RBF SVM — SVC is sensitive to feature magnitudes.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
# Output:
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('svc', SVC(gamma='auto'))])

Model Validity

Cross-Validation Scores

# 10-fold cross-validation accuracy on the full training set.
scores = cross_val_score(clf, X_train, y_train, cv=10)
scores
# Output:
# array([0.83333333, 0.83146067, 0.7752809 , 0.87640449, 0.86516854,
#        0.82022472, 0.83146067, 0.76404494, 0.86516854, 0.86516854])

Confusion Matrix & Metrics

# Hold out a validation split. NOTE(review): X_train/y_train are rebound to
# the smaller split here, shadowing the full arrays used for CV above.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=0)
clf.fit(X_train, y_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display API is
# the supported equivalent (available since 1.0).
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
# Output (figure omitted):
# <sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f6f9bba7370>

# Report the standard binary-classification metrics on the held-out split.
func_vars = [('precision_score', precision_score), ('recall_score', recall_score),
             ('f1_score', f1_score), ('accuracy_score', accuracy_score)]

y_pred = clf.predict(X_test)

for lab, fn in func_vars:
    print(f"The {lab} score for classifier is {round(fn(y_test,y_pred),4)}")
# Output:
# The precision_score score for classifier is 0.8143
# The recall_score score for classifier is 0.6786
# The f1_score score for classifier is 0.7403
# The accuracy_score score for classifier is 0.8206

Conclusion

--

--

--

Just a data science enthusiast.

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

How Programming Pathshala got started?

Beginner’s Guide to Setting up NGINX on Ubuntu 18.04

Server image, NGINX, Ubuntu

What it’s like to be a Software Engineer

Interacting with webcam and video file and some drawing with OPEN CV — PART-II

ALC ’17: The beginning of my EPIC journey

CS373 Spring 2021: Week 2

Debugging a native deadlock in a .NET Linux application

Aligning business and IT: 3 tips for innovative software development

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Dudhraj Sandeep

Dudhraj Sandeep

Just a data science enthusiast.

More from Medium

Used Car Price prediction & analysis

Unsupervised Machine Learning: What, Why and How..? Example, Using Python

Predicting HDB Resale Prices in Singapore during COVID-19