Kaggle Titanic: Survival Prediction

This post is a record of learning ML with Kaggle; the competition page is linked here.

The final model is a Random Forest classifier that reaches 73.68% accuracy.

Intro

Let's take a glance at the notebook, here we go! We need the following packages.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Modelling Algorithms -> I finally select Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Normalizer, scale
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure Visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6

Plus, we have some helper methods.

# Helper Methods from Kaggle
def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) ) ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( df[ x ].value_counts() ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))

and use the Kaggle API to download the data:

kaggle competitions download -c titanic
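
Note: this assumes the Kaggle CLI is installed (pip install kaggle) and authenticated with an API token placed at ~/.kaggle/kaggle.json. The downloaded archive is then unpacked into the data/ folder that the notebook reads from, roughly:

unzip titanic.zip -d data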

Loading the data:

# Loading Data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

full = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.x
titanic = full[:891]

del train, test

print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
# Datasets: full: (1309, 12) titanic: (891, 12)

Take a glance at the head of the data:

titanic.head()

Data Visualization

plot_correlation_map(titanic)
plot_categories(titanic, cat='Embarked', target='Survived')
# Plot survival rate by Sex
plot_categories(titanic, cat='Sex', target='Survived')
# Plot survival rate by Pclass
plot_categories(titanic, cat='Pclass', target='Survived')
# Plot survival rate by SibSp (no. of Siblings / Spouses)
plot_categories(titanic, cat='SibSp', target='Survived')
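
If you prefer numbers to plots, the same survival-rate comparisons can be read off directly with a groupby; a quick check (not part of the original notebook):

# Survival rate by sex and by passenger class
print(titanic.groupby('Sex').Survived.mean())     # female ~0.74, male ~0.19
print(titanic.groupby('Pclass').Survived.mean())  # 1st ~0.63, 2nd ~0.47, 3rd ~0.24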

Data Preparation

pandas.get_dummies() one-hot encodes each categorical attribute in the data. Here is a small example of how it works:

# pd.get_dummies() function usage
# one hot encoding of every attributes with the data
df = pd.DataFrame([
    ['tall', 'thin'],
    ['short', 'thin'],
    ['short', 'overweighted']
])

df.columns = ['height', 'weight']
pd.get_dummies(df)
   height_short  height_tall  weight_overweighted  weight_thin
0             0            1                    0            1
1             1            0                    0            1
2             1            0                    1            0

To keep the dataset cleaner, we can encode Sex as 1 or 0.

# Transform sex -> 0 / 1
sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')

Make dummies for the embarkation port:

# Create a new variable for every unique value of Embarked (one-hot encoding)
# Embarked port values: C, Q, S
embarked = pd.get_dummies(full.Embarked, prefix='Embarked')

# Same operation for Pclass
pclass = pd.get_dummies(full.Pclass, prefix='Pclass')

Data Cleaning

Most machine learning algorithms require every variable to have a value before training. The simplest approach is to fill missing values with the variable's mean (here, computed over the combined train and test data in full).

# Create Dataset
imputed = pd.DataFrame()

# Fill missing values of Age and Fare with the avg from full
imputed['Age'] = full.Age.fillna(full.Age.mean())
imputed['Fare'] = full.Fare.fillna(full.Fare.mean())
imputed.head()
    Age     Fare
0  22.0   7.2500
1  38.0  71.2833
2  26.0   7.9250
3  35.0  53.1000
4  35.0   8.0500
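
For reference, SimpleImputer was imported above but never used; the same mean imputation could be written with it. A minimal sketch, equivalent to the fillna calls above:

# Sketch: mean imputation via scikit-learn instead of fillna
imputer = SimpleImputer(strategy='mean')
imputed_alt = pd.DataFrame(
    imputer.fit_transform(full[['Age', 'Fare']]),
    columns=['Age', 'Fare']
)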

Feature Engineering

Titanic passengers carry many different titles, but we can simplify them into a few categories. This takes a small helper and a mapping dictionary.

# Extract title from passenger names
# Titles reflect social status and may predict survival probability
title = pd.DataFrame()
title['Title'] = full['Name'].map(\
    lambda name: name.split(',')[1]\
    .split('.')[0].strip()
)
# Simplify titles dictionary
Title_Dictionary = {
    "Capt":         "Officer",
    "Col":          "Officer",
    "Major":        "Officer",
    "Dr":           "Officer",
    "Rev":          "Officer",
    "Jonkheer":     "Royalty",
    "Don":          "Royalty",
    "Sir":          "Royalty",
    "the Countess": "Royalty",
    "Dona":         "Royalty",
    "Mme":          "Mrs",
    "Mlle":         "Miss",
    "Ms":           "Mrs",
    "Mr":           "Mr",
    "Mrs":          "Mrs",
    "Miss":         "Miss",
    "Master":       "Master",
    "Lady":         "Royalty"
}
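
For example, the first name in the training set, "Braund, Mr. Owen Harris", splits on ',' and then on '.' to give the raw title; a quick check:

name = 'Braund, Mr. Owen Harris'
print(name.split(',')[1].split('.')[0].strip())  # -> Mr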

Apply the mapping and one-hot encode the cleaned titles:

title['Title'] = title.Title.map(Title_Dictionary)
title = pd.get_dummies(title.Title)
title.head()
   Master  Miss  Mr  Mrs  Officer  Royalty
0       0     0   1    0        0        0
1       0     0   0    1        0        0
2       0     1   0    0        0        0
3       0     0   0    1        0        0
4       0     0   1    0        0        0

Now we process Cabin, using 'U' for unknown cabins (missing data).

# Extract cabin category information from cabin no.
cabin = pd.DataFrame()
# U: Unknown
cabin['Cabin'] = full.Cabin.fillna('U')

# mapping cabin value with cabin letter
cabin['Cabin'] = cabin['Cabin'].map(lambda c: c[0])

# one hot encoding
cabin = pd.get_dummies(cabin['Cabin'], prefix='Cabin')
cabin.head()
   Cabin_A  Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U
0        0        0        0        0        0        0        0        0        1
1        0        0        1        0        0        0        0        0        0
2        0        0        0        0        0        0        0        0        1
3        0        0        1        0        0        0        0        0        0
4        0        0        0        0        0        0        0        0        1

The following extracts a ticket prefix (a rough ticket class) from each ticket number.

# Extract ticket class from ticket number

# Ticket clean Algorithms
def cleanTicket( ticket ):
    ticket = ticket.replace( '.' , '' )
    ticket = ticket.replace( '/' , '' )
    ticket = ticket.split()
    ticket = map( lambda t : t.strip() , ticket )
    ticket = list(filter( lambda t : not t.isdigit() , ticket ))
    if len( ticket ) > 0:
        return ticket[0]
    else: 
        return 'XXX'


ticket = pd.DataFrame()
ticket['Ticket'] = full['Ticket'].map(cleanTicket)
ticket = pd.get_dummies(ticket['Ticket'], prefix='Ticket')

ticket.head()
   Ticket_A  Ticket_A4  Ticket_A5  Ticket_AQ3  ...  Ticket_WC  Ticket_WEP  Ticket_XXX
0         0          0          1           0  ...          0           0           0
1         0          0          0           0  ...          0           0           0
2         0          0          0           0  ...          0           0           0
3         0          0          0           0  ...          0           0           1
4         0          0          0           0  ...          0           0           1
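
A few examples of what cleanTicket produces for real ticket numbers from the dataset:

print(cleanTicket('A/5 21171'))         # -> A5
print(cleanTicket('STON/O2. 3101282'))  # -> STONO2
print(cleanTicket('113803'))            # -> XXX (purely numeric ticket)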

We introduce a new attribute, family size, derived from the parent/children (Parch) and sibling/spouse (SibSp) attributes.

# Create family size and category for family size
family = pd.DataFrame()

family['FamilySize'] = full['Parch'] + full['SibSp'] + 1

family['Family_Single'] = family['FamilySize'].map(lambda s: 1 if s == 1 else 0)
family['Family_Small'] = family['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
family['Family_Large'] = family['FamilySize'].map(lambda s: 1 if 5 <= s else 0)

family.head()
   FamilySize  Family_Single  Family_Small  Family_Large
0           2              0             1             0
1           2              0             1             0
2           1              1             0             0
3           2              0             1             0
4           1              1             0             0

Data Modelling

With the cleaned data, the candidate feature blocks are:

  • imputed
  • embarked
  • pclass
  • sex
  • family
  • cabin
  • ticket

For the final feature matrix, I combine imputed, embarked, cabin, and sex (see the sketch after the table below for how to include the rest):

full_X = pd.concat([imputed, embarked, cabin, sex], axis=1)
full_X.head()
    Age     Fare  Embarked_C  Embarked_Q  Embarked_S  Cabin_A  Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  Sex
0  22.0   7.2500           0           0           1        0        0        0        0        0        0        0        0        1    1
1  38.0  71.2833           1           0           0        0        0        1        0        0        0        0        0        0    0
2  26.0   7.9250           0           0           1        0        0        0        0        0        0        0        0        1    0
3  35.0  53.1000           0           0           1        0        0        1        0        0        0        0        0        0    0
4  35.0   8.0500           0           0           1        0        0        0        0        0        0        0        0        1    1
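
If you want to experiment with the other engineered blocks from the list above, the same concat pattern applies; a sketch (this is not what the model below was trained on):

# Sketch: combine every engineered feature block
full_X_all = pd.concat(
    [imputed, embarked, pclass, sex, family, cabin, ticket],
    axis=1
)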

That's it. Now we can set up the train/validation and test datasets.

train_valid_X = full_X[0:891]
train_valid_y = titanic.Survived
test_X = full_X[891:]

train_X, valid_X, train_y, valid_y = train_test_split(
    train_valid_X, 
    train_valid_y, 
    train_size=.7
)

print(
    full_X.shape, 
    train_X.shape, 
    valid_X.shape, 
    train_y.shape, 
    valid_y.shape, 
    test_X.shape
)

# (1309, 15) (623, 15) (268, 15) (623,) (268,) (418, 15)

plot_variable_importance(train_X, train_y)

I also tried several other classifiers (listed below for reference); in the end, the random forest worked best for this classification.

# Random Forest Classifier
selected_model = RandomForestClassifier(n_estimators=100)

# Support Vector Machines
model = SVC()

# Gradient Boosting Classifier
model = GradientBoostingClassifier()

# K-nearest neighbors
model = KNeighborsClassifier(n_neighbors=3)

# Gaussian Naive Bayes
model = GaussianNB()

# Logistic Regression
model = LogisticRegression()
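
To compare these candidates on an equal footing, one option is cross-validation; a minimal sketch (cross_val_score is assumed here, it is not in the import list at the top):

from sklearn.model_selection import cross_val_score

for clf in [RandomForestClassifier(n_estimators=100), SVC(),
            GradientBoostingClassifier(), KNeighborsClassifier(n_neighbors=3),
            GaussianNB(), LogisticRegression()]:
    scores = cross_val_score(clf, train_valid_X, train_valid_y,
                             cv=StratifiedKFold(n_splits=5))
    print(clf.__class__.__name__, round(scores.mean(), 3))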

Then we can fit (train) the model:

selected_model.fit(train_X, train_y)

And take a look at the accuracy

print("Train:", selected_model.score(train_X, train_y),
      "\nValid:", selected_model.score(valid_X, valid_y))

# Train: 0.9341894060995185 
# Valid: 0.7761194029850746
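
The gap between the train and validation scores suggests some overfitting, which is typical for an untuned random forest. The plot_model_var_imp helper defined at the top can be reused to see which features the fitted model relies on:

plot_model_var_imp(selected_model, train_X, train_y)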

Deploy to Kaggle

Deployment in this context means publishing the model's predictions to the Kaggle leaderboard. From a Kaggle notebook, do the following:

  1. Select the cell below and run it by pressing the play button.
  2. Press the Publish button in the top right corner.
  3. Select Output on the notebook menu bar.
  4. Select the result dataset and press the Submit to Competition button.

test_Y = selected_model.predict(test_X)
passenger_id = full[891:].PassengerId
test = pd.DataFrame({
    'PassengerId': passenger_id,
    'Survived': test_Y
}, dtype='int32')
print(test.shape)
print(test.head())
test.to_csv('submission.csv', index=False)
# output
(418, 2)
     PassengerId  Survived
891          892         0
892          893         1
893          894         0
894          895         0
895          896         1

Here is the submission file:

!ls

Kaggle-Titanic.ipynb submission.csv
data                 titanic.zip
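
Since the data was downloaded with the Kaggle CLI, the submission can also be made from the command line instead of the notebook UI; for example (the message text is arbitrary):

kaggle competitions submit -c titanic -f submission.csv -m "Random Forest baseline"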

Final thoughts

Kaggle's Titanic survival prediction is a straightforward competition, and it is a good way to get your hands dirty and exercise your ML skills.