In [61]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# import dataset
dataset = pd.read_csv('titanic_data.csv')
In [3]:
display(dataset.head())
display(dataset.shape)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
(891, 12)
In [4]:
# Data processing
# Drop names as a feature
dataset = dataset.drop('Name', axis=1)
# Fill missing info
dataset = dataset.fillna(0)
# One hot encode to convert feature values to numerical values
dataset = pd.get_dummies(dataset)
# Shuffle data
dataset = dataset.sample(frac=1, random_state=77)
# Separate into train, validate, test
train = dataset[0:500]
val = dataset[500:700]
test = dataset[700:]
In [5]:
# Create targets
train_t = train['Survived']
train_X = train.drop('Survived', axis=1)
val_t = val['Survived']
val_X = val.drop('Survived', axis=1)
test_t = train['Survived']
test_X = train.drop('Survived', axis=1)
In [6]:
# Models for comparison
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# Evaluation metric
from sklearn.metrics import accuracy_score
In [7]:
# Logistic Regression Classifier
acc_best = 0
best_lr = None
val_accs_lr = []
max_iter = np.arange(1000, 11000, 1000)
C_range = np.arange(1, 11)
for i in max_iter:
    for c in C_range:
        model_lr = LogisticRegression(C=c, max_iter=i, solver = 'liblinear')
        model_lr.fit(train_X, train_t)
        
        # predict on validation set
        val_y = model_lr.predict(val_X)
        acc = accuracy_score(val_t,val_y)
        val_accs_lr.append(acc)
        if(acc>acc_best):
            best_lr = model_lr
            acc_best = acc
            
# Test set performance
test_y = best_lr.predict(test_X)
accuracy_score(test_t, test_y)
print("Test accuracy = " + str(accuracy_score(test_t, test_y)))
print(best_lr)
Test accuracy = 0.988
LogisticRegression(C=9, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
In [8]:
# Decision Tree Classifier
acc_best = 0
best_dt = None
val_accs_dt = []
max_depth = np.arange(1, 11)
min_split = np.arange(2, 22)
for depth in max_depth:
    for split in min_split:
        model_dt = DecisionTreeClassifier(max_depth=depth, min_samples_split=split)
        model_dt.fit(train_X, train_t)
        
        # predict on validation set
        val_y = model_dt.predict(val_X)
        acc = accuracy_score(val_t, val_y)
        val_accs_dt.append(acc)
        if(acc>acc_best):
            best_dt = model_dt
            acc_best = acc

# Test Set performance
test_y = best_dt.predict(test_X)
print("Test accuracy = " + str(accuracy_score(test_t, test_y)))
print(best_dt)
Test accuracy = 0.9
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=18,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
In [81]:
# SGD Classifier
acc_best = 0
best_sgd = None
val_accs_sgd = []
max_iter = np.arange(1000, 11000, 1000)
alphas = np.arange(0.0001, 0.01, 0.001)

for i in max_iter:
    for a in alphas:
        model_sgd = SGDClassifier(max_iter=i,alpha=a)
        model_sgd.fit(train_X, train_t)
        
        # predict on validation set
        val_y = model_sgd.predict(val_X)
        acc = accuracy_score(val_t, val_y)
        val_accs_sgd.append(acc)
        if(acc>acc_best):
            best_sgd = model_sgd
            acc_best = acc
            
# Test Set performance
test_y = best_sgd.predict(test_X)
print("Test accuracy = " + str(accuracy_score(test_t, test_y)))
print(best_sgd)
Test accuracy = 0.72
SGDClassifier(alpha=0.0011, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=6000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
In [62]:
# Make plots

def plot_training(accs, iters, model):
    plt.figure()
    plt.xlabel("Iteration")
    plt.ylabel("Validation Accuracy")
    plt.title("Learning Curve of the Validation Accuracy for the " + model + " model")
    plt.plot(accs, iters, linestyle='solid')
    plt.savefig(model + '.jpg')
In [66]:
plot_training(np.arange(0, len(val_accs_sgd)),val_accs_sgd, "SGD")
In [67]:
plot_training(np.arange(0, len(val_accs_dt)), val_accs_dt, "Decision Tree")
In [68]:
plot_training(np.arange(0, len(val_accs_lr)), val_accs_lr, "Logistic Regression")
In [ ]: