import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import dataset
dataset = pd.read_csv('titanic_data.csv')
# NOTE(review): display() is IPython/Jupyter-only; this file will NameError if
# run as a plain script — use print() outside a notebook.
display(dataset.head())
display(dataset.shape)

# Data processing
# Drop names as a feature: unique per passenger, no predictive value
dataset = dataset.drop('Name', axis=1)
# Fill missing info with 0 (simple imputation; presumably affects Age etc. — verify)
dataset = dataset.fillna(0)
# One hot encode to convert non-numeric feature values to numerical columns
dataset = pd.get_dummies(dataset)
# Shuffle data (fixed seed so the train/val/test split is reproducible)
dataset = dataset.sample(frac=1, random_state=77)

# Separate into train, validate, test
train = dataset[0:500]
val = dataset[500:700]
test = dataset[700:]

# Create targets and feature matrices for each split
train_t = train['Survived']
train_X = train.drop('Survived', axis=1)
val_t = val['Survived']
val_X = val.drop('Survived', axis=1)
# BUG FIX: these previously sliced `train`, so every "test accuracy" below was
# really training-set accuracy; use the held-out `test` split instead.
test_t = test['Survived']
test_X = test.drop('Survived', axis=1)
# Models for comparison
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# Evaluation metric
from sklearn.metrics import accuracy_score
# Logistic Regression Classifier
# Grid search over regularization strength C and max_iter, selecting the model
# with the best validation accuracy.
acc_best = 0
best_lr = None
val_accs_lr = []  # validation accuracy for every (max_iter, C) combination, in search order
max_iter = np.arange(1000, 11000, 1000)
C_range = np.arange(1, 11)
for i in max_iter:
    for c in C_range:
        model_lr = LogisticRegression(C=c, max_iter=i, solver='liblinear')
        model_lr.fit(train_X, train_t)
        # predict on validation set
        val_y = model_lr.predict(val_X)
        acc = accuracy_score(val_t, val_y)
        val_accs_lr.append(acc)
        if acc > acc_best:
            best_lr = model_lr
            acc_best = acc

# Test set performance of the best validation-selected model.
# (fix: accuracy_score was previously computed twice, the first result discarded)
test_y = best_lr.predict(test_X)
test_acc = accuracy_score(test_t, test_y)
print("Test accuracy = " + str(test_acc))
print(best_lr)
# Decision Tree Classifier
# Tune max_depth and min_samples_split; keep whichever tree scores highest on
# the validation set.
acc_best = 0
best_dt = None
val_accs_dt = []  # validation accuracy per (max_depth, min_samples_split) pair
max_depth = np.arange(1, 11)
min_split = np.arange(2, 22)
for depth in max_depth:
    for split in min_split:
        model_dt = DecisionTreeClassifier(max_depth=depth, min_samples_split=split)
        model_dt.fit(train_X, train_t)
        # score this candidate on the validation set
        acc = accuracy_score(val_t, model_dt.predict(val_X))
        val_accs_dt.append(acc)
        if acc > acc_best:
            acc_best = acc
            best_dt = model_dt

# Test Set performance of the selected tree
test_y = best_dt.predict(test_X)
print("Test accuracy = " + str(accuracy_score(test_t, test_y)))
print(best_dt)
# SGD Classifier
# Sweep max_iter and the regularization multiplier alpha, tracking the best
# validation accuracy.
# NOTE(review): SGDClassifier has no random_state here, so results may vary
# between runs — confirm whether reproducibility matters.
acc_best = 0
best_sgd = None
val_accs_sgd = []  # validation accuracy per (max_iter, alpha) pair, in search order
max_iter = np.arange(1000, 11000, 1000)
alphas = np.arange(0.0001, 0.01, 0.001)
for i in max_iter:
    for a in alphas:
        model_sgd = SGDClassifier(max_iter=i, alpha=a)
        model_sgd.fit(train_X, train_t)
        # score this candidate on the validation set
        acc = accuracy_score(val_t, model_sgd.predict(val_X))
        val_accs_sgd.append(acc)
        if acc > acc_best:
            acc_best = acc
            best_sgd = model_sgd

# Test Set performance of the selected SGD model
test_y = best_sgd.predict(test_X)
print("Test accuracy = " + str(accuracy_score(test_t, test_y)))
print(best_sgd)
# Make plots
def plot_training(iterations, accs, model):
    """Plot and save the validation-accuracy learning curve for one model.

    FIX: the parameters were previously named ``(accs, iters)`` — the reverse
    of how every call site passes them (iteration indices first, accuracies
    second). They are renamed to match the actual positional usage; behavior
    at the existing call sites is unchanged.

    Parameters
    ----------
    iterations : array-like
        X-axis values: the index of each hyper-parameter combination tried.
    accs : array-like
        Y-axis values: validation accuracy recorded at each iteration.
    model : str
        Model name, used in the plot title and the output filename.
    """
    plt.figure()
    plt.xlabel("Iteration")
    plt.ylabel("Validation Accuracy")
    plt.title("Learning Curve of the Validation Accuracy for the " + model + " model")
    plt.plot(iterations, accs, linestyle='solid')
    plt.savefig(model + '.jpg')
    plt.close()  # free the figure so repeated calls don't accumulate open figures

plot_training(np.arange(0, len(val_accs_sgd)), val_accs_sgd, "SGD")
plot_training(np.arange(0, len(val_accs_dt)), val_accs_dt, "Decision Tree")
plot_training(np.arange(0, len(val_accs_lr)), val_accs_lr, "Logistic Regression")