Titanic: Machine Learning from Disaster

This article walks through a machine-learning exercise on Titanic survival prediction, covering data preprocessing, feature engineering and model selection, and ultimately reaching a fairly high prediction accuracy with a random forest.


Titanic: Machine Learning from Disaster is an entry-level machine-learning competition hosted by Kaggle. The dataset is small, which makes it ideal for quickly testing ideas, so let's take a machine-learning voyage on the Titanic. Let's go!

  • Data preprocessing and visualization functions
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

class data_preprocessing():
    def __init__(self):
        pass

    # Min-max normalization into [0, 1]
    def data_norm(self, df, features):
        for f in features:
            max_value = np.max(df[f])
            min_value = np.min(df[f])
            df[f] = (df[f] - min_value) / (max_value - min_value)
        return df

    # Transform features (log by default) so they look more normally distributed
    def data_convert(self, df, features, style = 'log'):
        if style == 'log':
            for f in features:
                df[f] = np.log(df[f])
        return df

    # Drop rows whose value lies more than alpha standard deviations from the mean
    def remove_abnormal_value(self, df, features, alpha):
        for f in features:
            value_mean = np.mean(df[f])
            value_std = np.std(df[f])
            df = df[df[f] <= value_mean + alpha * value_std]
            df = df[df[f] >= value_mean - alpha * value_std]
        return df

    # Fill missing values (mean, mode, or the string '0')
    def data_fill(self, df, features, style = 'mean'):
        if style == 'mean':
            for f in features:
                df[f] = df[f].fillna(df[f][df[f].notnull()].mean())
        if style == 'mode':
            for f in features:
                df[f] = df[f].fillna(df[f][df[f].notnull()].mode()[0])
        if style == '0':
            for f in features:
                df[f] = df[f].fillna('0')
        return df

    def data_one_hot(self, df, features):
        for f in features:
            dummies = pd.get_dummies(df[f], prefix = f)
            df = df.join(dummies)
            df = df.drop([f], axis = 1)
        return df

    def data_replace(self, df, features):
        for f in features:
            value = list(set(df[f]))
            value_range = range(len(value))
            df[f] = df[f].replace(value, value_range)
        return df

    def data_replace_random(self, df, features):
        for f in features:
            value = list(set(df[f]))
            np.random.shuffle(value)
            value_range = range(len(value))
            df[f] = df[f].replace(value, value_range)
        return df

    def data_replace_sort_by_mean(self, train_df, df, features, label):
        for f in features:
            value = list(set(df[f]))
            value_of_train = list(set(train_df[f]))
            value_sort = []
            for i in range(len(value)):
                if value[i] in value_of_train:
                    value_sort.append(np.mean(train_df[train_df[f] == value[i]][label]))
                else:
                    value_sort.append(0)
            value = [value[i] for i in np.argsort(-np.array(value_sort))]
            value_range = range(len(value))
            df[f] = df[f].replace(value, value_range)
        return df
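A quick sketch of how these preprocessing helpers fit together, on a tiny made-up frame (the toy data here is purely illustrative, not the Titanic set):

# Hypothetical demo of the helpers above
import pandas as pd
from data_preprocessing import data_preprocessing

dp = data_preprocessing()
toy = pd.DataFrame({'Sex': ['male', 'female', 'male', None],
                    'Fare': [7.25, 71.3, None, 8.05],
                    'Survived': [0, 1, 0, 1]})

toy = dp.data_fill(toy, ['Fare'], style = 'mean')   # fill NaN Fare with the mean
toy = dp.data_fill(toy, ['Sex'], style = 'mode')    # fill NaN Sex with the mode
toy = dp.data_norm(toy, ['Fare'])                   # scale Fare into [0, 1]
# map Sex to integers ordered by mean Survived (highest survival rate -> 0)
toy = dp.data_replace_sort_by_mean(toy, toy, ['Sex'], 'Survived')
print(toy)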
# -*- coding: utf-8 -*-

import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt

class data_visualization:
    def __init__(self):
        pass

    def plt_mean(self, df, features):
        average = df[features].groupby([features[0]],as_index = False).mean()
        sns.barplot(x = features[0], y = features[1], data = average)
        plt.show()

    def plt_count(self, df, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.countplot(x=features[i], data=df)
        plt.show()

    def plt_density(self, df, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df[features[i]], kde = True, rug = True)
        plt.show()

    def plt_density_compare(self, df1, df2, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df1[features[i]], kde = True, rug = True)
            sns.distplot(df2[features[i]], kde = True, rug = True)
        plt.show()

    def plt_factor_count(self, df, x, hue, col):
        sns.factorplot(x=x, hue=hue, col=col, data=df, kind='count')
        plt.show()

    def plt_factor_ratio(self,df, x, y, hue):
        sns.factorplot(x=x, y=y, hue=hue, data=df)
        plt.show()

    def plt_crosstab(self, df, features):
        print(pd.crosstab(df[features[0]], df[features[1]]))
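And the plotting helpers are called like this once the data is loaded (a sketch assuming the usual Titanic columns in train_df):

# Assuming train_df already holds the Titanic training data
dv = data_visualization()
dv.plt_mean(train_df, ['Sex', 'Survived'])       # mean survival rate per sex
dv.plt_count(train_df, ['Pclass', 'Embarked'])   # bar charts of value counts
dv.plt_crosstab(train_df, ['Pclass', 'Sex'])     # contingency table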
  • Import the libraries and data (change the paths to match your own machine)
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from math import log
import random
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
sns.set_style('whitegrid')
import sys
sys.path.append('/Users/zjx/python/competition_function')

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from data_preprocessing import data_preprocessing
from data_visualization import data_visualization
path = '/Users/zjx/python/taitan/data/'

train_df = pd.read_csv(path+'new_train.csv')
test_df = pd.read_csv(path+'new_test.csv')

dp = data_preprocessing()
dv = data_visualization()

label = 'Survived'
  • Inspect the data
train_df.info()
test_df.info()

(train_df.info() and test_df.info() screenshots omitted)

The training set has 891 rows and the test set 418. Age and Cabin are missing many values, while Fare and Embarked are missing only a few. To avoid introducing too much noise we leave Age and Cabin unfilled; Fare is imputed with a decision-tree regression, and Embarked with the mode.

  • Filling Fare and Embarked
# df is assumed to be the combined train + test frame, e.g.
# df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
X_fare = df[['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']]
y_fare = df['Fare']
#X_fare = dp.data_one_hot(X_fare,[])
X_fare = dp.data_replace_sort_by_mean(train_df, X_fare,['Sex', 'Embarked'], label)
fare_defined = df.Fare.notnull()

dtr = DecisionTreeRegressor(max_leaf_nodes=5)
dtr.fit(X_fare[fare_defined], y_fare[fare_defined])

df.loc[~fare_defined, 'Fare'] = dtr.predict(X_fare[~fare_defined])

df = dp.data_fill(df, ['Embarked'], style = 'mode')

Plot the regression tree used to fill Fare:

dot_data = tree.export_graphviz(dtr, out_file=None, feature_names=X_fare.columns,
                                filled=True, rounded=True,
                                special_characters=True, leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("dtr_of_fare")

  • Feature exploration
    Differences by sex
dv.plt_mean(train_df, ['Sex', label])


Differences by passenger class

dv.plt_mean(train_df,['Pclass',label])


Pclass crossed with Sex

dv.plt_factor_count(train_df,'Pclass',label,'Sex')

Almost all women in classes 1 and 2 were saved, and almost no men in class 3 were. This is probably the most important pair of features, and it also shows where prediction is hard: men in class 1 and women in class 3.


Since women were far more likely to be saved, children should be special too (because survival in class 3 was uniformly low, we only flag children in classes 1 and 2):

def get_is_child(age):
    age = float(age)
    if age < 14:
        return 1
    else:
        return 0

df['Is_child'] = df['Age'].apply(get_is_child)  # NaN ages compare False, so they map to 0
df.loc[df['Pclass'] > 2, 'Is_child'] = 0
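A quick sanity check of the new flag on the training rows (assuming df combines train and test, so training rows are the ones with a non-null Survived):

# Compare survival of flagged children vs. others within classes 1-2
train_rows = df[df['Survived'].notnull()]
print(train_rows[train_rows['Pclass'] < 3].groupby('Is_child')['Survived'].mean())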

Now consider group effects (strength in numbers):

df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

def get_freq(df, feature):
    freq_col = 'Freq_' + feature
    freq = df[feature].value_counts().to_frame()
    freq.columns = [freq_col]
    df[freq_col] = df.merge(freq, how='left', left_on=feature, right_index=True)[freq_col]
    return df

df = get_freq(df, 'Cabin')
df = get_freq(df, 'Ticket')
df = get_freq(df, 'Fare')
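As an aside, the same frequency features can be computed without the intermediate merge by using groupby/transform; a one-line equivalent for Ticket:

# Equivalent: size of each Ticket group, aligned back onto the rows
df['Freq_Ticket'] = df.groupby('Ticket')['Ticket'].transform('count')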

# max_group caps the Fare-frequency heuristic; it is not defined in the
# original post, so a plausible upper bound on group size is assumed here
max_group = 12

def get_single(df):
    if df['FamilySize'] > 1:
        return 0
    elif df['Freq_Ticket'] > 1:
        return 0
    elif df['Freq_Cabin'] > 1:
        return 0
    elif 1 < df['Freq_Fare'] < max_group:
        return 0
    else:
        return 1

df['Single'] = df.apply(get_single, axis=1)
df['Single'] = df['Single'].astype('int')

Treat very large groups specially (in a big group, members may be too overwhelmed to look after one another):

# Special-case the very large ticket groups (sizes 7, 8 and 11)
def get_freq_ticket(df):
    if df['Freq_Ticket'] == 7 or df['Freq_Ticket'] == 11:
        return 1
    elif df['Freq_Ticket'] == 8:
        return 2
    else:
        return 0

df['Big_Ticket'] = df.apply(get_freq_ticket, axis=1)

If a passenger in the test set belongs to a group with known survivors, reward them (another special case):

def get_reward(survived):
    group_size = survived.shape[0]
    group_nan = survived.isnull().sum()  # members with unknown outcome (test rows)
    group_sum = np.sum(survived)         # known survivors in the group

    if (group_nan > 0 and group_size > 1 and group_sum > 0):
        return 1
    else:
        return 0

rewards = df[['Ticket','Survived']].groupby('Ticket')['Survived'].apply(get_reward).to_frame()
rewards.columns = ['Reward']
df = df.merge(rewards, left_on='Ticket', right_index=True, how='left')

Since we settled on tree models, high-dimensional sparse one-hot encodings would introduce too much noise (the dummy columns are highly correlated), so Sex is the only feature we encode (via the survival-rate ordering) before dropping the unused columns:

df = dp.data_replace_sort_by_mean(train_df,df,['Sex'],label)
df = df.drop(['Name','Ticket','Freq_Ticket','Freq_Cabin','Freq_Fare','Cabin','Age','FamilySize','Parch','SibSp','Fare','Embarked'],axis=1)
  • K-fold cross-validation and random-forest prediction
# (assumed, not shown in the original) split the combined frame back before modelling:
# train_df = df[df['Survived'].notnull()]
# test_df = df[df['Survived'].isnull()]
dtc = DecisionTreeClassifier(min_samples_leaf=10)
dtc.fit(train_df.drop(['Survived', 'PassengerId'], axis=1), train_df['Survived'])

dot_data = tree.export_graphviz(dtc, out_file=None,
                                feature_names=train_df.drop(['Survived', 'PassengerId'], axis=1).columns,
                                class_names=['Died', 'Survived'],
                                filled=True, rounded=True,
                                special_characters=True, leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("my_dtc_of_survived")

train_X = train_df.drop(['Survived','PassengerId'],axis=1).values
train_y = train_df['Survived'].values
test_X = test_df.drop(['Survived','PassengerId'],axis=1).values

num_folds = 7
num_repeats = 5

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)

rf = RandomForestClassifier(random_state=0)

acc_scores = []
f1_scores = []

for i in range(num_repeats):
    for train_idx, test_idx in skf.split(train_X, train_y):
        train_X_cv = train_X[train_idx]
        test_X_cv = train_X[test_idx]

        train_y_cv = train_y[train_idx]
        test_y_cv = train_y[test_idx]


        rf.fit(train_X_cv, train_y_cv)
        y_pred_cv = rf.predict(test_X_cv)

        acc_scores.append(accuracy_score(test_y_cv, y_pred_cv))
        f1_scores.append(f1_score(test_y_cv, y_pred_cv))

acc_scores_mean = np.mean(acc_scores)
acc_scores_std = np.std(acc_scores)

f1_scores_mean = np.mean(f1_scores)
f1_scores_std = np.std(f1_scores)

print('CV summary for %s repeats on %s splits:'%(num_repeats, skf.n_splits))
print('accuracy score: %s +/- %s'%(acc_scores_mean, acc_scores_std))
print('f1 score:       %s +/- %s'%(f1_scores_mean, f1_scores_std))
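The manual repeat loop can also be written with sklearn's RepeatedStratifiedKFold and cross_val_score, which makes the repetitions reproducible; a sketch over the same arrays:

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

rskf = RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=0)
scores = cross_val_score(RandomForestClassifier(random_state=0),
                         train_X, train_y, scoring='accuracy', cv=rskf)
print('accuracy score: %s +/- %s' % (scores.mean(), scores.std()))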

rf = RandomForestClassifier(n_estimators=10,random_state=0)
rf.fit(train_X, train_y)

submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'], 'Survived': rf.predict(test_X).astype(int) })
print(submission['Survived'].mean())  # sanity check: average predicted survival rate
submission.to_csv(path+'submission.csv', index=False)
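It is also worth checking which features the fitted forest actually relies on, via its feature_importances_ attribute:

# Rank features by the forest's impurity-based importances
feature_names = train_df.drop(['Survived', 'PassengerId'], axis=1).columns
print(pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False))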

Survival decision classification tree (rendered figure omitted)

  • Final score and summary

The whole exercise shows that the more desperate the moment, the more strikingly the gentleman's spirit reveals itself, and money, too, determined your odds of survival. Money wins!

Machine-learning takeaways: tree models are a poor match for high-dimensional sparse one-hot encodings; unordered categorical features can instead be encoded by probability (e.g. survival rate). Also, fitting a decision tree on a single feature helps a lot with feature selection, and the interpretability of decision trees is a big help for feature engineering!
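For the single-feature idea, a minimal sketch (the feature list is illustrative, using the processed train_df): fit a shallow tree on one column at a time and compare cross-validated accuracy.

from sklearn.model_selection import cross_val_score

# Score each candidate feature with its own shallow decision tree
for f in ['Sex', 'Pclass', 'Is_child', 'Single']:
    score = cross_val_score(DecisionTreeClassifier(max_depth=3),
                            train_df[[f]], train_df['Survived'], cv=5).mean()
    print('%s: %.3f' % (f, score))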
