Titanic: Machine Learning from Disaster is an entry-level machine learning competition hosted by Kaggle. The dataset is small, which makes it ideal for validating ideas quickly. Let's take a machine learning voyage on the Titanic. Let's GO!
- Data preprocessing and visualization functions
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

class data_preprocessing():
    def __init__(self):
        pass

    # Min-max normalization into [0, 1]
    def data_norm(self, df, features):
        for f in features:
            max_value = np.max(df[f])
            min_value = np.min(df[f])
            df[f] = (df[f] - min_value) / (max_value - min_value)
        return df

    # Transform features so they look closer to a standard normal
    # distribution (the log transform assumes strictly positive values)
    def data_convert(self, df, features, style='log'):
        if style == 'log':
            for f in features:
                df[f] = np.log(df[f])
        return df

    # Drop rows more than alpha standard deviations from the mean
    def remove_abnormal_value(self, df, features, alpha):
        for f in features:
            value_mean = np.mean(df[f])
            value_std = np.std(df[f])
            df = df[df[f] <= value_mean + alpha * value_std]
            df = df[df[f] >= value_mean - alpha * value_std]
        return df

    # Fill missing values with the mean, the mode, or the string '0'
    def data_fill(self, df, features, style='mean'):
        if style == 'mean':
            for f in features:
                df[f] = df[f].fillna(df[f][df[f].notnull()].mean())
        if style == 'mode':
            for f in features:
                df[f] = df[f].fillna(df[f][df[f].notnull()].mode()[0])
        if style == '0':
            for f in features:
                df[f] = df[f].fillna('0')
        return df

    # One-hot encode the given features and drop the originals
    def data_one_hot(self, df, features):
        for f in features:
            dummies = pd.get_dummies(df[f], prefix=f)
            df = df.join(dummies)
            df = df.drop([f], axis=1)
        return df

    # Map each distinct value to an integer (mapping order is arbitrary)
    def data_replace(self, df, features):
        for f in features:
            value = list(set(df[f]))
            value_range = list(range(len(value)))
            df[f] = df[f].replace(value, value_range)
        return df

    # Map each distinct value to an integer in random order
    def data_replace_random(self, df, features):
        for f in features:
            value = list(set(df[f]))
            np.random.shuffle(value)
            value_range = list(range(len(value)))
            df[f] = df[f].replace(value, value_range)
        return df

    # Map each distinct value to an integer, ordered by the mean label
    # value observed in the training set (values unseen in training sort last)
    def data_replace_sort_by_mean(self, train_df, df, features, label):
        for f in features:
            value = list(set(df[f]))
            value_of_train = list(set(train_df[f]))
            value_sort = []
            for i in range(len(value)):
                if value[i] in value_of_train:
                    value_sort.append(np.mean(train_df[train_df[f] == value[i]][label]))
                else:
                    value_sort.append(0)
            value = [value[i] for i in np.argsort(-np.array(value_sort))]
            value_range = list(range(len(value)))
            df[f] = df[f].replace(value, value_range)
        return df
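A quick usage sketch of these helpers on a hand-made toy frame (the toy data is hypothetical, not the Titanic files):
# Illustrative usage (assumes the class above has been imported)
toy = pd.DataFrame({'Fare': [7.25, 71.28, None, 8.05],
                    'Embarked': ['S', 'C', 'S', None]})
dp = data_preprocessing()
toy = dp.data_fill(toy, ['Fare'], style='mean')      # mean-fill the numeric column
toy = dp.data_fill(toy, ['Embarked'], style='mode')  # mode-fill the categorical column
toy = dp.data_norm(toy, ['Fare'])                    # scale Fare into [0, 1]
print(toy)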
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt

class data_visualization:
    def __init__(self):
        pass

    # Bar plot of the mean of features[1] grouped by features[0]
    def plt_mean(self, df, features):
        average = df[features].groupby([features[0]], as_index=False).mean()
        sns.barplot(x=features[0], y=features[1], data=average)
        plt.show()

    # Count plot for each feature
    def plt_count(self, df, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.countplot(x=features[i], data=df)
        plt.show()

    # Density plot (with rug) for each feature
    def plt_density(self, df, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df[features[i]], kde=True, rug=True)
        plt.show()

    # Overlay the densities of the same features from two frames
    def plt_density_compare(self, df1, df2, features):
        for i in range(len(features)):
            plt.figure(i)
            sns.distplot(df1[features[i]], kde=True, rug=True)
            sns.distplot(df2[features[i]], kde=True, rug=True)
        plt.show()

    # Count plot of x, split by hue, one panel per value of col
    def plt_factor_count(self, df, x, hue, col):
        sns.factorplot(x=x, hue=hue, col=col, data=df, kind='count')
        plt.show()

    # Point plot of y against x, split by hue
    def plt_factor_ratio(self, df, x, y, hue):
        sns.factorplot(x=x, y=y, hue=hue, data=df)
        plt.show()

    # Print the cross-tabulation of two features
    def plt_crosstab(self, df, features):
        print(pd.crosstab(df[features[0]], df[features[1]]))
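A hedged usage sketch (assuming train_df as loaded in the next section):
dv = data_visualization()
dv.plt_mean(train_df, ['Sex', 'Survived'])      # mean survival rate by sex
dv.plt_count(train_df, ['Pclass', 'Embarked'])  # category counts
dv.plt_crosstab(train_df, ['Pclass', 'Sex'])    # contingency table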
- Import libraries and data (remember to change the paths to ones on your own machine)
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from math import log
import random
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
sns.set_style('whitegrid')
import sys
sys.path.append('/Users/zjx/python/competition_function')
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from data_preprocessing import data_preprocessing
from data_visualization import data_visualization
path = '/Users/zjx/python/taitan/data/'
train_df = pd.read_csv(path+'new_train.csv')
test_df = pd.read_csv(path+'new_test.csv')
dp = data_preprocessing()
dv = data_visualization()
label = 'Survived'
- Inspect the data
train_df.info()
test_df.info()
train_df :
test_df :
The training set has 891 rows and the test set 418. Age and Cabin are missing many values, while Fare and Embarked are missing only a few. To avoid introducing too much noise we leave Age and Cabin unfilled; Fare is filled with a decision tree regressor and Embarked with the mode.
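A quick way to see the per-column missing counts directly (an illustrative check, not in the original):
print(train_df.isnull().sum())
print(test_df.isnull().sum())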
- Filling Fare and Embarked
# Assumption (not shown in the original): feature engineering is done on the
# train and test sets concatenated, with Survived left as NaN for test rows.
df = pd.concat([train_df, test_df], ignore_index=True)
X_fare = df[['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']].copy()
y_fare = df['Fare']
X_fare = dp.data_replace_sort_by_mean(train_df, X_fare, ['Sex', 'Embarked'], label)
fare_defined = df.Fare.notnull()
dtr = DecisionTreeRegressor(max_leaf_nodes=5)
dtr.fit(X_fare[fare_defined], y_fare[fare_defined])
df.loc[~fare_defined, 'Fare'] = dtr.predict(X_fare[~fare_defined])
df = dp.data_fill(df, ['Embarked'], style='mode')
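A quick sanity check (illustrative) that both columns are now fully populated:
print(df[['Fare', 'Embarked']].isnull().sum())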
Plot the regression tree used to fill Fare:
dot_data = tree.export_graphviz(dtr, out_file=None, feature_names=X_fare.columns,
                                filled=True, rounded=True, special_characters=True,
                                leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("dtr_of_fare")
- Feature exploration
Differences by sex
dv.plt_mean(train_df, ['Sex', label])
Differences by cabin class
dv.plt_mean(train_df,['Pclass',label])
Crossing Pclass with Sex
dv.plt_factor_count(train_df,'Pclass',label,'Sex')
Almost all the women in classes 1 and 2 were rescued, and almost none of the men in class 3 were. This is surely one of the most important feature combinations, and it also shows where prediction is hard: the men in class 1 and the women in class 3.
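To put numbers on that picture, an illustrative pivot (not in the original; pivot_table averages by default):
print(pd.pivot_table(train_df, values=label, index='Pclass', columns='Sex'))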
Since women survived at a higher rate, children should be special too (because class-3 survival is low across the board, we only consider classes 1 and 2):
def get_is_child(age):
    # NaN ages fail the comparison and fall through to 0
    if age < 14:
        return 1
    else:
        return 0
df['Is_child'] = df['Age'].apply(get_is_child)
df.loc[df['Pclass'] > 2, 'Is_child'] = 0  # class-3 children get no flag
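An illustrative check that the flag separates survival rates (train rows only, i.e., where Survived is known):
print(df[df[label].notnull()].groupby('Is_child')[label].mean())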
Consider the strengths and weaknesses of traveling as a group (unity is strength):
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
def get_freq(df, feature):
    # Count how often each value of `feature` occurs across train+test and
    # join that count back onto every row as a new Freq_<feature> column
    freq_col = 'Freq_' + feature
    freq = df[feature].value_counts().to_frame()
    freq.columns = [freq_col]
    df[freq_col] = df.merge(freq, how='left', left_on=feature, right_index=True)[freq_col]
    return df
df = get_freq(df, 'Cabin')
df = get_freq(df, 'Ticket')
df = get_freq(df, 'Fare')
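The get_single helper below compares Freq_Fare against an upper bound max_group that the original never defines; as a loudly flagged assumption, we cap it at the size of the largest ticket group, so that a merely common fare is not read as evidence of a travel group:
# Assumption (max_group is undefined in the original): fare frequencies at or
# above the largest ticket group are treated as coincidence, not as a group.
max_group = df['Freq_Ticket'].max()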
def get_single(df):
    # A passenger counts as traveling alone only if no signal
    # (family size, shared ticket, cabin or fare) suggests a group
    if df['FamilySize'] > 1:
        return 0
    elif df['Freq_Ticket'] > 1:
        return 0
    elif df['Freq_Cabin'] > 1:
        return 0
    elif 1 < df['Freq_Fare'] < max_group:
        return 0
    else:
        return 1
df['Single'] = df.apply(get_single, axis=1)
df['Single'] = df['Single'].astype('int')
Give very large groups special treatment (in a big group, members may be too overwhelmed to look after one another):
def get_freq_ticket(df):
    # Hard-coded sizes of the very large ticket groups observed in the data
    if df['Freq_Ticket'] == 7 or df['Freq_Ticket'] == 11:
        return 1
    elif df['Freq_Ticket'] == 8:
        return 2
    else:
        return 0
df['Big_Ticket'] = df.apply(get_freq_ticket, axis=1)
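How one might spot those group sizes in the first place (illustrative):
print(df['Freq_Ticket'].value_counts().sort_index())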
Reward passengers in the test set whose group contains a known survivor (more special treatment):
def get_reward(survived):
    # Within one ticket group: group_nan > 0 means the group has test rows,
    # group_size > 1 means it really is a group, and group_sum > 0 means
    # at least one member is known to have survived
    group_nan = survived.isnull().sum()
    group_size = survived.shape[0]
    group_sum = np.sum(survived)
    if (group_nan > 0 and group_size > 1 and group_sum > 0):
        return 1
    else:
        return 0
rewards = df[['Ticket', 'Survived']].groupby('Ticket')['Survived'].apply(get_reward).to_frame()
rewards.columns = ['Reward']
df = df.merge(rewards, left_on='Ticket', right_index=True, how='left')
Since we have decided on tree models, high-dimensional sparse one-hot encoding would introduce a lot of noise (strongly correlated columns), so Sex is simply mapped to an integer ordered by mean survival in the training set, and the features we no longer need are dropped:
df = dp.data_replace_sort_by_mean(train_df,df,['Sex'],label)
df = df.drop(['Name','Ticket','Freq_Ticket','Freq_Cabin','Freq_Fare','Cabin','Age','FamilySize','Parch','SibSp','Fare','Embarked'],axis=1)
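The original does not show the step that splits the combined frame back into train and test sets before modeling; a minimal sketch, assuming Survived is NaN exactly for the test rows:
# Assumed step (not shown in the original)
train_df = df[df['Survived'].notnull()].copy()
test_df = df[df['Survived'].isnull()].copy()
train_df['Survived'] = train_df['Survived'].astype(int)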
- K-fold cross-validation and random forest prediction
dtc = DecisionTreeClassifier(min_samples_leaf=10)
dtc.fit(train_df.drop(['Survived', 'PassengerId'], axis=1), train_df['Survived'])
dot_data = tree.export_graphviz(dtc, out_file=None,
                                feature_names=train_df.drop(['Survived', 'PassengerId'], axis=1).columns,
                                class_names=['Not survived', 'Survived'],
                                filled=True, rounded=True,
                                special_characters=True, leaves_parallel=True)
graph = graphviz.Source(dot_data)
graph.render("my_dtc_of_survived")
train_X = train_df.drop(['Survived','PassengerId'],axis=1).values
train_y = train_df['Survived'].values
test_X = test_df.drop(['Survived','PassengerId'],axis=1).values
num_folds = 7
num_repeats = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
rf = RandomForestClassifier(random_state=0)
acc_scores = []
f1_scores = []
for i in range(num_repeats):
for train_idx, test_idx in skf.split(train_X, train_y):
train_X_cv = train_X[train_idx]
test_X_cv = train_X[test_idx]
train_y_cv = train_y[train_idx]
test_y_cv = train_y[test_idx]
rf.fit(train_X_cv, train_y_cv)
y_pred_cv = rf.predict(test_X_cv)
acc_scores.append(accuracy_score(test_y_cv, y_pred_cv))
f1_scores.append(f1_score(test_y_cv, y_pred_cv))
acc_scores_mean = np.mean(acc_scores)
acc_scores_std = np.std(acc_scores)
f1_scores_mean = np.mean(f1_scores)
f1_scores_std = np.std(f1_scores)
print('CV summary for %s repeats on %s splits:'%(num_repeats, skf.n_splits))
print('accuracy score: %s +/- %s'%(acc_scores_mean, acc_scores_std))
print('f1 score: %s +/- %s'%(f1_scores_mean, f1_scores_std))
rf = RandomForestClassifier(n_estimators=10,random_state=0)
rf.fit(train_X, train_y)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'], 'Survived': rf.predict(test_X).astype(int) })
print(np.mean(submission['Survived']))  # fraction of test passengers predicted to survive
submission.to_csv(path+'submission.csv', index=False)
The survival classification tree:
- Final score and summary
Looking back over the whole process, the gentleman's spirit shows itself astonishingly in moments of disaster, and money, too, determined one's chance of survival. Money is winner!
Machine learning takeaways: tree models are a poor fit for high-dimensional sparse one-hot encodings; unordered categorical features can instead be encoded by their target probability. Also, fitting single-feature decision trees helps a great deal when selecting features, and the interpretability of decision trees is a great aid to feature engineering!
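A minimal sketch of the probability (target) encoding mentioned above, computed on the training set only to avoid leakage (the helper and column names are illustrative, not from the original code):
# Hypothetical helper: encode an unordered feature by its mean target value
def target_encode(train_df, df, feature, label):
    rate = train_df.groupby(feature)[label].mean()    # survival rate per category
    return df[feature].map(rate).fillna(rate.mean())  # unseen values get the average rate
# e.g. df['Embarked_rate'] = target_encode(train_df, df, 'Embarked', 'Survived')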