The goal is almost the same as GBDT+LR: let the GBDT perform automatic feature selection and combination, encode the resulting leaf nodes as a new discrete (one-hot) feature vector, and then feed that vector into an FM as input for second-order feature crossing.
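For reference, the FM applied on top of the GBDT leaves is the standard second-order factorization machine, with a sigmoid output for the click probability:

$$\hat{y}(\mathbf{x}) = \sigma\Big(w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle x_i x_j\Big)$$

where $\mathbf{x}$ is the one-hot vector of GBDT leaf indices, $w$ holds the linear weights, and $\mathbf{v}_i \in \mathbb{R}^k$ is the latent vector of the $i$-th leaf feature.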
For an article on GBDT+LR, see:
Here is the GBDT+FM code implemented with TensorFlow 2:
# coding:utf-8
# @Time: 2022/1/18 3:35 下午
# @File: GBDT_FM.py
import time
import numpy as np, pandas as pd
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score
from sklearn.utils import shuffle
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
import pickle
from tools import *
from settings import *
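# tools and settings are the author's own project modules: tools presumably provides the
# denseFeature / sparseFeature helpers used below, and settings the path constants
# (criteo_sampled_data_path, gbdt_save_path, onehot_save_path).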
class FM(Model):
    def __init__(self, k, w_reg=1e-4, v_reg=1e-4):
        super(FM, self).__init__()
        self.k = k          # dimension of the latent vectors
        self.w_reg = w_reg  # L2 regularization for the linear weights
        self.v_reg = v_reg  # L2 regularization for the latent vectors

    def build(self, input_shape):
        self.b = self.add_weight(name='b', shape=(1,),
                                 initializer=tf.zeros_initializer(), trainable=True)
        self.w = self.add_weight(name='w', shape=(input_shape[-1], 1),
                                 initializer=tf.random_normal_initializer(), trainable=True,
                                 regularizer=tf.keras.regularizers.l2(self.w_reg))
        self.v = self.add_weight(name='v', shape=(input_shape[-1], self.k),
                                 initializer=tf.random_normal_initializer(), trainable=True,
                                 regularizer=tf.keras.regularizers.l2(self.v_reg))

    def call(self, inputs):
        # linear term
        linear_part = tf.matmul(inputs, self.w) + self.b  # (batch_size, 1)
        # pairwise interaction term
        inter_cross1 = tf.square(inputs @ self.v)                       # (batch_size, k)
        inter_cross2 = tf.matmul(tf.pow(inputs, 2), tf.pow(self.v, 2))  # (batch_size, k)
        cross_part = 0.5 * tf.reduce_sum(inter_cross1 - inter_cross2, axis=1, keepdims=True)  # (batch_size, 1)
        return tf.nn.sigmoid(linear_part + cross_part)
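# The cross_part in FM.call() uses the standard FM reformulation, which reduces the
# pairwise interaction to O(k*n) per sample:
#   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ]
# inter_cross1 is the squared-sum term and inter_cross2 the sum-of-squares term,
# both of shape (batch_size, k).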
def gbdt_layer(x, y, train_x=None, train_y=None, val_x=None, val_y=None, is_train=False, is_test_flag=False):
    '''
    x, y, train_x, train_y, val_x, val_y: ndarray
    '''
    print('gbdt layer training started ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()
    if is_train:
        gbdt = GradientBoostingClassifier(n_estimators=100,
                                          learning_rate=0.8,
                                          max_depth=4,
                                          min_samples_leaf=3,
                                          min_samples_split=3)
        gbdt.fit(x, y.ravel())
        # fit the one-hot encoder on the leaves of the full sample set so every leaf gets a column
        x_leaves = gbdt.apply(x)[:, :, 0]
        ohecodel = OneHotEncoder()
        x_trans = ohecodel.fit_transform(x_leaves)
        x_train_leaf = gbdt.apply(train_x)[:, :, 0]
        x_train_leaf = ohecodel.transform(x_train_leaf)
        with open(gbdt_save_path, 'wb') as f:
            pickle.dump(gbdt, f)
        with open(onehot_save_path, 'wb') as f:
            pickle.dump(ohecodel, f)
        if is_test_flag:
            val_x_preba = gbdt.predict_proba(val_x)[:, 1]
            print(val_x_preba)
            y_pre = [1 if proba > 0.5 else 0 for proba in val_x_preba]
            f1 = f1_score(val_y, y_pre)
            auc = roc_auc_score(val_y, val_x_preba)
            acc = accuracy_score(val_y, y_pre)
            print(' Validation - GBDT precision: %.5f' % (precision_score(val_y, y_pre)))
            print(' Validation - GBDT recall: %.5f' % (recall_score(val_y, y_pre)))
            print(' Validation - GBDT F1: %.5f' % (f1))
            print(' Validation - GBDT AUC: %.5f' % (auc))
            print(' Validation - GBDT accuracy: %.5f' % (acc))
    else:
        # load the GBDT and one-hot encoder fitted in a previous run
        with open(gbdt_save_path, 'rb') as f:
            gbdt_m = pickle.load(f)
        with open(onehot_save_path, 'rb') as f:
            onehot_m = pickle.load(f)
        x_train_leaf = gbdt_m.apply(train_x)[:, :, 0]
        x_train_leaf = onehot_m.transform(x_train_leaf)
    end_time_end = time.time()
    print('gbdt layer leaf-node extraction runtime: {:.0f} min {:.0f} sec'.format(
        (end_time_end - start_time_start) // 60, (end_time_end - start_time_start) % 60))
    return x_train_leaf
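# For a binary GradientBoostingClassifier, gbdt.apply(X) returns the leaf index of each
# sample in each tree with shape (n_samples, n_estimators, 1), hence the [:, :, 0] slice.
# One-hot encoding the leaves of the 100 trees yields the sparse input the FM is trained
# on (1577 columns in the run shown below).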
def fm_layer(train_X, train_y, val_X, val_y):
    '''
    train_X, train_y, val_X, val_y: ndarray
    '''
    print('fm layer training started ', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
    start_time_start = time.time()
    fm_modle = FM(k=8, w_reg=1e-4, v_reg=1e-4)
    opt = optimizers.SGD()
    fm_modle.compile(optimizer=opt, loss='binary_crossentropy', metrics=[metrics.AUC(), 'accuracy'])
    fm_modle.fit(
        train_X, train_y,
        # validation_data=(val_X, val_y),
        batch_size=512,
        epochs=3,
        verbose=1,
    )
    # fm_pre = fm_modle(test_X)
    # fm_pre = [1 if x > 0.5 else 0 for x in fm_pre]
    ''' extract the latent vectors learned during FM training '''
    v = fm_modle.variables[2]  # (onehot_dim, k)
    print('FM latent vectors extracted')
    print(v)
    end_time_end = time.time()
    print('fm layer runtime: {:.0f} min {:.0f} sec'.format(
        (end_time_end - start_time_start) // 60, (end_time_end - start_time_start) % 60))
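# Note: fm_modle.variables[2] relies on the creation order in FM.build() (b, w, v) and
# matches the 'fm/v:0' variable in the run output below; referencing fm_modle.v directly
# would be a more robust way to fetch the same weights. Each row of v is the k-dimensional
# embedding of one one-hot-encoded GBDT leaf.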
if __name__ == '__main__':
    data = pd.read_csv(criteo_sampled_data_path)
    data = shuffle(data, random_state=42)
    data_X = data.iloc[:, 1:]
    data_y = data['label'].values
    # I1-I13: 13 numeric (dense) feature columns
    # C1-C26: 26 categorical (sparse) feature columns
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_feature_columns = [denseFeature(feat) for feat in dense_features]
    spare_feature_columns = [sparseFeature(feat, data_X[feat].nunique(), 8) for feat in sparse_features]
    tmp_X, test_X, tmp_y, test_y = train_test_split(data_X, data_y, test_size=0.05, random_state=42, stratify=data_y)
    train_X, val_X, train_y, val_y = train_test_split(tmp_X, tmp_y, test_size=0.05, random_state=42, stratify=tmp_y)
    # GBDT Layer: partition the full sample space with the trees and collect the leaf nodes
    x_trans_leaf = gbdt_layer(data_X.values, data_y, train_X.values, train_y, val_X.values, val_y, False, True)
    x_trans_leaf = x_trans_leaf.toarray()
    print(x_trans_leaf)
    # FM Layer
    fm_layer(x_trans_leaf, train_y, None, None)
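    # Note: the call above passes is_train=False, is_test_flag=True, so gbdt_layer loads the
    # GBDT and OneHotEncoder pickled by an earlier run; run it once with is_train=True first
    # to fit and persist both models before using this setting.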
[[0. 0. 1. ... 0. 1. 0.]
[0. 0. 0. ... 0. 0. 1.]
[1. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 1. ... 0. 1. 0.]
[0. 1. 0. ... 0. 0. 0.]]
fm layer training started 2022-01-18 15:36:57
2022-01-18 15:36:57.363645: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-01-18 15:36:57.380076: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f872105b3a0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-18 15:36:57.380088: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
Epoch 1/3
1058/1058 [==============================] - 5s 4ms/step - loss: 0.5287 - auc: 0.6911 - accuracy: 0.7519
Epoch 2/3
1058/1058 [==============================] - 4s 3ms/step - loss: 0.5029 - auc: 0.7361 - accuracy: 0.7648
Epoch 3/3
1058/1058 [==============================] - 4s 3ms/step - loss: 0.4953 - auc: 0.7474 - accuracy: 0.7689
FM latent vectors extracted
<tf.Variable 'fm/v:0' shape=(1577, 8) dtype=float32, numpy=
array([[-0.08735464, 0.03354083, -0.09567007, ..., 0.08642927,
0.03169894, 0.09905877],
[ 0.01405056, 0.06551613, 0.00270309, ..., -0.04188278,
0.05228156, -0.01343592],
[-0.0128391 , 0.06932496, 0.00022683, ..., 0.06743341,
-0.04202383, -0.04612865],
...,
[-0.01535424, -0.06073865, -0.06247789, ..., 0.00913803,
0.00183929, -0.13300501],
[ 0.03474345, 0.0124812 , -0.08187839, ..., -0.01550922,
0.1187585 , 0.00665127],
[ 0.00812627, -0.04846352, -0.01730932, ..., 0.02291123,
0.00797282, -0.0803114 ]], dtype=float32)>
fm layer runtime: 0 min 18 sec
Process finished with exit code 0
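As a follow-up, here is a minimal inference sketch showing how the two stages could be chained at prediction time. It is not part of the original script: it assumes the trained FM weights were persisted (for example with fm_modle.save_weights) and reloaded into a built FM instance, and it reuses the GBDT / OneHotEncoder pickles written by gbdt_layer under gbdt_save_path and onehot_save_path.

import pickle

def gbdt_fm_predict(raw_x, fm_model, gbdt_path, onehot_path):
    '''raw_x: ndarray (n_samples, n_raw_features); fm_model: a built FM instance.'''
    # Load the GBDT and the OneHotEncoder fitted during training.
    with open(gbdt_path, 'rb') as f:
        gbdt_m = pickle.load(f)
    with open(onehot_path, 'rb') as f:
        onehot_m = pickle.load(f)
    # Map raw features to leaf indices, then into the one-hot leaf space the FM was trained on.
    leaves = gbdt_m.apply(raw_x)[:, :, 0]
    onehot_leaves = onehot_m.transform(leaves).toarray()
    # FM.call() already applies a sigmoid, so this is the predicted probability.
    return fm_model(onehot_leaves).numpy().ravel()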