1. Introduction
The dataset contains roughly 200,000 users divided into 12 demographic groups (the prediction target), along with user behavior attributes such as phone brand, device model, and the categories of installed apps.
Evaluation metric: logloss (multi-class logarithmic loss); a toy example follows.
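The sketch below is only an illustration of the metric, using made-up group labels and probabilities; log_loss is the same sklearn function imported in main.py further down.
# Multi-class logloss on predicted probabilities -- lower is better.
import numpy as np
from sklearn.metrics import log_loss

y_true = ['F23-', 'M32-38', 'M22-']          # true group label of each user (example labels)
labels = ['F23-', 'M22-', 'M32-38']          # column order of the probability matrix
y_prob = np.array([[0.7, 0.2, 0.1],          # predicted probability for each group
                   [0.1, 0.3, 0.6],
                   [0.2, 0.5, 0.3]])
print(log_loss(y_true, y_prob, labels=labels))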
Steps:
1. Explore and understand the data;
2. Feature engineering;
3. Model training and parameter tuning.
Dataset description:
Each user is identified by a device ID, and a user's behavior is recorded as a series of events; each event carries the time and geographic coordinates at which it occurred, along with the categories of the installed apps, the phone model, and so on. A sketch of how the underlying tables join together follows.
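A minimal sketch of chaining the event-related CSVs with pandas merges. The column names (event_id, device_id, app_id, label_id) are assumed from the usual layout of these files and do not appear in this excerpt.
import os
import pandas as pd

dataset_path = './dataset'
# Assumed columns: events(event_id, device_id, timestamp, longitude, latitude),
# app_events(event_id, app_id, ...), app_labels(app_id, label_id)
events = pd.read_csv(os.path.join(dataset_path, 'events.csv'))
app_events = pd.read_csv(os.path.join(dataset_path, 'app_events.csv'))
app_labels = pd.read_csv(os.path.join(dataset_path, 'app_labels.csv'))

# Chain the joins: device -> events -> installed apps -> app label ids
device_app_labels = (events.merge(app_events, on='event_id')
                           .merge(app_labels, on='app_id'))
print(device_app_labels[['device_id', 'event_id', 'app_id', 'label_id']].head())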
Key techniques involved:
1. Multi-table joins and data processing with pandas;
2. One-hot encoding (a short sketch follows this list);
3. Feature selection;
4. Parameter selection via cross-validation (also sketched below).
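The one-hot encoding step uses the LabelEncoder + OneHotEncoder pair imported in main.py below; here is a minimal, self-contained sketch with made-up brand names.
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

brands = np.array(['Huawei', 'Xiaomi', 'Huawei', 'Samsung'])       # hypothetical phone brands
label_enc = LabelEncoder()
brand_ids = label_enc.fit_transform(brands)                        # strings -> integer codes
onehot_enc = OneHotEncoder()
brand_onehot = onehot_enc.fit_transform(brand_ids.reshape(-1, 1))  # sparse 0/1 indicator matrix
print(brand_onehot.toarray())                                      # one column per brand
For cross-validated parameter selection, main.py imports get_best_model from ml_tools, which is not included in this excerpt; a plausible sketch based on GridSearchCV, with an assumed signature, would look like this.
from sklearn.model_selection import GridSearchCV

def get_best_model(model, X_train, y_train, params, cv=5):
    """Hypothetical helper: grid-search params with cv-fold cross-validation,
    score by negative logloss, and return the refit best estimator."""
    grid = GridSearchCV(model, params, cv=cv, scoring='neg_log_loss')
    grid.fit(X_train, y_train)
    print('Best parameters:', grid.best_params_)
    return grid.best_estimator_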
2. Code and comments
main.py
# -*- coding: utf-8 -*-
import pandas as pd
import os
from pd_tools import split_train_test, get_part_data
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.decomposition import PCA
from ml_tools import get_best_model
from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold
# Dataset path and file name declarations
dataset_path = './dataset'
gender_age_filename = 'gender_age.csv'
phone_brand_device_model_filename = 'phone_brand_device_model.csv'
events_filename = 'events.csv'
app_events_filename = 'app_events.csv'
app_labels_filename = 'app_labels.csv'
label_categories_filename = 'label_categories.csv'
train_gender_age_filename = 'gender_age_train.csv'
test_gender_age_filename = 'gender_age_test.csv'
is_first_run = False  # set to True on the first run to generate the train/test split files
def run_main():
    """
        Main function
    """
    if is_first_run:
        # 1. Split the dataset
        print('Splitting the dataset')
        all_gender_age = pd.read_csv(os.path.join(dataset_path, gender_age_filename))
        df_train, df_test = split_train_test(all_gender_age)

        # Inspect basic information about the training and test sets
        print('Samples per class in the training set:', df_train.groupby('group').size())
        print('Samples per class in the test set:', df_test.groupby('group').size())

        # Save the split datasets
        df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename),
                        index=False)
        df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename),
                       index=False)

    # 2. Load the data
    print('Loading data')
    gender_age_train = pd.read_csv(os.path.join(dataset_path, train_gender_age_filename),
                                   index_col='device_id')
    gender_age_test = pd.read_csv(os.path.join(dataset_path, test_gender_age_filename),
                                  index_col='device_id')

    # Use only a fraction of the data for faster experiments
    percent = 0.1
    gender_age_train = get_part_data(gender_age_train, percent=percent)
    gender_age_test = get_part_data(gender_age_test, percent=percent)

    phone_brand_device_model = pd.read_csv(os.path.join(dataset_path, phone_brand_device_model_filename))
    # Drop duplicate device records and index by device_id
    phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')