Logistic regression is a classification algorithm: it outputs a predicted class and, at the same time, the probability that the sample belongs to that class.
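As a minimal, Spark-independent sketch of where that probability comes from (the names sigmoid and predictProb are illustrative, not part of the project code): logistic regression passes a weighted sum of the features through the logistic (sigmoid) function.

// Sketch: the logistic (sigmoid) function squashes a linear score into (0, 1),
// which is read as p(y = 1 | x)
def sigmoid(z: Double): Double = 1.0 / (1.0 + math.exp(-z))

// Probability that sample x belongs to the positive class, given weights w and bias b
def predictProb(w: Array[Double], b: Double, x: Array[Double]): Double =
  sigmoid(w.zip(x).map { case (wi, xi) => wi * xi }.sum + b)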
Feature selection typically proceeds in two steps:
1 Based on business understanding and personal experience, shortlist a large set of candidate feature factors;
2 Then use statistics to measure, on the sample data, how strongly each factor correlates with the outcome, and filter accordingly (see the Pearson formula below).
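The correlation measure used in the code below is the Pearson coefficient. For two variables X and Y it is

    r(X, Y) = cov(X, Y) / (σ_X · σ_Y)

and always falls in [-1, 1]; the closer |r| is to 1, the stronger the linear relationship between the factor and the outcome.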
Feature correlation for churn-probability risk prediction
package cn.doitedu.ml.loss

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable

/**
 * Churn-probability prediction model: compute feature correlations for feature selection
 */
object LossProbModelFeatureCorr {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache").setLevel(Level.WARN)

    val spark = SparkSession
      .builder()
      .appName("churn-risk feature correlation")
      .master("local")
      .getOrCreate()

    import org.apache.spark.sql.functions._
    import spark.implicits._

    // UDF that packs an array column into an ML Vector, as required by Correlation.corr
    val arr2Vec: UserDefinedFunction = udf((arr: mutable.WrappedArray[Double]) => {
      // Vector is an interface with two implementations: DenseVector and SparseVector
      val vector: linalg.Vector = Vectors.dense(arr.toArray)
      vector
    })

    val sample = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv("userprofile/data/loss_probability/sample")

    // Columns: label,gid,cs_3,cs_15,xf_3,xf_15,th_3,th_15,hp_3,hp_15,cp_3,cp_15,last_dl,last_xf
    val vec = sample.select(arr2Vec(array('label, 'cs_3, 'cs_15, 'xf_3, 'xf_15, 'th_3, 'th_15, 'hp_3, 'hp_15, 'cp_3, 'cp_15, 'last_dl, 'last_xf)) as "features")

    // Rule of thumb for |r|: < 0.1 negligible, 0.1-0.3 weak, 0.3-0.5 moderate, 0.5-1.0 strong
    val corr: DataFrame = Correlation.corr(vec, "features", "pearson")
    corr.show(1, false)

    spark.close()
  }
}
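Correlation.corr returns a single-row DataFrame whose only cell is the full correlation matrix. Because label was placed first in the assembled vector, row 0 holds each feature's correlation with the label, which is exactly what the filtering step needs. A minimal sketch of reading those values out, as a continuation of the code above (the featureNames list simply restates the column order used when assembling the vector):

// Sketch: pull the correlation of each feature with the label out of the matrix.
// Row/column 0 is "label" because it was placed first in the assembled vector.
import org.apache.spark.ml.linalg.Matrix

val matrix = corr.head.getAs[Matrix](0)
val featureNames = Seq("label", "cs_3", "cs_15", "xf_3", "xf_15", "th_3", "th_15",
  "hp_3", "hp_15", "cp_3", "cp_15", "last_dl", "last_xf")
featureNames.zipWithIndex.drop(1).foreach { case (name, i) =>
  println(f"$name%-8s r = ${matrix(0, i)}%+.4f")
}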
User churn risk level prediction:
package cn.doitedu.ml.loss

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable

object LossProbModelTrain {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache").setLevel(Level.WARN)

    val spark = SparkSession
      .builder()
      .appName("churn-probability model training")
      .master("local")
      .getOrCreate()

    import org.apache.spark.sql.functions._
    import spark.implicits._

    // UDF that packs an array column into an ML Vector
    val arr2Vec: UserDefinedFunction = udf((arr: mutable.WrappedArray[Double]) => {
      val vector: linalg.Vector = Vectors.dense(arr.toArray)
      vector
    })

    val sample = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv("userprofile/data/loss_probability/sample")

    // Columns: label,gid,cs_3,cs_15,xf_3,xf_15,th_3,th_15,hp_3,hp_15,cp_3,cp_15,last_dl,last_xf
    val vecDF = sample.select('label, arr2Vec(array('cs_3, 'cs_15, 'xf_3, 'xf_15, 'th_3, 'th_15, 'hp_3, 'hp_15, 'cp_3, 'cp_15, 'last_dl, 'last_xf)) as "features")

    val logisticRegression = new LogisticRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      // Regularization adds a penalty term regParam * θ² to the loss function,
      // so no single feature's weight grows too large and the influence of the
      // features stays relatively even (pure L2 here, since elasticNetParam
      // defaults to 0)
      .setRegParam(1.0)

    // Split the sample data into (training set, test set); the 8:2 weights are normalized
    val arr: Array[DataFrame] = vecDF.randomSplit(Array(8, 2))
    println(arr.length)
    val trainSets = arr(0)
    val testSets = arr(1)

    // Train the model on the training set
    val model = logisticRegression.fit(trainSets)
    // overwrite() lets the job be rerun without failing on an existing output path
    model.write.overwrite().save("userprofile/data/loss_probability/model")

    // Use the trained model to predict on the test set
    val testPredict = model.transform(testSets)
    testPredict.show(100, false)

    spark.close()
  }
}
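The training code prints predictions but no quality metric. As a follow-up sketch continuing from the code above: score the test-set predictions with AUC, then reload the persisted model the way a serving job would. BinaryClassificationEvaluator and LogisticRegressionModel.load are standard Spark ML APIs; the column names assume the default output columns of LogisticRegression.

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Area under the ROC curve on the test-set predictions (closer to 1.0 is better)
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
println(s"test AUC = ${evaluator.evaluate(testPredict)}")

// Reload the persisted model to score users; the "probability" column holds
// [p(label=0), p(label=1)], i.e. element 1 is the churn probability
val reloaded = LogisticRegressionModel.load("userprofile/data/loss_probability/model")
reloaded.transform(testSets).select("label", "probability", "prediction").show(10, false)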