一、Spark自定义排序: 比较女孩
package cn.itcast.spark.day3
import org.apache.spark.{SparkConf, SparkContext}
object OrderContext {
  /**
   * Implicit ordering used when sorting by a `Girl` key.
   *
   * A higher `faceValue` ranks greater. When `faceValue` ties, the smaller `age` ranks
   * greater. Two girls with identical `faceValue` and `age` compare as equal (0), which
   * the `Ordering` contract requires for a consistent sort.
   */
  implicit val girlOrdering: Ordering[Girl] = new Ordering[Girl] {
    override def compare(x: Girl, y: Girl): Int =
      if (x.faceValue != y.faceValue) Integer.compare(x.faceValue, y.faceValue)
      else Integer.compare(y.age, x.age) // arguments reversed on purpose: younger sorts as "greater"
  }
}
/**
* Created by root on 2016/5/18.
*/
//排序规则:先按faceValue比较,faceValue相同时再比较age
//元组字段依次为:name, faceValue, age, 序号
/**
 * Demo of sorting an RDD with a custom key type.
 *
 * Each tuple is (name, faceValue, age, sequence number). The sort key is a `Girl` built
 * from faceValue and age; its ordering comes from the implicit in `OrderContext`.
 */
object CustomSort {
  /**
   * Builds a small in-memory RDD, sorts it in descending order of the `Girl` key and
   * prints the collected result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      val rdd1 = sc.parallelize(List(("yuihatano", 90, 28, 1), ("angelababy", 90, 27, 2), ("JuJingYi", 95, 22, 3)))
      // Brings the implicit Ordering[Girl] into scope for sortBy.
      import OrderContext._
      val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), ascending = false)
      println(rdd2.collect().toBuffer)
    } finally {
      // Stop the context even when the job fails, so the local executor is released.
      sc.stop()
    }
  }
}
/**
* 第一种方式:让Girl继承Ordered[Girl],在类内部定义比较规则(此段代码保持注释状态,避免与下面第二种方式的Girl重名)
* @param faceValue
* @param age
case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable {
override def compare(that: Girl): Int = {
if(this.faceValue == that.faceValue) {
that.age - this.age
} else {
this.faceValue -that.faceValue
}
}
}
*/
/**
 * Second approach: a plain data holder with no comparison logic of its own.
 * The sort order is supplied from outside through an implicit `Ordering[Girl]`
 * (see `OrderContext.girlOrdering`).
 *
 * @param faceValue primary sort field
 * @param age       secondary sort field, consulted when `faceValue` ties
 */
case class Girl(
  faceValue: Int,
  age: Int
) extends Serializable
二、ip归属地查找: 二分法查找
数据既可以从Spark写入到MySQL,也可以从MySQL读取到Spark
1、ip归属地查找:单机版
package cn.itcast.spark.day3
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import scala.collection.mutable.ArrayBuffer
/**
 * Standalone (non-Spark) IP location lookup: converts an IPv4 address to a number and
 * binary-searches a rule file for the range that contains it.
 *
 * Rule lines are pipe-separated; field index 2 is the numeric start of a range and
 * field index 3 is its numeric end.
 */
object IPLocationDemo {

  /**
   * Packs a dotted IPv4 string into one Long, 8 bits per fragment, first fragment most
   * significant (e.g. "0.0.1.0" becomes 256).
   *
   * No validation is done: a non-numeric fragment throws `NumberFormatException`.
   *
   * @param ip dotted-decimal address such as "120.55.185.61"
   * @return the numeric form of the address
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, fragment) => fragment.toLong | (acc << 8))

  /**
   * Reads every line of a text file into memory.
   *
   * @param path path of the file to read
   * @return all lines in file order
   */
  def readData(path: String): ArrayBuffer[String] = {
    val br = new BufferedReader(new InputStreamReader(new FileInputStream(path)))
    try {
      val lines = new ArrayBuffer[String]()
      var s = br.readLine()
      while (s != null) {
        lines += s
        s = br.readLine()
      }
      lines
    } finally {
      // The original never closed the reader, leaking the file handle.
      br.close()
    }
  }

  /**
   * Binary search for the rule whose [start, end] range contains `ip`.
   *
   * Assumes the rules are sorted ascending by range start and do not overlap; this is
   * not checked here.
   *
   * @param lines rule lines, pipe-separated, with start at field 2 and end at field 3
   * @param ip    numeric IP to look up
   * @return index of the matching line, or -1 when no range contains `ip`
   */
  def binarySearch(lines: ArrayBuffer[String], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      // Overflow-safe midpoint.
      val middle = low + (high - low) / 2
      // Split once per probe instead of up to three times.
      val fields = lines(middle).split("\\|")
      if (ip < fields(2).toLong)
        high = middle - 1
      else if (ip <= fields(3).toLong)
        return middle
      else
        low = middle + 1
    }
    -1
  }

  /**
   * Looks up a fixed sample IP in the rule file at c:/ip.txt and prints the matching rule.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val ip = "120.55.185.61"
    val ipNum = ip2Long(ip)
    println(ipNum)
    val lines = readData("c:/ip.txt")
    val index = binarySearch(lines, ipNum)
    // binarySearch returns -1 on a miss; indexing with it would throw.
    if (index >= 0) print(lines(index))
    else println(s"No location rule matches $ip")
  }
}
2、把规则保存到redis里; 或者将规则缓存到worker上,通过广播变量broadcast广播出去(推荐)
ip归属地查询,并根据省份统计,写入到mysql数据库
package cn.itcast.spark.day3
import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by root on 2016/5/18.
*/
/**
 * Spark job: maps each IP in an access log to a province using broadcast IP-range
 * rules, counts hits per province and writes the counts to MySQL.
 */
object IPLocation {

  /**
   * Writes one partition of (location, count) pairs into the `location_info` table.
   *
   * One connection and one prepared statement serve the whole partition. Failures are
   * reported on stdout and not rethrown, so a database error does not fail the Spark
   * task (same best-effort behaviour as before, but the cause is now printed).
   *
   * NOTE(review): the JDBC URL and credentials are hard-coded; consider moving them to
   * configuration.
   */
  val data2MySQL: Iterator[(String, Int)] => Unit = (iterator: Iterator[(String, Int)]) => {
    var conn: Connection = null
    var ps: PreparedStatement = null
    val sql = "INSERT INTO location_info (location, counts, accesse_date) VALUES (?, ?, ?)"
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
      // Prepare once; the original prepared a new statement per row and closed only the last.
      ps = conn.prepareStatement(sql)
      iterator.foreach { case (location, count) =>
        ps.setString(1, location)
        ps.setInt(2, count)
        ps.setDate(3, new Date(System.currentTimeMillis()))
        ps.executeUpdate()
      }
    } catch {
      case e: Exception => println(s"Mysql Exception: $e")
    } finally {
      if (ps != null)
        ps.close()
      if (conn != null)
        conn.close()
    }
  }

  /**
   * Packs a dotted IPv4 string into one Long, 8 bits per fragment, first fragment most
   * significant. A non-numeric fragment throws `NumberFormatException`.
   *
   * @param ip dotted-decimal address
   * @return the numeric form of the address
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, fragment) => fragment.toLong | (acc << 8))

  /**
   * Binary search for the rule whose [start, end] range contains `ip`.
   *
   * Assumes the rules are sorted ascending by range start and do not overlap; this is
   * not checked here.
   *
   * @param lines rules as (start, end, province), with start/end as decimal strings
   * @param ip    numeric IP to look up
   * @return index of the matching rule, or -1 when no range contains `ip`
   */
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      // Overflow-safe midpoint.
      val middle = low + (high - low) / 2
      val (start, end, _) = lines(middle)
      if (ip < start.toLong)
        high = middle - 1
      else if (ip <= end.toLong)
        return middle
      else
        low = middle + 1
    }
    -1
  }

  /**
   * Loads the rules from c://ip.txt, broadcasts them, resolves the IP of every line in
   * c://access_log to a province, and stores the per-province counts in MySQL.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("IpLocation")
    val sc = new SparkContext(conf)
    try {
      // Rule line fields (pipe-separated): index 2 = range start, 3 = range end, 6 = province.
      val ipRulesRdd = sc.textFile("c://ip.txt").map { line =>
        val fields = line.split("\\|")
        (fields(2), fields(3), fields(6))
      }
      // All rules are collected on the driver and broadcast to the executors.
      val ipRulesArray = ipRulesRdd.collect()
      val ipRulesBroadcast = sc.broadcast(ipRulesArray)
      // Access-log lines are pipe-separated; field index 1 is the client IP.
      val ipsRDD = sc.textFile("c://access_log").map(line => line.split("\\|")(1))
      val result = ipsRDD
        .map(ip => binarySearch(ipRulesBroadcast.value, ip2Long(ip)))
        // Skip IPs that match no rule; indexing the rules with -1 would fail the job.
        .filter(_ >= 0)
        .map(index => (ipRulesBroadcast.value(index)._3, 1))
        .reduceByKey(_ + _)
      // Write each partition to MySQL over its own connection.
      result.foreachPartition(data2MySQL(_))
      // Debug aid: println(result.collect().toBuffer)
    } finally {
      // Stop the context even when the job fails.
      sc.stop()
    }
  }
}
3、从MySQL读取数据到Spark(JdbcRDD)
package cn.itcast.spark.day3
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by ZX on 2016/4/12.
*/
/**
 * Demo of reading rows from MySQL into Spark through `JdbcRDD`.
 */
object JdbcRDDDemo {
  /**
   * Queries table `ta` for ids in the range 1..4 using 2 partitions, maps each row to
   * (id, code) and prints the collected result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // Connection factory handed to JdbcRDD.
      val connection = () => {
        Class.forName("com.mysql.jdbc.Driver").newInstance()
        DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata", "root", "123456")
      }
      val jdbcRDD = new JdbcRDD(
        sc,
        connection,
        // The two placeholders receive each partition's lower and upper id bound.
        "SELECT * FROM ta where id >= ? AND id <= ?",
        1, 4, 2, // lowerBound, upperBound, numPartitions
        r => {
          val id = r.getInt(1)
          val code = r.getString(2)
          (id, code)
        }
      )
      // Collect once and reuse; the original collected twice, running the query twice.
      val jrdd = jdbcRDD.collect()
      println(jrdd.toBuffer)
    } finally {
      // Stop the context even when the query fails.
      sc.stop()
    }
  }
}
三、WordCount会产生几个RDD
// Word count in three named steps: read the input, count the words, save the result.
val inputLines = sc.textFile("hdfs://hadoop01:9000//wordcount/input")
val wordCounts = inputLines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
wordCounts.saveAsTextFile("hdfs://hadoop01:9000/wordcount/out")
WordCount执行流程:
1、textFile会产生两个RDD:HadoopRDD -> MapPartitionsRDD;
2、flatMap会产生一个RDD:MapPartitionsRDD;
3、map会产生一个RDD:MapPartitionsRDD;
4、reduceByKey会产生一个RDD:ShuffledRDD;
5、saveAsTextFile会产生一个RDD:MapPartitionsRDD;
合计:整个WordCount一共产生6个RDD。
四、集群方式debug(远程方式)
提交任务流程:
在Windows环境下debug:
/**
 * Word count submitted to a remote standalone Spark master, used for remote debugging
 * from a development machine.
 */
object WordCount {
  /**
   * Counts the words of the input and saves the (word, count) pairs as text.
   *
   * @param args args(0) is the input path, args(1) is the output path
   */
  def main(args: Array[String]): Unit = {
    // Validate up front so a missing argument does not fail after the context is created.
    if (args.length < 2) {
      System.err.println("Usage: WordCount <input path> <output path>")
      sys.exit(1)
    }
    // SparkConf/SparkContext are the entry point to the Spark cluster.
    val conf = new SparkConf().setAppName("WC")
      // Jar shipped to the cluster so the executors can load this application's classes.
      .setJars(Array("C:\\HelloSpark\\target\\hello-spark-1.0.jar"))
      .setMaster("spark://node-1.itcast.cn:7077")
    val sc = new SparkContext(conf)
    try {
      // textFile produces two RDDs: HadoopRDD -> MapPartitionsRDD.
      // cache() is kept from the original; the RDD is only used once here.
      sc.textFile(args(0)).cache()
        // produces one RDD: MapPartitionsRDD
        .flatMap(_.split(" "))
        // produces one RDD: MapPartitionsRDD
        .map((_, 1))
        // produces one RDD: ShuffledRDD
        .reduceByKey(_ + _)
        // produces one RDD: MapPartitionsRDD
        .saveAsTextFile(args(1))
    } finally {
      // Stop the context even when the job fails.
      sc.stop()
    }
  }
}
五、RDD缓存
cache(): 会把数据缓存到内存当中,cache是一个transformation(懒加载);
unpersist(true): 清空数据缓存;