coalesce[ˌkəʊəˈles]:改变 RDD 的分区数
/*
* coalesce(numPartitions, shuffle) 的第二个参数 shuffle:
* false:不产生 shuffle
* true:产生 shuffle
* 如果重分区的数量大于原来的分区数量,必须设置为 true,否则分区数不变
* 增加分区时,原来各分区中的数据会被随机分配到新设置的各个分区中
*/
// Spread `result` over 6 partitions. Shuffle must be enabled for coalesce to be able to
// increase the partition count; without it the count stays unchanged.
val coalesceRdd = result.coalesce(6, shuffle = true)
// Tag every element with the index of the partition it now lives in. The iterator is mapped
// lazily so a whole partition is never buffered in memory at once.
val results = coalesceRdd.mapPartitionsWithIndex { (index, elems) =>
  elems.map(elem => s"partition:$index content:[$elem]")
}
println(s"分区数量:${results.partitions.length}")
// Pull the tagged elements back to the driver and print them one per line.
val resultArr = results.collect()
resultArr.foreach(println)
结果:
分区数量:6
partition:0 content:[partition:1 content:Tom07]
partition:0 content:[partition:2 content:Tom10]
partition:1 content:[partition:0 content:Tom01]
partition:1 content:[partition:1 content:Tom08]
partition:1 content:[partition:2 content:Tom11]
partition:2 content:[partition:0 content:Tom02]
partition:2 content:[partition:2 content:Tom12]
partition:3 content:[partition:0 content:Tom03]
partition:4 content:[partition:0 content:Tom04]
partition:4 content:[partition:1 content:Tom05]
partition:5 content:[partition:1 content:Tom06]
partition:5 content:[partition:2 content:Tom09]
val coalesceRdd = result.coalesce(6,false)的结果是:
分区数量:3
partition:0 content:[partition:0 content:Tom01]
partition:0 content:[partition:0 content:Tom02]
partition:0 content:[partition:0 content:Tom03]
partition:0 content:[partition:0 content:Tom04]
partition:1 content:[partition:1 content:Tom05]
partition:1 content:[partition:1 content:Tom06]
partition:1 content:[partition:1 content:Tom07]
partition:1 content:[partition:1 content:Tom08]
partition:2 content:[partition:2 content:Tom09]
partition:2 content:[partition:2 content:Tom10]
partition:2 content:[partition:2 content:Tom11]
partition:2 content:[partition:2 content:Tom12]
val coalesceRdd = result.coalesce(2,false)的结果是:
分区数量:2
partition:0 content:[partition:0 content:Tom01]
partition:0 content:[partition:0 content:Tom02]
partition:0 content:[partition:0 content:Tom03]
partition:0 content:[partition:0 content:Tom04]
partition:1 content:[partition:1 content:Tom05]
partition:1 content:[partition:1 content:Tom06]
partition:1 content:[partition:1 content:Tom07]
partition:1 content:[partition:1 content:Tom08]
partition:1 content:[partition:2 content:Tom09]
partition:1 content:[partition:2 content:Tom10]
partition:1 content:[partition:2 content:Tom11]
partition:1 content:[partition:2 content:Tom12]
val coalesceRdd = result.coalesce(2,true)的结果是:
分区数量:2
partition:0 content:[partition:0 content:Tom01]
partition:0 content:[partition:0 content:Tom03]
partition:0 content:[partition:1 content:Tom05]
partition:0 content:[partition:1 content:Tom07]
partition:0 content:[partition:2 content:Tom09]
partition:0 content:[partition:2 content:Tom11]
partition:1 content:[partition:0 content:Tom02]
partition:1 content:[partition:0 content:Tom04]
partition:1 content:[partition:1 content:Tom06]
partition:1 content:[partition:1 content:Tom08]
partition:1 content:[partition:2 content:Tom10]
partition:1 content:[partition:2 content:Tom12]
详细图示:
repartition:改变 RDD 分区数
repartition(int n) = coalesce(int n, true)
partitionBy:通过自定义分区器改变 RDD 分区数
// Repartition nameRDD with a custom Partitioner that routes each record by the parity of its
// Integer key.
JavaPairRDD<Integer, String> partitionByRDD = nameRDD.partitionBy(new Partitioner() {
    private static final long serialVersionUID = 1L;

    /** The resulting RDD always has two partitions. */
    @Override
    public int numPartitions() {
        return 2;
    }

    /** Sends even keys to partition 0 and odd keys to partition 1. */
    @Override
    public int getPartition(Object obj) {
        int key = (int) obj;
        return (key % 2 == 0) ? 0 : 1;
    }
});