MapReduce案例
- 天气案例:细粒度介绍计算框架
- FOF案例:MR与数据模型
- PageRank案例:
- TFIDF案例:
- ItemCF案例:
一、需求
找出每个月气温最高的2天
思路:
二、代码编写
2.1、首先编写客户端程序
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TQMR {

	/**
	 * Job driver: wires the mapper, partitioner, sort/group comparators and
	 * reducer together and submits the "top-2 temperatures per month" job.
	 */
	public static void main(String[] args) throws Exception {
		// 1 -- configuration; "true" loads the cluster *-site.xml resources.
		Configuration conf = new Configuration(true);
		// Required when submitting straight from Eclipse, otherwise the job
		// jar cannot be located on the cluster:
		// conf.set("mapred.jar","I:\\10_code\\00_mapred.jar\\TQMR.jar");

		// 2 -- job
		Job job = Job.getInstance(conf);
		job.setJarByClass(TQMR.class);

		// 3 -- input / output paths (output is deleted first so reruns work)
		Path input = new Path("/tq/input/");
		FileInputFormat.addInputPath(job, input);
		Path output = new Path("/tq/output");
		if (output.getFileSystem(conf).exists(output)) {
			output.getFileSystem(conf).delete(output, true);
		}
		FileOutputFormat.setOutputPath(job, output);

		// 4 -- map side
		job.setMapperClass(TqMapper.class);
		job.setMapOutputKeyClass(TQ.class);
		job.setMapOutputValueClass(Text.class);

		// 5 -- shuffle: partition, sort and group
		job.setPartitionerClass(TqPartitioner.class);
		job.setSortComparatorClass(TqSortComparator.class);
		job.setGroupingComparatorClass(TqGroupingComparator.class);
		// A combiner cannot reuse TqReducer here: the reducer emits Text keys
		// while the shuffle expects TQ keys, which fails with
		// "wrong key class: class org.apache.hadoop.io.Text is not class
		// com.lxk.hadoop.mr.weather.TQ".
		// job.setCombinerClass(TqReducer.class);

		// 6 -- reduce side
		job.setReducerClass(TqReducer.class);
		// One reduce task per partition produced by TqPartitioner; fewer
		// reduce tasks than partitions would funnel everything into
		// part-r-00000.
		job.setNumReduceTasks(2);

		// 7 -- submit and block until completion (verbose progress output)
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
2.2、编写MapTask输出key的bean
package com.lxk.hadoop.mr.weather;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite map-output key: a weather record's date plus its temperature.
 * The temperature rides along in the key so {@code TqSortComparator} can
 * order records within a month by it during the shuffle sort.
 */
public class TQ implements WritableComparable<TQ> {

	private int year;
	private int month;
	private int day;
	private int wd; // temperature ("wendu")

	public int getYear() {
		return year;
	}

	public void setYear(int year) {
		this.year = year;
	}

	public int getMonth() {
		return month;
	}

	public void setMonth(int month) {
		this.month = month;
	}

	public int getDay() {
		return day;
	}

	public void setDay(int day) {
		this.day = day;
	}

	public int getWd() {
		return wd;
	}

	public void setWd(int wd) {
		this.wd = wd;
	}

	/** Serializes the four fields in a fixed order. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(year);
		out.writeInt(month);
		out.writeInt(day);
		out.writeInt(wd);
	}

	/** Deserializes the fields in the same order as {@link #write}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.year = in.readInt();
		this.month = in.readInt();
		this.day = in.readInt();
		this.wd = in.readInt();
	}

	/**
	 * Natural order: ascending by year, then month, then day.
	 * Temperature is deliberately ignored here; the shuffle's actual sort is
	 * overridden by TqSortComparator.
	 */
	@Override
	public int compareTo(TQ that) {
		int c1 = Integer.compare(this.getYear(), that.getYear());
		if (c1 == 0) {
			int c2 = Integer.compare(this.getMonth(), that.getMonth());
			if (c2 == 0) {
				return Integer.compare(this.getDay(), that.getDay());
			}
			return c2;
		}
		return c1;
	}
}
2.3、编写MapperClass
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
/**
 * Parses one input line ("1949-10-01 14:21:02\t34c") into a TQ key
 * (year/month/day/temperature) and a Text value holding the temperature.
 */
public class TqMapper extends Mapper<LongWritable, Text, TQ, Text> {

	// Reused across map() calls to avoid a fresh allocation per record.
	TQ tq = new TQ();
	Text vwd = new Text();
	// SimpleDateFormat is costly to construct and not thread-safe, but one
	// instance per mapper is fine: each map task runs single-threaded.
	private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// value: 1949-10-01 14:21:02 34c >> TQ
		try {
			String[] strs = StringUtils.split(value.toString(), '\t');
			// Date part: "yyyy-MM-dd" is parsed; the trailing time is ignored
			// by the format pattern.
			Date date = sdf.parse(strs[0]);
			Calendar cal = Calendar.getInstance();
			cal.setTime(date);
			tq.setYear(cal.get(Calendar.YEAR));
			tq.setMonth(cal.get(Calendar.MONTH) + 1); // Calendar months are 0-based
			tq.setDay(cal.get(Calendar.DAY_OF_MONTH));
			// Temperature: strip the trailing unit character ("34c" -> 34).
			int wd = Integer.parseInt(strs[1].substring(0, strs[1].length() - 1));
			tq.setWd(wd);
			vwd.set(wd + "");
			context.write(tq, vwd);
		} catch (ParseException e) {
			// Skip malformed records instead of failing the whole task.
			e.printStackTrace();
		}
	}
}
2.4、编写PartitionerClass--设置2个分区
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes records by year so that all records of one year land in the same
 * reduce task (the job runs with 2 reducers, so years alternate partitions).
 */
public class TqPartitioner extends Partitioner<TQ, Text> {

	@Override
	public int getPartition(TQ key, Text value, int numPartitions) {
		// Mask the sign bit (same trick as Hadoop's HashPartitioner) so the
		// index is guaranteed non-negative; identical result for real years.
		return (key.getYear() & Integer.MAX_VALUE) % numPartitions;
	}
}
2.5、编写分区排序规则--SortComparatorClass
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Shuffle sort order: year ascending, month ascending, temperature
 * descending — so the hottest record of each month is the first one the
 * reducer sees for that group.
 */
public class TqSortComparator extends WritableComparator {

	public TqSortComparator() {
		// "true" asks WritableComparator to instantiate TQ objects for
		// deserialized comparison.
		super(TQ.class, true);
	}

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		TQ left = (TQ) a;
		TQ right = (TQ) b;
		int byYear = Integer.compare(left.getYear(), right.getYear());
		if (byYear != 0) {
			return byYear;
		}
		int byMonth = Integer.compare(left.getMonth(), right.getMonth());
		if (byMonth != 0) {
			return byMonth;
		}
		// Reversed operand order = descending temperature.
		return Integer.compare(right.getWd(), left.getWd());
	}
}
2.6、编写分组规则--GroupingComparatorClass
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class TqGroupingComparator extends WritableComparator {
public TqGroupingComparator() {
super(TQ.class, true);
}
public int compare(WritableComparable a, WritableComparable b) {
TQ t1 = (TQ) a;
TQ t2 = (TQ) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
return Integer.compare(t1.getMonth(), t2.getMonth());
}
return c1;
}
}
2.7、最后编写reduce程序
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Emits the two hottest *distinct days* of each (year, month) group.
// Relies on the shuffle setup: TqGroupingComparator groups by year+month,
// TqSortComparator orders each group by temperature descending, and Hadoop
// reuses the `key` object — its day/wd fields are refilled with the current
// record as `vals` is iterated.
public class TqReducer extends Reducer<TQ, Text, Text, Text> {
// Output key/value buffers, reused across reduce() calls.
Text rkey = new Text();
Text rval = new Text();
@Override
protected void reduce(TQ key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
// flg: 0 until the first (hottest) day has been written; day: that day's day-of-month.
int flg = 0;
int day = 0;
// Take at most two values (top-2 distinct days) out of each month group.
for (Text v : vals) {
// First record of the group = hottest temperature of the month.
if (flg == 0) {
day = key.getDay();
rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
rval.set(key.getWd() + "");
context.write(rkey, rval);
flg++;
}
// `key` mutates as iteration advances; the first record on a *different*
// day is the second-hottest day — emit it and stop consuming the group.
if (flg != 0 && day != key.getDay()) {
rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
rval.set(key.getWd() + "");
context.write(rkey, rval);
break;
}
}
}
}
三、eclipse集成测试
2019-09-30 11:03:10,182 INFO mapreduce.Job (Job.java:submit(1301)) - The url to track the job: http://node03:8088/proxy/application_1569651414091_0006/
2019-09-30 11:03:10,183 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1346)) - Running job: job_1569651414091_0006
2019-09-30 11:03:17,381 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1367)) - Job job_1569651414091_0006 running in uber mode : false
2019-09-30 11:03:17,382 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 0% reduce 0%
2019-09-30 11:03:23,440 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 100% reduce 0%
2019-09-30 11:03:29,482 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 100% reduce 100%
2019-09-30 11:03:30,495 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1385)) - Job job_1569651414091_0006 completed successfully
2019-09-30 11:03:30,594 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1392)) - Counters: 49
File System Counters
FILE: Number of bytes read=243
FILE: Number of bytes written=328045
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=380
HDFS: Number of bytes written=101
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=1
Launched reduce tasks=2
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=21168
Total time spent by all reduces in occupied slots (ms)=53766
Total time spent by all map tasks (ms)=3528
Total time spent by all reduce tasks (ms)=8961
Total vcore-milliseconds taken by all map tasks=3528
Total vcore-milliseconds taken by all reduce tasks=8961
Total megabyte-milliseconds taken by all map tasks=3612672
Total megabyte-milliseconds taken by all reduce tasks=9176064
Map-Reduce Framework
Map input records=11
Map output records=11
Map output bytes=209
Map output materialized bytes=243
Input split bytes=107
Combine input records=0
Combine output records=0
Reduce input groups=5
Reduce shuffle bytes=243
Reduce input records=11
Reduce output records=8
Spilled Records=22
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=230
CPU time spent (ms)=2020
Physical memory (bytes) snapshot=398262272
Virtual memory (bytes) snapshot=6191116288
Total committed heap usage (bytes)=196284416
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=273
File Output Format Counters
Bytes Written=101
四、Linux服务打包测试
hadoop jar TQMR.jar com.lxk.hadoop.mr.weather.TQMR /tq/input /tq/out