MapReduce案例
- 天气案例:细粒度介绍计算框架
- FOF案例:MR与数据模型
- PageRank案例:
- TFIDF案例:
- ItemCF案例:
一、需求
找出每个月气温最高的2天
思路:
二、代码编写
2.1、首先编写客户端程序
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TQMR {

	/**
	 * Job driver: wires the mapper, partitioner, sort/group comparators and
	 * reducer together and submits the "top-2 temperatures per month" job.
	 */
	public static void main(String[] args) throws Exception {
		// 1 -- configuration; "true" loads the cluster *-site.xml resources.
		Configuration conf = new Configuration(true);
		// Required when submitting straight from Eclipse, otherwise the job
		// jar cannot be located on the cluster:
		// conf.set("mapred.jar","I:\\10_code\\00_mapred.jar\\TQMR.jar");

		// 2 -- job
		Job job = Job.getInstance(conf);
		job.setJarByClass(TQMR.class);

		// 3 -- input / output paths (output is deleted first so reruns work)
		Path input = new Path("/tq/input/");
		FileInputFormat.addInputPath(job, input);
		Path output = new Path("/tq/output");
		if (output.getFileSystem(conf).exists(output)) {
			output.getFileSystem(conf).delete(output, true);
		}
		FileOutputFormat.setOutputPath(job, output);

		// 4 -- map side
		job.setMapperClass(TqMapper.class);
		job.setMapOutputKeyClass(TQ.class);
		job.setMapOutputValueClass(Text.class);

		// 5 -- shuffle: partition, sort and group
		job.setPartitionerClass(TqPartitioner.class);
		job.setSortComparatorClass(TqSortComparator.class);
		job.setGroupingComparatorClass(TqGroupingComparator.class);
		// A combiner cannot reuse TqReducer here: the reducer emits Text keys
		// while the shuffle expects TQ keys, which fails with
		// "wrong key class: class org.apache.hadoop.io.Text is not class
		// com.lxk.hadoop.mr.weather.TQ".
		// job.setCombinerClass(TqReducer.class);

		// 6 -- reduce side
		job.setReducerClass(TqReducer.class);
		// One reduce task per partition produced by TqPartitioner; fewer
		// reduce tasks than partitions would funnel everything into
		// part-r-00000.
		job.setNumReduceTasks(2);

		// 7 -- submit and block until completion (verbose progress output)
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
2.2、编写MapTask输出key的bean
package com.lxk.hadoop.mr.weather;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite map-output key: a weather record's date plus its temperature.
 * The temperature rides along in the key so {@code TqSortComparator} can
 * order records within a month by it during the shuffle sort.
 */
public class TQ implements WritableComparable<TQ> {

	private int year;
	private int month;
	private int day;
	private int wd; // temperature ("wendu")

	public int getYear() {
		return year;
	}

	public void setYear(int year) {
		this.year = year;
	}

	public int getMonth() {
		return month;
	}

	public void setMonth(int month) {
		this.month = month;
	}

	public int getDay() {
		return day;
	}

	public void setDay(int day) {
		this.day = day;
	}

	public int getWd() {
		return wd;
	}

	public void setWd(int wd) {
		this.wd = wd;
	}

	/** Serializes the four fields in a fixed order. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(year);
		out.writeInt(month);
		out.writeInt(day);
		out.writeInt(wd);
	}

	/** Deserializes the fields in the same order as {@link #write}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.year = in.readInt();
		this.month = in.readInt();
		this.day = in.readInt();
		this.wd = in.readInt();
	}

	/**
	 * Natural order: ascending by year, then month, then day.
	 * Temperature is deliberately ignored here; the shuffle's actual sort is
	 * overridden by TqSortComparator.
	 */
	@Override
	public int compareTo(TQ that) {
		int c1 = Integer.compare(this.getYear(), that.getYear());
		if (c1 == 0) {
			int c2 = Integer.compare(this.getMonth(), that.getMonth());
			if (c2 == 0) {
				return Integer.compare(this.getDay(), that.getDay());
			}
			return c2;
		}
		return c1;
	}
}
2.3、编写MapperClass
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
/**
 * Parses one input line ("1949-10-01 14:21:02\t34c") into a TQ key
 * (year/month/day/temperature) and a Text value holding the temperature.
 */
public class TqMapper extends Mapper<LongWritable, Text, TQ, Text> {

	// Reused across map() calls to avoid a fresh allocation per record.
	TQ tq = new TQ();
	Text vwd = new Text();
	// SimpleDateFormat is costly to construct and not thread-safe, but one
	// instance per mapper is fine: each map task runs single-threaded.
	private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// value: 1949-10-01 14:21:02 34c >> TQ
		try {
			String[] strs = StringUtils.split(value.toString(), '\t');
			// Date part: "yyyy-MM-dd" is parsed; the trailing time is ignored
			// by the format pattern.
			Date date = sdf.parse(strs[0]);
			Calendar cal = Calendar.getInstance();
			cal.setTime(date);
			tq.setYear(cal.get(Calendar.YEAR));
			tq.setMonth(cal.get(Calendar.MONTH) + 1); // Calendar months are 0-based
			tq.setDay(cal.get(Calendar.DAY_OF_MONTH));
			// Temperature: strip the trailing unit character ("34c" -> 34).
			int wd = Integer.parseInt(strs[1].substring(0, strs[1].length() - 1));
			tq.setWd(wd);
			vwd.set(wd + "");
			context.write(tq, vwd);
		} catch (ParseException e) {
			// Skip malformed records instead of failing the whole task.
			e.printStackTrace();
		}
	}
}
2.4、编写PartitionerClass--设置2个分区
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes records by year so that all records of one year land in the same
 * reduce task (the job runs with 2 reducers, so years alternate partitions).
 */
public class TqPartitioner extends Partitioner<TQ, Text> {

	@Override
	public int getPartition(TQ key, Text value, int numPartitions) {
		// Mask the sign bit (same trick as Hadoop's HashPartitioner) so the
		// index is guaranteed non-negative; identical result for real years.
		return (key.getYear() & Integer.MAX_VALUE) % numPartitions;
	}
}
2.5、编写分区排序规则--SortComparatorClass
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Shuffle sort order: year ascending, month ascending, temperature
 * descending — so the hottest record of each month is the first one the
 * reducer sees for that group.
 */
public class TqSortComparator extends WritableComparator {

	public TqSortComparator() {
		// "true" asks WritableComparator to instantiate TQ objects for
		// deserialized comparison.
		super(TQ.class, true);
	}

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		TQ left = (TQ) a;
		TQ right = (TQ) b;
		int byYear = Integer.compare(left.getYear(), right.getYear());
		if (byYear != 0) {
			return byYear;
		}
		int byMonth = Integer.compare(left.getMonth(), right.getMonth());
		if (byMonth != 0) {
			return byMonth;
		}
		// Reversed operand order = descending temperature.
		return Integer.compare(right.getWd(), left.getWd());
	}
}
2.6、编写分组规则--GroupingComparatorClass
package com.lxk.hadoop.mr.weather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class TqGroupingComparator extends WritableComparator {
public TqGroupingComparator() {
super(TQ.class, true);
}
public int compare(WritableComparable a, WritableComparable b) {
TQ t1 = (TQ) a;
TQ t2 = (TQ) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
return Integer.compare(t1.getMonth(), t2.getMonth());
}
return c1;
}
}
2.7、最后编写reduce程序
package com.lxk.hadoop.mr.weather;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Emits the two hottest *distinct days* of each (year, month) group.
// Relies on the shuffle setup: TqGroupingComparator groups by year+month,
// TqSortComparator orders each group by temperature descending, and Hadoop
// reuses the `key` object — its day/wd fields are refilled with the current
// record as `vals` is iterated.
public class TqReducer extends Reducer<TQ, Text, Text, Text> {
// Output key/value buffers, reused across reduce() calls.
Text rkey = new Text();
Text rval = new Text();
@Override
protected void reduce(TQ key, Iterable<Text> vals, Context context) throws IOException, InterruptedException {
// flg: 0 until the first (hottest) day has been written; day: that day's day-of-month.
int flg = 0;
int day = 0;
// Take at most two values (top-2 distinct days) out of each month group.
for (Text v : vals) {
// First record of the group = hottest temperature of the month.
if (flg == 0) {
day = key.getDay();
rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
rval.set(key.getWd() + "");
context.write(rkey, rval);
flg++;
}
// `key` mutates as iteration advances; the first record on a *different*
// day is the second-hottest day — emit it and stop consuming the group.
if (flg != 0 && day != key.getDay()) {
rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
rval.set(key.getWd() + "");
context.write(rkey, rval);
break;
}
}
}
}
三、eclipse集成测试
2019-09-30 11:03:10,182 INFO mapreduce.Job (Job.java:submit(1301)) - The url to track the job: http://node03:8088/proxy/application_1569651414091_0006/
2019-09-30 11:03:10,183 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1346)) - Running job: job_1569651414091_0006
2019-09-30 11:03:17,381 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1367)) - Job job_1569651414091_0006 running in uber mode : false
2019-09-30 11:03:17,382 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 0% reduce 0%
2019-09-30 11:03:23,440 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 100% reduce 0%
2019-09-30 11:03:29,482 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1374)) - map 100% reduce 100%
2019-09-30 11:03:30,495 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1385)) - Job job_1569651414091_0006 completed successfully
2019-09-30 11:03:30,594 INFO mapreduce.Job (Job.java:monitorAndPrintJob(1392)) - Counters: 49
File System Counters
FILE: Number of bytes read=243
FILE: Number of bytes written=328045
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=380
HDFS: Number of bytes written=101
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=1
Launched reduce tasks=2
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=21168
Total time spent by all reduces in occupied slots (ms)=53766
Total time spent by all map tasks (ms)=3528
Total time spent by all reduce tasks (ms)=8961
Total vcore-milliseconds taken by all map tasks=3528
Total vcore-milliseconds taken by all reduce tasks=8961
Total megabyte-milliseconds taken by all map tasks=3612672
Total megabyte-milliseconds taken by all reduce tasks=9176064
Map-Reduce Framework
Map input records=11
Map output records=11
Map output bytes=209
Map output materialized bytes=243
Input split bytes=107
Combine input records=0
Combine output records=0
Reduce input groups=5
Reduce shuffle bytes=243
Reduce input records=11
Reduce output records=8
Spilled Records=22
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=230
CPU time spent (ms)=2020
Physical memory (bytes) snapshot=398262272
Virtual memory (bytes) snapshot=6191116288
Total committed heap usage (bytes)=196284416
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=273
File Output Format Counters
Bytes Written=101
四、Linux服务打包测试
hadoop jar TQMR.jar com.lxk.hadoop.mr.weather.TQMR /tq/input /tq/out