Hive Practical Training
Requirements:
Compute the usual metrics for a video-sharing site, including various Top-N statistics:
– Top 10 videos by view count
– Top 10 video categories by popularity
– For the 20 most-viewed videos: their categories, and how many of those Top-20 videos each category contains
– Category ranking of the videos related to the Top-50 most-viewed videos
– Top 10 hottest videos within each category, using Music as an example
– Top 10 most-viewed videos within each category
– Top 10 users by number of uploaded videos, and the 20 most-viewed videos among their uploads
Data structure
The project mainly involves two tables: a video table and a user table.
Video table
Field | Note | Description |
---|---|---|
videoId | Unique video id (String) | An 11-character string |
uploader | Video uploader (String) | Username of the user who uploaded the video |
age | Video age (int) | Number of whole days the video has been on the platform |
category | Video categories (Array) | Categories assigned to the video when it was uploaded |
length | Video length (Int) | Length of the video, as an integer |
views | View count (Int) | Number of times the video has been viewed |
rate | Video rating (Double) | Rating out of 5 |
ratings | Traffic (Int) | Video traffic, as an integer |
comments | Comment count (Int) | Number of comments on the video, as an integer |
relatedId | Related video ids (Array) | Ids of related videos, at most 20 |
# A look at the video table data
# videoId uploader age category length views rate ratings comments relatedId
SDNkMu8ZT68 w00dy911 630 People & Blogs 186 10181 3.49 494 257 rjnbgpPJUks
Note: a record may contain multiple relatedId values, and in the raw data they too are separated by tabs (the ETL step below joins them with "&").
User table
Field | Note | Type |
---|---|---|
uploader | Uploader username | string |
videos | Number of uploaded videos | int |
friends | Number of friends | int |
# A look at the user table data
# uploader videos friends
barelypolitical 151 5106
Preparation
ETL data cleaning
The raw data is not usable out of the box: the "&" inside category names and the tab-separated related ids have to be cleaned into the format we want first.
ETL utility class for processing each record
package com.pihao.etl.util;

/**
 * Utility class for cleaning the raw video data.
 */
public class EtlUtils {
    /**
     * Cleans one raw record of the Guli video data.
     * Cleaning rules:
     * 1. Drop records with fewer than 9 fields.
     * 2. Remove the spaces inside the category field, e.g. "People & Blogs" -> "People&Blogs".
     * 3. Join the related video ids with "&".
     */
    public static String processLine(String line){
        StringBuilder resultData = new StringBuilder();
        // 1. Split the line on tabs
        String[] datas = line.split("\t");
        // 2. Drop records with fewer than 9 fields
        if(datas.length < 9){
            return null;
        }
        // 3. Remove the spaces inside the category field
        datas[3] = datas[3].replaceAll(" ", "");
        // 4. Rebuild the record, joining the related video ids with "&"
        for (int i = 0; i < datas.length; i++) {
            if(i < 9){
                // 4.1 the first 9 fields stay tab-separated
                if(i == datas.length - 1){
                    // last field of a record that has no related videos
                    resultData.append(datas[i]);
                }else{
                    resultData.append(datas[i]).append("\t");
                }
            }else{
                // 4.2 related video ids are joined with "&"
                if(i == datas.length - 1){
                    resultData.append(datas[i]);
                }else{
                    resultData.append(datas[i]).append("&");
                }
            }
        }
        return resultData.toString();
    }
}
ETL Mapper
package com.pihao.etl;
import com.pihao.etl.util.EtlUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class VideoMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private Text outputKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String afterProcessLine = EtlUtils.processLine(line);
        if(afterProcessLine == null){
            return;
        }
        // The record is valid, write it out
        outputKey.set(afterProcessLine);
        context.write(outputKey, NullWritable.get());
    }
}
ETL Driver
package com.pihao.etl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class VideoDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(VideoDriver.class);
        job.setMapperClass(VideoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0); // the cleaning job is map-only, no reducer needed
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
Package the Maven project (for example with mvn package) and copy the jar to hadoop102, into the /opt/module/hive-3.1.2/datas directory.
[atguigu@hadoop102 datas]$ pwd
/opt/module/hive-3.1.2/datas
[atguigu@hadoop102 datas]$ ll
total 18644
-rw-rw-r--. 1 atguigu atguigu 266 Oct 7 17:52 business.txt
-rw-rw-r--. 1 atguigu atguigu 69 Oct 1 20:11 dept.txt
-rw-rw-r--. 1 atguigu atguigu 612 Oct 1 20:19 emp.txt
-rw-r--r--. 1 atguigu atguigu 4633 Oct 9 21:47 guliVideo-1.0-SNAPSHOT.jar
[atguigu@hadoop102 datas]$
Upload the raw data to HDFS
Upload user.txt and the five video .txt files to /gulivideo/user and /gulivideo/video respectively.
Any upload method works: the HDFS web UI or a command-line upload (for example hadoop fs -put).
Run the jar to execute the cleaning job
[atguigu@hadoop102 datas]$ hadoop jar guliVideo-1.0-SNAPSHOT.jar com.pihao.etl.VideoDriver /gulivideo/video /gulivideo/video/etl-video
Once the job finishes, the cleaned files appear under the output directory /gulivideo/video/etl-video.
Preparing the tables
Tables to create
The final tables use ORC storage with compression, while the cleaned data is plain text, so one conversion is needed: load the text data into raw tables first, then insert-select it into the compressed tables.
Raw data tables: gulivideo_ori, gulivideo_user_ori
Final tables: gulivideo_orc, gulivideo_user_orc
Create the raw tables
(1)gulivideo_ori
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&" -- after ETL the related video ids are joined with "&"
stored as textfile; -- the raw data is plain text
(2)gulivideo_user_ori
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
Create the ORC tables with Snappy compression
(1)gulivideo_orc
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
(2)gulivideo_user_orc
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
(3) Load the data into the ori tables
load data inpath "/gulivideo/video/etl-video" into table gulivideo_ori;
load data inpath "/gulivideo/user" into table gulivideo_user_ori;
(4) Insert the data into the ORC tables
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
OK, at this point the data is ready and all the tables have been created. Next we can write the HQL for each requirement.
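Optionally, run a quick sanity check first to confirm that both ORC tables were actually populated and that the storage properties look right:
-- row counts of the two ORC tables
select count(*) from gulivideo_orc;
select count(*) from gulivideo_user_orc;
-- verify ORC storage and the SNAPPY compression property
desc formatted gulivideo_orc;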
I. Top 10 videos by view count
select
videoId,
views
from
gulivideo_orc
order by views desc
limit 10 ;
+--------------+-----------+
| videoid | views |
+--------------+-----------+
| dMH0bHeiRNg | 42513417 |
| 0XxI-hvPRRA | 20282464 |
| 1dmVU08zVpA | 16087899 |
| RB-wUgnyGv0 | 15712924 |
| QjA5faZF1A8 | 15256922 |
| -_CSo1gOd48 | 13199833 |
| 49IDp76kjPw | 11970018 |
| tYnn51C3X_w | 11823701 |
| pv5zWaTEVkI | 11672017 |
| D2kJZOfq7zk | 11184051 |
+--------------+-----------+
II. Top 10 video categories by popularity (popularity measured as the number of videos in each category)
1) Explode the video categories ("=>t1" at the end of a query means it is reused as subquery t1 in the next step)
select
videoId,
category_name
from gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name =>t1
2) Group by category and count the videos in each group
select
t1.category_name,
count(t1.videoId) hot
from
(
select
videoId,
category_name
from gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name
)t1
group by t1.category_name
order by hot desc
limit 10 ;
+-------------------+---------+
| t1.category_name | hot |
+-------------------+---------+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Animation | 73293 |
| Film | 73293 |
| Sports | 67329 |
| Games | 59817 |
| Gadgets | 59817 |
| People | 48890 |
| Blogs | 48890 |
+-------------------+---------+
People/Blogs, Film/Animation, and Games/Gadgets show identical counts because each pair is a single raw category containing "&" (e.g. "People & Blogs"): after the ETL removes the spaces, the "&" collection delimiter splits it into two array elements.
III. Categories of the 20 most-viewed videos, and how many of those Top-20 videos each category contains
1) The 20 most-viewed videos and their categories
select
videoId,
views ,
category
from
gulivideo_orc
order by views desc
limit 20 =>t1
+--------------+-----------+---------------------+
| videoid | views | category |
+--------------+-----------+---------------------+
| dMH0bHeiRNg | 42513417 | ["Comedy"] |
| 0XxI-hvPRRA | 20282464 | ["Comedy"] |
| 1dmVU08zVpA | 16087899 | ["Entertainment"] |
| RB-wUgnyGv0 | 15712924 | ["Entertainment"] |
| QjA5faZF1A8 | 15256922 | ["Music"] |
| -_CSo1gOd48 | 13199833 | ["People","Blogs"] |
| 49IDp76kjPw | 11970018 | ["Comedy"] |
| tYnn51C3X_w | 11823701 | ["Music"] |
| pv5zWaTEVkI | 11672017 | ["Music"] |
| D2kJZOfq7zk | 11184051 | ["People","Blogs"] |
| vr3x_RRJdd4 | 10786529 | ["Entertainment"] |
| lsO6D1rwrKc | 10334975 | ["Entertainment"] |
| 5P6UU6m3cqk | 10107491 | ["Comedy"] |
| 8bbTtPL1jRs | 9579911 | ["Music"] |
| _BuRwH59oAo | 9566609 | ["Comedy"] |
| aRNzWyD7C9o | 8825788 | ["UNA"] |
| UMf40daefsI | 7533070 | ["Music"] |
| ixsZy2425eY | 7456875 | ["Entertainment"] |
| MNxwAU_xAMk | 7066676 | ["Comedy"] |
| RUCZJVJ_M8o | 6952767 | ["Entertainment"] |
+--------------+-----------+---------------------+
2) Explode the video categories
select
t1.videoId,
category_name
from
(
select
videoId,
views ,
category
from
gulivideo_orc
order by views desc
limit 20
)t1
lateral view explode(t1.category) t1_tmp as category_name =>t2
+--------------+----------------+
| t1.videoid | category_name |
+--------------+----------------+
| dMH0bHeiRNg | Comedy |
| 0XxI-hvPRRA | Comedy |
| 1dmVU08zVpA | Entertainment |
| RB-wUgnyGv0 | Entertainment |
| QjA5faZF1A8 | Music |
| -_CSo1gOd48 | People |
| -_CSo1gOd48 | Blogs |
| 49IDp76kjPw | Comedy |
| tYnn51C3X_w | Music |
| pv5zWaTEVkI | Music |
| D2kJZOfq7zk | People |
| D2kJZOfq7zk | Blogs |
| vr3x_RRJdd4 | Entertainment |
| lsO6D1rwrKc | Entertainment |
| 5P6UU6m3cqk | Comedy |
| 8bbTtPL1jRs | Music |
| _BuRwH59oAo | Comedy |
| aRNzWyD7C9o | UNA |
| UMf40daefsI | Music |
| ixsZy2425eY | Entertainment |
| MNxwAU_xAMk | Comedy |
| RUCZJVJ_M8o | Entertainment |
+--------------+----------------+
3) Group by category and count the videos in each category
select
t2.category_name,
count(t2.videoId) video_num
from
(
select
t1.videoId,
category_name
from
(
select
videoId,
views ,
category
from
gulivideo_orc
order by views desc
limit 20
)t1
lateral view explode(t1.category) t1_tmp as category_name
)t2
group by t2.category_name ;
+-------------------+------------+
| t2.category_name | video_num |
+-------------------+------------+
| Blogs | 2 |
| Comedy | 6 |
| Entertainment | 6 |
| Music | 5 |
| People | 2 |
| UNA | 1 |
+-------------------+------------+
The counts add up to 22 rather than 20 because two of the Top-20 videos carry two categories each.
IV. Category ranking of the videos related to the Top-50 most-viewed videos
1) The Top-50 most-viewed videos and their related video ids
select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50 =>t1
2) Explode the related video ids
select
relatedId_video
from
(select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50)t1
lateral view explode(t1.relatedId) t1_tmp as relatedId_video => t2
3) Join back to the video table to get each related video's categories
select
t2.relatedId_video,
t3.category
from
(
select
relatedId_video
from
(select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50)t1
lateral view explode(t1.relatedId) t1_tmp as relatedId_video
)t2
join
gulivideo_orc t3
on t2.relatedId_video = t3.videoId => t4
4) Explode the categories
select
t4.relatedId_video,
category_name
from
(
select
t2.relatedId_video,
t3.category
from
(
select
relatedId_video
from
(select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50)t1
lateral view explode(t1.relatedId) t1_tmp as relatedId_video
)t2
join
gulivideo_orc t3
on t2.relatedId_video = t3.videoId
)t4
lateral view explode(t4.category) t4_tmp as category_name =>t5
5) Group by category and count
select
t5.category_name,
count(t5.relatedId_video) video_num
from
(
select
t4.relatedId_video,
category_name
from
(
select
t2.relatedId_video,
t3.category
from
(
select
relatedId_video
from
(select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50)t1
lateral view explode(t1.relatedId) t1_tmp as relatedId_video
)t2
join
gulivideo_orc t3
on t2.relatedId_video = t3.videoId
)t4
lateral view explode(t4.category) t4_tmp as category_name
)t5
group by t5.category_name => t6
6) Compute the ranking
select
t6.category_name,
t6.video_num,
rank() over(order by t6.video_num desc ) rk
from
(
select
t5.category_name,
count(t5.relatedId_video) video_num
from
(
select
t4.relatedId_video,
category_name
from
(
select
t2.relatedId_video,
t3.category
from
(
select
relatedId_video
from
(select
videoId,
views,
relatedId
from
gulivideo_orc
order by views desc
limit 50)t1
lateral view explode(t1.relatedId) t1_tmp as relatedId_video
)t2
join
gulivideo_orc t3
on t2.relatedId_video = t3.videoId
)t4
lateral view explode(t4.category) t4_tmp as category_name
)t5
group by t5.category_name
)t6 ;
+-------------------+---------------+-----+
| t6.category_name | t6.video_num | rk |
+-------------------+---------------+-----+
| Comedy | 237 | 1 |
| Entertainment | 216 | 2 |
| Music | 195 | 3 |
| Blogs | 51 | 4 |
| People | 51 | 4 |
| Film | 47 | 6 |
| Animation | 47 | 6 |
| News | 24 | 8 |
| Politics | 24 | 8 |
| Games | 22 | 10 |
| Gadgets | 22 | 10 |
| Sports | 19 | 12 |
| Howto | 14 | 13 |
| DIY | 14 | 13 |
| UNA | 13 | 15 |
| Places | 12 | 16 |
| Travel | 12 | 16 |
| Animals | 11 | 18 |
| Pets | 11 | 18 |
| Autos | 4 | 20 |
| Vehicles | 4 | 20 |
+-------------------+---------------+-----+
Because rank() is used, tied categories share a rank and the next rank is skipped (Blogs and People both rank 4, so Film and Animation rank 6); dense_rank() would give consecutive ranks instead.
V. Top 10 hottest videos in each category, using Music as an example
1) Explode the categories
select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name =>t1
2) Top 10 within the Music category
select
t1.videoId,
t1.category_name,
t1.views
from
(select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name)t1
where t1.category_name = 'Music'
order by t1.views desc
limit 10;
+--------------+-------------------+-----------+
| t1.videoid | t1.category_name | t1.views |
+--------------+-------------------+-----------+
| QjA5faZF1A8 | Music | 15256922 |
| tYnn51C3X_w | Music | 11823701 |
| pv5zWaTEVkI | Music | 11672017 |
| 8bbTtPL1jRs | Music | 9579911 |
| UMf40daefsI | Music | 7533070 |
| -xEzGIuY7kw | Music | 6946033 |
| d6C0bNDqf3Y | Music | 6935578 |
| HSoVKUVOnfQ | Music | 6193057 |
| 3URfWTEPmtE | Music | 5581171 |
| thtmaZnxk_0 | Music | 5142238 |
+--------------+-------------------+-----------+
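The query above hard-codes 'Music'. A small optional variation: the same statement can be reused for any category by substituting a Hive variable (the variable name category below is just an example):
-- pick the category once, then reuse the query
set hivevar:category=Music;
select
t1.videoId,
t1.category_name,
t1.views
from
(select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name)t1
where t1.category_name = '${hivevar:category}'
order by t1.views desc
limit 10;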
VI. Top 10 most-viewed videos in each category
1) Explode the categories
select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name =>t1
2) Use a window function: partition by category, order by views descending, and compute the rank
select
t1.videoId,
t1.category_name,
t1.views,
rank() over(partition by t1.category_name order by t1.views desc ) rk
from
(
select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name
)t1 =>t2
3) Keep the Top 10
select
t2.videoId,
t2.category_name,
t2.views ,
t2.rk
from
(
select
t1.videoId,
t1.category_name,
t1.views ,
rank() over(partition by t1.category_name order by t1.views desc ) rk
from
(
select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name
)t1
)t2
where t2.rk <=10 ;
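Because rank() is used, a category can return more than 10 rows when several videos tie on views around the 10th place. If exactly 10 rows per category are wanted, row_number() can be swapped in, for example:
select
t2.videoId,
t2.category_name,
t2.views,
t2.rn
from
(
select
t1.videoId,
t1.category_name,
t1.views,
row_number() over(partition by t1.category_name order by t1.views desc) rn -- no gaps, no shared ranks
from
(
select
videoId,
category_name,
views
from
gulivideo_orc
lateral view explode(category) gulivideo_orc_tmp as category_name
)t1
)t2
where t2.rn <= 10;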
VII. Top 10 users by number of uploaded videos, and the 20 most-viewed videos among their uploads
1) Top 10 users by number of uploaded videos
select
uploader,
videos
from
gulivideo_user_orc
order by videos desc
limit 10 =>t1
+---------------------+---------+
| uploader | videos |
+---------------------+---------+
| expertvillage | 86228 |
| TourFactory | 49078 |
| myHotelVideo | 33506 |
| AlexanderRodchenko | 24315 |
| VHTStudios | 20230 |
| ephemeral8 | 19498 |
| HSN | 15371 |
| rattanakorn | 12637 |
| Ruchaneewan | 10059 |
| futifu | 9668 |
+---------------------+---------+
2) Join with the video table to get their uploaded videos, then take the 20 most-viewed
select
t1.uploader,
t2.videoId,
t2.views
from
(
select
uploader,
videos
from
gulivideo_user_orc
order by videos desc
limit 10
)t1
join
gulivideo_orc t2
on t1.uploader = t2.uploader
order by t2.views desc
limit 20 ;
+----------------+--------------+-----------+
| t1.uploader | t2.videoid | t2.views |
+----------------+--------------+-----------+
| expertvillage | -IxHBW0YpZw | 39059 |
| expertvillage | BU-fT5XI_8I | 29975 |
| expertvillage | ADOcaBYbMl0 | 26270 |
| expertvillage | yAqsULIDJFE | 25511 |
| expertvillage | vcm-t0TJXNg | 25366 |
| expertvillage | 0KYGFawp14c | 24659 |
| expertvillage | j4DpuPvMLF4 | 22593 |
| expertvillage | Msu4lZb2oeQ | 18822 |
| expertvillage | ZHZVj44rpjE | 16304 |
| expertvillage | foATQY3wovI | 13576 |
| expertvillage | -UnQ8rcBOQs | 13450 |
| expertvillage | crtNd46CDks | 11639 |
| expertvillage | D1leA0JKHhE | 11553 |
| expertvillage | NJu2oG1Wm98 | 11452 |
| expertvillage | CapbXdyv4j4 | 10915 |
| expertvillage | epr5erraEp4 | 10817 |
| expertvillage | IyQoDgaLM7U | 10597 |
| expertvillage | tbZibBnusLQ | 10402 |
| expertvillage | _GnCHodc7mk | 9422 |
| expertvillage | hvEYlSlRitU | 7123 |
+----------------+--------------+-----------+
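Note that all 20 rows come from expertvillage, because the joined rows are ordered by views globally before the limit is applied. If the requirement is instead read as "each of the Top-10 uploaders' own most-viewed videos", a per-uploader window can be used; a possible variant of that reading:
select
t3.uploader,
t3.videoId,
t3.views
from
(
select
t1.uploader,
t2.videoId,
t2.views,
rank() over(partition by t1.uploader order by t2.views desc) rk -- rank each uploader's videos by views
from
(
select
uploader,
videos
from
gulivideo_user_orc
order by videos desc
limit 10
)t1
join
gulivideo_orc t2
on t1.uploader = t2.uploader
)t3
where t3.rk <= 20;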