Real-time analytics on the business database:
The business database is MySQL (a cluster). If analytical SQL is run directly against it, complex queries (multi-dimensional aggregations, joins, large data volumes) drag its performance down, and in the worst case normal business operations (ordinary queries, inserts and updates) slow down or fail outright.
Offline: MySQL -----> Sqoop / SparkSQL / DataX -----> HDFS (Hive)
Real-time: MySQL -----> Canal (data synchronization tool) -----> Kafka
For installing and configuring Canal, see the separate note Canal安装与配置.
一、Importing the business data into Kafka
One table maps to one Kafka topic.
1、Create the topics, one per table (see the CLI sketch after this list)
Note: avoid underscores and dots in topic names (Kafka treats '.' and '_' as equivalent in metric names, so mixing them can cause collisions).
2、Modify the Canal instance configuration (see the properties sketch after this list)
vi instance.properties
Start Canal…
3、Start Kafka
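As a reference, the two topics used by the code below could be created with Kafka's own CLI roughly like this (broker address, partition count and replication factor are placeholders for your environment; Kafka versions before 2.2 use --zookeeper instead of --bootstrap-server):

kafka-topics.sh --create --bootstrap-server node1:9092 --topic ordermain --partitions 3 --replication-factor 2
kafka-topics.sh --create --bootstrap-server node1:9092 --topic orderdetail --partitions 3 --replication-factor 2

And a minimal sketch of the instance.properties entries that matter here, assuming Canal 1.1.x with Kafka mode already enabled in canal.properties (canal.serverMode = kafka and the broker list configured there); the MySQL address, credentials and the doit schema name are placeholders:

canal.instance.master.address=172.16.100.100:3306
canal.instance.dbUsername=canal
canal.instance.dbPassword=canal
# only capture the two order tables
canal.instance.filter.regex=doit\\.ordermain,doit\\.orderdetail
# route each table's binlog to the topic of the same name
canal.mq.dynamicTopic=ordermain:doit\\.ordermain,orderdetail:doit\\.orderdetail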
二、Two-stream join of the order tables
The order detail table is joined with the order main table: the detail table is the left stream (left table) and the order main table is the right stream (right table). To join two streams in Flink there are window joins and interval joins; to get a left outer join, a window CoGroup is used here, with windows assigned by event time. In some extreme cases data arrives late and is simply dropped: the window Join and CoGroup cannot get at late data, so records may be lost.
Solution:
Before the CoGroup, first assign the left stream to a window of the same length and type as the CoGroup window. Because the windows are identical, any record that is late for the CoGroup window is also late for this preceding window, so it can be tagged there and emitted through a side output. The late records are then unioned with the joined stream, and wherever the right side is null, the database is queried to fill in the right-stream information.
The ultimate solution: modify the Flink source so that the window stream produced by Join/CoGroup exposes a sideOutputLateData method; late records are tagged with the given OutputTag and then retrieved via the side output.
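Purely to illustrate the intent, a sketch of what the usage might look like after such a patch. This is a hypothetical API: stock Flink's CoGroupedStreams does not expose sideOutputLateData, and apply(...) would also have to return a SingleOutputStreamOperator so that getSideOutput can be called; LeftJoinCoGroupFunction stands in for the left-outer-join CoGroupFunction shown in the code below.

// hypothetical usage after patching Flink's CoGroupedStreams.WithWindow
OutputTag<OrderDetail> lateTag = new OutputTag<OrderDetail>("late-data") {};
SingleOutputStreamOperator<Tuple2<OrderDetail, OrderMain>> joined = orderDetailStream
        .coGroup(orderMainStream)
        .where(OrderDetail::getOrder_id)
        .equalTo(OrderMain::getOid)
        .window(TumblingEventTimeWindows.of(Time.seconds(5)))
        .sideOutputLateData(lateTag)           // the method added by the patch
        .apply(new LeftJoinCoGroupFunction()); // same left-outer-join CoGroupFunction as below
DataStream<OrderDetail> lateDetails = joined.getSideOutput(lateTag); // late records, no longer lost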
方式一: Ignore late data
package cn._51doit.flink.day10;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class OrderJoin {
public static void main(String[] args) throws Exception {
ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);
//use event time as the stream time characteristic
FlinkUtilsV2.getEnv().setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStream<String> orderMainLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "ordermain", "g1", SimpleStringSchema.class);
DataStream<String> orderDetailLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "orderdetail", "g1", SimpleStringSchema.class);
//parse the raw canal JSON lines into OrderMain beans
SingleOutputStreamOperator<OrderMain> orderMainDataStream = orderMainLinesDataStream.process(new ProcessFunction<String, OrderMain>() {
@Override
public void processElement(String line, Context ctx, Collector<OrderMain> out) throws Exception {
//flatMap+filter
try {
JSONObject jsonObject = JSON.parseObject(line);
String type = jsonObject.getString("type");
if (type.equals("INSERT") || type.equals("UPDATE")) {
JSONArray jsonArray = jsonObject.getJSONArray("data");
for (int i = 0; i < jsonArray.size(); i++) {
OrderMain orderMain = jsonArray.getObject(i, OrderMain.class);
orderMain.setType(type); //record the operation type (INSERT/UPDATE)
out.collect(orderMain);
}
}
} catch (Exception e) {
//e.printStackTrace();
//record the malformed data
}
}
});
//parse the raw canal JSON lines into OrderDetail beans
SingleOutputStreamOperator<OrderDetail> orderDetailDataStream = orderDetailLinesDataStream.process(new ProcessFunction<String, OrderDetail>() {
@Override
public void processElement(String line, Context ctx, Collector<OrderDetail> out) throws Exception {
//flatMap+filter
try {
JSONObject jsonObject = JSON.parseObject(line);
String type = jsonObject.getString("type");
if (type.equals("INSERT") || type.equals("UPDATE")) {
JSONArray jsonArray = jsonObject.getJSONArray("data");
for (int i = 0; i < jsonArray.size(); i++) {
OrderDetail orderDetail = jsonArray.getObject(i, OrderDetail.class);
orderDetail.setType(type); //record the operation type (INSERT/UPDATE)
out.collect(orderDetail);
}
}
} catch (Exception e) {
//e.printStackTrace();
//record the malformed data
}
}
});
int delaySeconds = 2;
//extract event timestamps and generate watermarks
SingleOutputStreamOperator<OrderMain> orderMainStreamWithWaterMark = orderMainDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderMain>(Time.seconds(delaySeconds)) {
@Override
public long extractTimestamp(OrderMain element) {
return element.getCreate_time().getTime();
}
});
SingleOutputStreamOperator<OrderDetail> orderDetailStreamWithWaterMark = orderDetailDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderDetail>(Time.seconds(delaySeconds)) {
@Override
public long extractTimestamp(OrderDetail element) {
return element.getCreate_time().getTime();
}
});
//left outer join, with the order detail table as the left side
DataStream<Tuple2<OrderDetail, OrderMain>> joined = orderDetailStreamWithWaterMark.coGroup(orderMainStreamWithWaterMark)
.where(new KeySelector<OrderDetail, Long>() {
@Override
public Long getKey(OrderDetail value) throws Exception {
return value.getOrder_id();
}
})
.equalTo(new KeySelector<OrderMain, Long>() {
@Override
public Long getKey(OrderMain value) throws Exception {
return value.getOid();
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
.apply(new CoGroupFunction<OrderDetail, OrderMain, Tuple2<OrderDetail, OrderMain>>() {
@Override
public void coGroup(Iterable<OrderDetail> first, Iterable<OrderMain> second, Collector<Tuple2<OrderDetail, OrderMain>> out) throws Exception {
for (OrderDetail orderDetail : first) {
boolean isJoined = false;
for (OrderMain orderMain : second) {
out.collect(Tuple2.of(orderDetail, orderMain));
isJoined = true;
}
if (!isJoined) {
out.collect(Tuple2.of(orderDetail, null));
}
}
}
});
joined.print();
FlinkUtilsV2.getEnv().execute();
}
}
方式二: Handle late data
package cn._51doit.flink.day11;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.*;
public class OrderJoinAdv {
public static void main(String[] args) throws Exception {
ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);
FlinkUtilsV2.getEnv().setParallelism(1);
//use event time as the stream time characteristic
FlinkUtilsV2.getEnv().setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStream<String> orderMainLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "ordermain", "g1", SimpleStringSchema.class);
DataStream<String> orderDetailLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "orderdetail", "g1", SimpleStringSchema.class);
//parse the raw canal JSON lines into OrderMain beans
SingleOutputStreamOperator<OrderMain> orderMainDataStream = orderMainLinesDataStream.process(new ProcessFunction<String, OrderMain>() {
@Override
public void processElement(String line, Context ctx, Collector<OrderMain> out) throws Exception {
//flatMap+filter
try {
JSONObject jsonObject = JSON.parseObject(line);
String type = jsonObject.getString("type");
if (type.equals("INSERT") || type.equals("UPDATE")) {
JSONArray jsonArray = jsonObject.getJSONArray("data");
for (int i = 0; i < jsonArray.size(); i++) {
OrderMain orderMain = jsonArray.getObject(i, OrderMain.class);
orderMain.setType(type); //record the operation type (INSERT/UPDATE)
out.collect(orderMain);
}
}
} catch (Exception e) {
//e.printStackTrace();
//record the malformed data
}
}
});
//parse the raw canal JSON lines into OrderDetail beans
SingleOutputStreamOperator<OrderDetail> orderDetailDataStream = orderDetailLinesDataStream.process(new ProcessFunction<String, OrderDetail>() {
@Override
public void processElement(String line, Context ctx, Collector<OrderDetail> out) throws Exception {
//flatMap+filter
try {
JSONObject jsonObject = JSON.parseObject(line);
String type = jsonObject.getString("type");
if (type.equals("INSERT") || type.equals("UPDATE")) {
JSONArray jsonArray = jsonObject.getJSONArray("data");
for (int i = 0; i < jsonArray.size(); i++) {
OrderDetail orderDetail = jsonArray.getObject(i, OrderDetail.class);
orderDetail.setType(type); //record the operation type (INSERT/UPDATE)
out.collect(orderDetail);
}
}
} catch (Exception e) {
//e.printStackTrace();
//record the malformed data
}
}
});
int delaySeconds = 2;
int windowSize = 5;
//extract event timestamps and generate watermarks
SingleOutputStreamOperator<OrderMain> orderMainStreamWithWaterMark = orderMainDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderMain>(Time.seconds(delaySeconds)) {
@Override
public long extractTimestamp(OrderMain element) {
return element.getCreate_time().getTime();
}
});
SingleOutputStreamOperator<OrderDetail> orderDetailStreamWithWaterMark = orderDetailDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderDetail>(Time.seconds(delaySeconds)) {
@Override
public long extractTimestamp(OrderDetail element) {
return element.getCreate_time().getTime();
}
});
//OutputTag used to emit late records to a side output
OutputTag<OrderDetail> lateTag = new OutputTag<OrderDetail>("late-date") {};
//window the left stream separately, with the same window length as the CoGroup window
SingleOutputStreamOperator<OrderDetail> orderDetailWithWindow = orderDetailStreamWithWaterMark.windowAll(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
.sideOutputLateData(lateTag) //tag the late records
.apply(new AllWindowFunction<OrderDetail, OrderDetail, TimeWindow>() {
@Override
public void apply(TimeWindow window, Iterable<OrderDetail> values, Collector<OrderDetail> out) throws Exception {
for (OrderDetail value : values) {
out.collect(value);
}
}
});
//fetch the late records from the side output
DataStream<OrderDetail> lateOrderDetailStream = orderDetailWithWindow.getSideOutput(lateTag);
//since there are not many late orderDetail records, async I/O is unnecessary; a plain RichMapFunction is enough
SingleOutputStreamOperator<Tuple2<OrderDetail, OrderMain>> lateOrderDetailAndOrderMain = lateOrderDetailStream.map(new RichMapFunction<OrderDetail, Tuple2<OrderDetail, OrderMain>>() {
@Override
public Tuple2<OrderDetail, OrderMain> map(OrderDetail detail) throws Exception {
return Tuple2.of(detail, null);
}
});
//left outer join, with the order detail table as the left side
DataStream<Tuple2<OrderDetail, OrderMain>> joined = orderDetailWithWindow.coGroup(orderMainStreamWithWaterMark)
.where(new KeySelector<OrderDetail, Long>() {
@Override
public Long getKey(OrderDetail value) throws Exception {
return value.getOrder_id();
}
})
.equalTo(new KeySelector<OrderMain, Long>() {
@Override
public Long getKey(OrderMain value) throws Exception {
return value.getOid();
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
.apply(new CoGroupFunction<OrderDetail, OrderMain, Tuple2<OrderDetail, OrderMain>>() {
@Override
public void coGroup(Iterable<OrderDetail> first, Iterable<OrderMain> second, Collector<Tuple2<OrderDetail, OrderMain>> out) throws Exception {
for (OrderDetail orderDetail : first) {
boolean isJoined = false;
for (OrderMain orderMain : second) {
out.collect(Tuple2.of(orderDetail, orderMain));
isJoined = true;
}
if (!isJoined) {
out.collect(Tuple2.of(orderDetail, null));
}
}
}
});
joined.union(lateOrderDetailAndOrderMain).map(new RichMapFunction<Tuple2<OrderDetail, OrderMain>, Tuple2<OrderDetail, OrderMain>>() {
private transient Connection connection;
@Override
public void open(Configuration parameters) throws Exception {
//create the database connection here
connection = DriverManager.getConnection("jdbc:mysql://172.16.100.100:3306/doit?characterEncoding=UTF-8", "root", "123456");
}
@Override
public Tuple2<OrderDetail, OrderMain> map(Tuple2<OrderDetail, OrderMain> tp) throws Exception {
//for records that did not join an order main record, look it up in the database
if (tp.f1 == null) {
tp.f1 = queryOrderMainFromMySQL(tp.f0.getOrder_id(), connection);
}
return tp;
}
@Override
public void close() throws Exception {
//close the database connection
connection.close();
}
}).print();
FlinkUtilsV2.getEnv().execute();
}
private static OrderMain queryOrderMainFromMySQL(Long order_id, Connection connection) throws Exception {
PreparedStatement preparedStatement = connection.prepareStatement("SELECT * FROM ordermain WHERE oid = ?");
//bind the query parameter
preparedStatement.setLong(1, order_id);
//run the query
ResultSet resultSet = preparedStatement.executeQuery();
//read the (at most one) matching row
OrderMain orderMain = null;
if (resultSet.next()) {
long oid = resultSet.getLong("oid");
Date createTime = resultSet.getDate("create_time");
double totalMoney = resultSet.getDouble("total_money");
int status = resultSet.getInt("status");
orderMain = new OrderMain();
orderMain.setOid(oid);
orderMain.setStatus(status);
//createTime and totalMoney could be set as well if OrderMain exposes the corresponding setters
}
resultSet.close();
preparedStatement.close();
return orderMain;
}
}
三、Handling late data in a two-stream join by modifying the Flink source
Omitted.
四、Joining more than two streams
For example, joining three streams (group-buy detail LEFT JOIN order main LEFT JOIN group-buy main): Flink's window Join and CoGroup do not support joining more than two streams in a single window.
Solution 1:
Join/CoGroup the first two streams, then enrich the joined stream by querying the database for the remaining table's information (async I/O; it still hits the database, so it is relatively slow). See the sketch below.
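A sketch of solution 1 under assumed names: firstJoined is the DataStream<Tuple2<GroupDetail, OrderMain>> produced by CoGrouping the first two streams (same pattern as the order join above; it is also shown in the solution-2 sketch below), GroupDetail/GroupMain/queryGroupMainFromMySQL are illustrative, and the JDBC lookup is pushed onto a thread pool because plain JDBC is blocking (imports from org.apache.flink.streaming.api.functions.async and java.util.concurrent omitted):

//enrich the two-stream join result with the group-buy main table via Flink's async I/O
DataStream<Tuple3<GroupDetail, OrderMain, GroupMain>> enriched = AsyncDataStream.unorderedWait(
        firstJoined,
        new RichAsyncFunction<Tuple2<GroupDetail, OrderMain>, Tuple3<GroupDetail, OrderMain, GroupMain>>() {
            private transient ExecutorService executor;
            private transient Connection connection;

            @Override
            public void open(Configuration parameters) throws Exception {
                executor = Executors.newFixedThreadPool(10);
                //illustrative connection; in practice use a connection pool (a single Connection is not safe for concurrent use)
                connection = DriverManager.getConnection("jdbc:mysql://172.16.100.100:3306/doit?characterEncoding=UTF-8", "root", "123456");
            }

            @Override
            public void asyncInvoke(Tuple2<GroupDetail, OrderMain> tp,
                                    ResultFuture<Tuple3<GroupDetail, OrderMain, GroupMain>> resultFuture) {
                //JDBC is blocking, so run the lookup off the task thread
                executor.submit(() -> {
                    try {
                        GroupMain groupMain = queryGroupMainFromMySQL(tp.f0.getGroup_id(), connection); //illustrative helper
                        resultFuture.complete(Collections.singleton(Tuple3.of(tp.f0, tp.f1, groupMain)));
                    } catch (Exception e) {
                        resultFuture.completeExceptionally(e);
                    }
                });
            }

            @Override
            public void close() throws Exception {
                executor.shutdown();
                connection.close();
            }
        },
        3000, TimeUnit.MILLISECONDS, //timeout per lookup
        100);                        //max in-flight requests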
Solution 2:
Join/CoGroup the first two streams, then Join the resulting stream with the third stream in another window (two windows in total; the data lives in window state, which is more efficient). See the sketch below.
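And a sketch of solution 2 with the same assumed streams and POJOs (groupDetailStream, orderMainStream and groupMainStream already carry event-time watermarks; the getters and key fields are illustrative). Because the window operator emits its results with the window's end timestamp and forwards watermarks, a second event-time window can simply follow the first:

//first window: group-buy detail LEFT JOIN order main
DataStream<Tuple2<GroupDetail, OrderMain>> firstJoined = groupDetailStream
        .coGroup(orderMainStream)
        .where(GroupDetail::getOrder_id)
        .equalTo(OrderMain::getOid)
        .window(TumblingEventTimeWindows.of(Time.seconds(5)))
        .apply(new CoGroupFunction<GroupDetail, OrderMain, Tuple2<GroupDetail, OrderMain>>() {
            @Override
            public void coGroup(Iterable<GroupDetail> first, Iterable<OrderMain> second,
                                Collector<Tuple2<GroupDetail, OrderMain>> out) {
                for (GroupDetail d : first) {
                    boolean joined = false;
                    for (OrderMain m : second) {
                        out.collect(Tuple2.of(d, m));
                        joined = true;
                    }
                    if (!joined) {
                        out.collect(Tuple2.of(d, null)); //left outer join
                    }
                }
            }
        });

//second window: the joined stream LEFT JOIN group-buy main, keyed on the group id
DataStream<Tuple3<GroupDetail, OrderMain, GroupMain>> fullJoined = firstJoined
        .coGroup(groupMainStream)
        .where(tp -> tp.f0.getGroup_id())
        .equalTo(GroupMain::getGroup_id)
        .window(TumblingEventTimeWindows.of(Time.seconds(5)))
        .apply(new CoGroupFunction<Tuple2<GroupDetail, OrderMain>, GroupMain, Tuple3<GroupDetail, OrderMain, GroupMain>>() {
            @Override
            public void coGroup(Iterable<Tuple2<GroupDetail, OrderMain>> first, Iterable<GroupMain> second,
                                Collector<Tuple3<GroupDetail, OrderMain, GroupMain>> out) {
                for (Tuple2<GroupDetail, OrderMain> tp : first) {
                    boolean joined = false;
                    for (GroupMain g : second) {
                        out.collect(Tuple3.of(tp.f0, tp.f1, g));
                        joined = true;
                    }
                    if (!joined) {
                        out.collect(Tuple3.of(tp.f0, tp.f1, null));
                    }
                }
            }
        });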
五、Splitting data with side outputs
package cn._51doit.flink.day10;
import cn._51doit.flink.day09.KafkaStringSerializationSchema;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.time.ZoneId;
import java.util.Properties;
/**
* Splitting data with side outputs
*
* What the job does: real-time ETL (ingest, filter, join dimension data, select fields, mask sensitive fields, split the data, convert formats/types)
*
* 1. Use a RichMapFunction to join dimension data (can be optimized with async I/O)
* 2. Split the data with side (bypass) outputs
* 3. Write the data back to Kafka
* Kafka has high throughput and supports exactly-once [FlinkKafkaProducer does a two-phase commit by extending TwoPhaseCommitSinkFunction],
* which implements the CheckpointedFunction and CheckpointListener interfaces, so the transaction is committed only after the checkpoint succeeds and is rolled back if the checkpoint fails
* 4. (Optional) also write the main stream to HDFS with a bulk sink in Parquet format
*/
public class PreETLAndTopicSplit {
public static void main(String[] args) throws Exception{
System.setProperty("HADOOP_USER_NAME", "root");
ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);
DataStream<String> lines = FlinkUtils.createKafkaStream(parameters, SimpleStringSchema.class);
//register the parameters as global job parameters
FlinkUtils.env.getConfig().setGlobalJobParameters(parameters);
SingleOutputStreamOperator<LogBean> beanDataStream = lines.map(new ToJSONMapFunction());
SingleOutputStreamOperator<LogBean> filteredStream = beanDataStream.filter(new FilterFunction<LogBean>() {
@Override
public boolean filter(LogBean bean) throws Exception {
return bean != null;
}
});
//split the stream
//formerly done with split + select; now side outputs are used
//tag for traffic (flow) records
OutputTag<LogBean> flowOutputTag = new OutputTag<LogBean>("flow-output") {};
//tag for activity records
OutputTag<LogBean> activityOutputTag = new OutputTag<LogBean>("activity-output") {};
SingleOutputStreamOperator<LogBean> mainStream = filteredStream.process(new ProcessFunction<LogBean, LogBean>() {
@Override
public void processElement(LogBean bean, Context ctx, Collector<LogBean> out) throws Exception {
//branch on the log type carried by each record
String logType = bean.getLogType();
if (logType.startsWith("act")) {
//tag as activity data
ctx.output(activityOutputTag, bean);
} else {
//traffic (flow) type
//tag as flow data
ctx.output(flowOutputTag, bean);
}
//every record is also emitted to the main (untagged) stream
out.collect(bean);
}
});
DataStream<LogBean> flowStream = mainStream.getSideOutput(flowOutputTag);
DataStream<LogBean> activityStream = mainStream.getSideOutput(activityOutputTag);
String flowTopic = parameters.getRequired("flow.topic");
Properties properties = parameters.getProperties();
properties.setProperty("transaction.timeout.ms", parameters.getRequired("kafka.transaction.timeout.ms"));
FlinkKafkaProducer<String> kafkaProducer1 = new FlinkKafkaProducer<String>(
flowTopic, //target topic
new KafkaStringSerializationSchema(flowTopic), //serialization schema used when writing to Kafka
properties, //Kafka producer properties
FlinkKafkaProducer.Semantic.EXACTLY_ONCE //write to Kafka with EXACTLY_ONCE semantics
);
//convert the beans to JSON strings, then write them to Kafka
flowStream.map(new MapFunction<LogBean, String>() {
@Override
public String map(LogBean bean) throws Exception {
return JSON.toJSONString(bean);
}
}).addSink(kafkaProducer1);
String activityTopic = parameters.getRequired("activity.topic");
FlinkKafkaProducer<String> kafkaProducer2 = new FlinkKafkaProducer<String>(
activityTopic, //target topic
new KafkaStringSerializationSchema(activityTopic), //serialization schema used when writing to Kafka
properties, //Kafka producer properties
FlinkKafkaProducer.Semantic.EXACTLY_ONCE //write to Kafka with EXACTLY_ONCE semantics
);
activityStream.map(new MapFunction<LogBean, String>() {
@Override
public String map(LogBean bean) throws Exception {
return JSON.toJSONString(bean);
}
}).addSink(kafkaProducer2);
String path = parameters.getRequired("mainstream.hdfs.out.path");
// StreamingFileSink<String> streamingFileSink = StreamingFileSink
// .forRowFormat(new Path(path), new SimpleStringEncoder<String>("UTF-8"))
// .withRollingPolicy(
// DefaultRollingPolicy.builder()
// .withRolloverInterval(TimeUnit.SECONDS.toMillis(30)) //
// .withInactivityInterval(TimeUnit.SECONDS.toMillis(10))
// .withMaxPartSize(1024 * 1024 * 1024)
// .build())
// .build();
//write the main stream to HDFS as JSON text (row format, kept commented out)
// mainStream.map(new MapFunction<LogBean, String>() {
// @Override
// public String map(LogBean bean) throws Exception {
// return JSON.toJSONString(bean);
// }
// }).addSink(streamingFileSink);
//bucket (directory) name format
DateTimeBucketAssigner<LogBean> bucketAssigner = new DateTimeBucketAssigner<>(
"yyyy-MM-dd--HH-mm",
ZoneId.of("Asia/Shanghai"));
//build a StreamingFileSink that writes in bulk mode using the Parquet columnar format
StreamingFileSink<LogBean> streamingFileSink = StreamingFileSink.
forBulkFormat(
new Path(path), //output directory
ParquetAvroWriters.forReflectRecord(LogBean.class) //write as Parquet
)
.withBucketAssigner(bucketAssigner).build();
mainStream.addSink(streamingFileSink);
FlinkUtils.env.execute();
}
}