背景
在Flink开发中,经常会用到状态编程,其中也包括广播状态。广播变量作为K-V类型状态数据,平时使用的基本类型比较多(比如String,Boolean,Byte,Short,Int,Long,Float,Double,Char,Date,Void,BigInteger,BigDecimal,Instant等),以K和V都是String举例,定义如下:
MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<String, String>("testMapState", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
在这次的项目中,基本类型已无法满足业务场景。经过研究,可以在广播状态中使用其他的类型,比如HashMap。定义广播变量的时候,只需要在类型声明处做出调整:
MapStateDescriptor<String, HashMap> mapMapStateDescriptor = new MapStateDescriptor<String, HashMap>("testMapMapState", BasicTypeInfo.STRING_TYPE_INFO, TypeInformation.of(new TypeHint<HashMap>() {
@Override
public TypeInformation<HashMap> getTypeInfo() {
return super.getTypeInfo();
}
}));
当然,这里直接用的是父类的方法,可以不用重写,改造如下:
MapStateDescriptor<String, HashMap> mapMapStateDescriptor = new MapStateDescriptor<String, HashMap>("testMapMapState", BasicTypeInfo.STRING_TYPE_INFO, TypeInformation.of(new TypeHint<HashMap>() {}));
参考官网资料:Apache Flink 1.12 Documentation: Broadcast State 模式
案例说明
下面以案例来说明HashMap在广播变量中的使用
Flink DataStream消费kafka的两个topic,形成两个流,数据格式如下:
topic1:{"name":"zhangsan","province":"anhui","city":"hefei"}
topic2:{"province":"anhui","city":"hefei","address":"rongchuang"}
topic1 -> stream1,topic2 -> stream2;
topic2的数据作为广播数据;topic1的数据关联topic2的数据,获取address(逻辑可能不严谨,能满足功能测试即可)。
整体代码实现如下:
package flinkbroadcasttest;
import flinkbroadcasttest.processfunction.FlinkBroadcastTestProcess;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.BroadcastConnectedStream;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.HashMap;
import java.util.Properties;
public class FlinkBroadcastTest {

    /**
     * Demo driver for Flink broadcast state with a HashMap value type.
     *
     * <p>Reads two Kafka topics: topic1 feeds the main stream, topic2 is turned
     * into a broadcast stream, and {@link FlinkBroadcastTestProcess} joins the
     * two to enrich topic1 records with an address.
     */
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Disable operator chaining globally so each operator is visible on its own in the UI.
        env.disableOperatorChaining();

        final String brokers = "kafka-log1.test.xl.com:9092,kafka-log2.test.xl.com:9092,kafka-log3.test.xl.com:9092";
        final String mainTopic = "0000-topic1";
        final String broadcastTopic = "0000-topic2";
        final String groupId = "demo";

        // Shared Kafka consumer configuration for both sources.
        final Properties consumerProps = new Properties();
        consumerProps.setProperty("bootstrap.servers", brokers);
        consumerProps.setProperty("group.id", groupId);
        consumerProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        consumerProps.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        consumerProps.put("auto.offset.reset", "earliest");
        consumerProps.put("max.poll.records", 1000);
        consumerProps.put("session.timeout.ms", 90000);
        consumerProps.put("request.timeout.ms", 120000);
        consumerProps.put("enable.auto.commit", true);
        consumerProps.put("auto.commit.interval.ms", 100);

        // Main stream: topic1 records to be enriched.
        final FlinkKafkaConsumer<String> mainConsumer =
                new FlinkKafkaConsumer<String>(mainTopic, new SimpleStringSchema(), consumerProps);
        mainConsumer.setCommitOffsetsOnCheckpoints(true);
        final DataStream<String> mainStream = env.addSource(mainConsumer);

        // Broadcast source: topic2 records carrying province/city/address data.
        final FlinkKafkaConsumer<String> broadcastConsumer =
                new FlinkKafkaConsumer<String>(broadcastTopic, new SimpleStringSchema(), consumerProps);
        broadcastConsumer.setCommitOffsetsOnCheckpoints(true);
        final DataStream<String> broadcastSource = env.addSource(broadcastConsumer);

        // Broadcast state layout: province -> (city -> address).
        // NOTE(review): the descriptor name and type information must stay identical to the
        // descriptor declared inside FlinkBroadcastTestProcess.
        final MapStateDescriptor<String, HashMap> stateDescriptor =
                new MapStateDescriptor<String, HashMap>(
                        "testMapMapState",
                        BasicTypeInfo.STRING_TYPE_INFO,
                        TypeInformation.of(new TypeHint<HashMap>() {}));

        final BroadcastStream<String> broadcast = broadcastSource.broadcast(stateDescriptor);
        final BroadcastConnectedStream<String, String> connected = mainStream.connect(broadcast);

        final DataStream<String> enriched = connected.process(new FlinkBroadcastTestProcess());
        enriched.print();

        env.execute("FlinkBroadcastTest");
    }
}
package flinkbroadcasttest.processfunction;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashMap;
public class FlinkBroadcastTestProcess extends BroadcastProcessFunction<String, String, String> {

    /**
     * Broadcast state layout: province -> (city -> address).
     * Name and type information must match the descriptor used by the driver
     * (FlinkBroadcastTest) when creating the broadcast stream, otherwise
     * getBroadcastState() resolves a different state.
     */
    MapStateDescriptor<String, HashMap> mapMapStateDescriptor = new MapStateDescriptor<String, HashMap>("testMapMapState", BasicTypeInfo.STRING_TYPE_INFO, TypeInformation.of(new TypeHint<HashMap>() {}));

    /**
     * Handles main-stream (topic1) records: looks up the address for the
     * record's (province, city) in the broadcast state and, when found, emits
     * an enriched JSON record containing name/province/city/address.
     */
    @Override
    public void processElement(String value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
        try {
            ReadOnlyBroadcastState<String, HashMap> broadcastState = ctx.getBroadcastState(mapMapStateDescriptor);
            JSONObject obj = JSON.parseObject(value);
            String name = obj.getString("name");
            String province = obj.getString("province");
            String city = obj.getString("city");
            HashMap hashMap = broadcastState.get(province);
            if (hashMap != null && hashMap.containsKey(city)) {
                String address = hashMap.get(city).toString();
                JSONObject object = new JSONObject();
                // BUG FIX: the original wrote "name" back into the input object
                // (obj.put) instead of the output object, so the emitted record
                // was missing the "name" field.
                object.put("name", name);
                object.put("province", province);
                object.put("city", city);
                object.put("address", address);
                out.collect(object.toString());
            }
        } catch (Exception e) {
            // Best-effort enrichment: a malformed record is logged and skipped.
            e.printStackTrace();
        }
    }

    /**
     * Handles broadcast-side (topic2) records: maintains the
     * province -> (city -> address) mapping in broadcast state.
     *
     * <p>kind == "delete" removes the city entry; any other value (including a
     * missing "kind" field) upserts the address.
     */
    @Override
    public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
        try {
            BroadcastState<String, HashMap> broadcastState = ctx.getBroadcastState(mapMapStateDescriptor);
            JSONObject obj = JSON.parseObject(value);
            String province = obj.getString("province");
            String city = obj.getString("city");
            String address = obj.getString("address");
            String kind = obj.getString("kind");
            HashMap hashMap = broadcastState.get(province);
            // BUG FIX: the original called kind.equals(...), which threw an NPE
            // for records without a "kind" field — e.g. the documented sample
            // {"province":...,"city":...,"address":...} — so the state was never
            // populated (the NPE was swallowed by the catch below). Compare with
            // the constant first and default missing/unknown kinds to an upsert.
            if ("delete".equals(kind)) {
                if (hashMap != null && hashMap.containsKey(city)) {
                    hashMap.remove(city);
                    broadcastState.put(province, hashMap);
                }
            } else {
                if (hashMap == null) {
                    hashMap = new HashMap();
                }
                hashMap.put(city, address);
                broadcastState.put(province, hashMap);
            }
        } catch (Exception e) {
            // Best-effort update: a malformed broadcast record is logged and skipped.
            e.printStackTrace();
        }
    }
}