Flink SQL: Parsing Complex JSON (Nested Objects, Arrays, Multiple Arrays)
- Flink version: 1.12.0
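- For reference, the examples below should compile with dependencies along these lines. This is a minimal sketch of a build.sbt, not taken from the original post; it assumes Scala 2.12, so verify artifact suffixes and versions against your own setup:

// Hypothetical minimal build.sbt for the Flink 1.12.0 examples in this post (assumes Scala 2.12).
ThisBuild / scalaVersion := "2.12.12"

val flinkVersion = "1.12.0"

libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-streaming-scala"        % flinkVersion,
  "org.apache.flink" %% "flink-clients"                % flinkVersion,
  "org.apache.flink" %% "flink-table-api-scala-bridge" % flinkVersion,
  "org.apache.flink" %% "flink-table-planner-blink"    % flinkVersion,
  "org.apache.flink" %% "flink-connector-kafka"        % flinkVersion,
  "org.apache.flink" %  "flink-json"                   % flinkVersion  // flink-json has no Scala suffix
)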
1. Nested JSON object
- The data source is Kafka; the JSON messages look like this:
{"id":"1","info":"1234_5678","subid":{"tid1":11,"info1":"info1test"}}
{"id":"2","info":"1234_5678","subid":{"tid1":12,"info1":"info1test"}}
- Flink SQL:
import org.apache.flink.api.common.time.Time
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, TableResult}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object QianTaoJson {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Flink 1.12 still ships two planners; use the Blink planner in streaming mode.
    val setting: EnvironmentSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
    val tEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, setting)
    tEnv.getConfig.getConfiguration.setString("parallelism.default", "5")
    // Expire idle state after 15 days (min) / 16 days (max).
    tEnv.getConfig.setIdleStateRetentionTime(Time.days(15), Time.days(16))
    // Map the nested JSON object to a ROW type in the DDL.
    // Note: 'properties.group.id' is required for Kafka sources in Flink 1.12; the value here is a placeholder.
    tEnv.executeSql(
      """
        |create temporary table kafkatable(
        |  id string,
        |  info string,
        |  subid row<
        |    tid1 int,
        |    info1 string>
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'flinksplit',
        |  'properties.bootstrap.servers' = '***:9092',
        |  'properties.group.id' = 'flinksplit-group',
        |  'value.format' = 'json'
        |)
        |""".stripMargin)
    // Nested fields are read with dot notation: subid.tid1, subid.info1.
    val result: TableResult = tEnv.executeSql(
      """
        |select
        |  id,
        |  info,
        |  subid.tid1,
        |  subid.info1
        |from kafkatable
        |""".stripMargin)
    result.print()
  }
}
- Result:
+----+--------------------------------+--------------------------------+-------------+--------------------------------+
| op | id | info | tid1 | info1 |
+----+--------------------------------+--------------------------------+-------------+--------------------------------+
| +I | 1 | 1234_5678 | 11 | info1test |
| +I | 2 | 1234_5678 | 12 | info1test |
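- The ROW mapping nests to any depth: if subid itself contained another object, you would declare a ROW inside the ROW and chain the dot access. The sketch below is hypothetical (the detail field, deeptable name, and payload are made up for illustration) and reuses the tEnv from the example above:

// Hypothetical payload: {"id":"1","subid":{"tid1":11,"detail":{"code":7,"msg":"ok"}}}
tEnv.executeSql(
  """
    |create temporary table deeptable(
    |  id string,
    |  subid row<
    |    tid1 int,
    |    detail row<
    |      code int,
    |      msg string>>
    |) WITH (
    |  'connector' = 'kafka',
    |  'topic' = 'flinksplit',
    |  'properties.bootstrap.servers' = '***:9092',
    |  'properties.group.id' = 'flinksplit-group',
    |  'value.format' = 'json'
    |)
    |""".stripMargin)
// Dot access chains through every level of nesting.
tEnv.executeSql("select id, subid.tid1, subid.detail.code, subid.detail.msg from deeptable").print()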
2. JSON array
- Kafka source data:
{"id":1,"arr1":[{"tid1":11,"info1":"info1test"},{"tid1":21,"info1":"info1test1"}]}
- Flink SQL:
import org.apache.flink.api.common.time.Time
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, TableResult}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object ArrayTest {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val setting: EnvironmentSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
    val tEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, setting)
    tEnv.getConfig.getConfiguration.setString("parallelism.default", "5")
    tEnv.getConfig.setIdleStateRetentionTime(Time.days(15), Time.days(16))
    // Map the JSON array of objects to ARRAY<ROW<...>> in the DDL.
    tEnv.executeSql(
      """
        |create temporary table kafkatable(
        |  id int,
        |  arr1 array<row<
        |    tid1 int,
        |    info1 string
        |  >>
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'flinksplit',
        |  'properties.bootstrap.servers' = '***:9092',
        |  'properties.group.id' = 'flinksplit-group',
        |  'value.format' = 'json'
        |)
        |""".stripMargin)
    // CROSS JOIN UNNEST turns every array element into its own output row.
    val result: TableResult = tEnv.executeSql(
      """
        |select
        |  kt.id,
        |  t1.tid1,
        |  t1.info1
        |from kafkatable kt cross join unnest(arr1) as t1(
        |  tid1,
        |  info1)
        |""".stripMargin)
    result.print()
  }
}
- Result:
+----+-------------+-------------+--------------------------------+
| op | id | tid1 | info1 |
+----+-------------+-------------+--------------------------------+
| +I | 1 | 11 | info1test |
| +I | 1 | 21 | info1test1 |
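- UNNEST produces one output row per array element. If you only need a specific element, you can skip the join entirely: Flink SQL arrays are 1-indexed, so arr1[1] is the first element, and CARDINALITY returns the array length. A small sketch against the same kafkatable (field access on an indexed element works with the Blink planner in my experience, but treat this as an assumption to verify):

// Fetch just the first array element plus the array length (arrays are 1-indexed in Flink SQL).
tEnv.executeSql(
  """
    |select
    |  id,
    |  arr1[1].tid1 as first_tid1,
    |  arr1[1].info1 as first_info1,
    |  cardinality(arr1) as arr_len
    |from kafkatable
    |""".stripMargin).print()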
3. Handling two arrays
- Kafka source data:
{"id":1,"arr1":[{"tid1":11,"info1":"info1test"},{"tid1":21,"info1":"info1test1"}],"arr2":[{"tid2":12,"info2":"info2test"},{"tid2":22,"info2":"info2test1"}]}
- Flink SQL:
import org.apache.flink.api.common.time.Time
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, TableResult}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object DoubleArrayTest {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val setting: EnvironmentSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
    val tEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, setting)
    tEnv.getConfig.getConfiguration.setString("parallelism.default", "5")
    tEnv.getConfig.setIdleStateRetentionTime(Time.days(15), Time.days(16))
    // Both JSON arrays become ARRAY<ROW<...>> columns.
    tEnv.executeSql(
      """
        |create temporary table flinksplit(
        |  id int,
        |  arr1 array<row<
        |    tid1 int,
        |    info1 string
        |  >>,
        |  arr2 array<row<
        |    tid2 int,
        |    info2 string
        |  >>
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'flinksplit',
        |  'properties.bootstrap.servers' = '***:9092',
        |  'properties.group.id' = 'flinksplit-group',
        |  'value.format' = 'json'
        |)
        |""".stripMargin)
    // Two UNNESTs yield the Cartesian product of the two arrays.
    val result: TableResult = tEnv.executeSql(
      """
        |select
        |  fs.id,
        |  t1.tid1,
        |  t1.info1,
        |  t2.tid2,
        |  t2.info2
        |from flinksplit fs CROSS JOIN UNNEST(`arr1`) AS t1(
        |  tid1,
        |  info1)
        |CROSS JOIN UNNEST(`arr2`) AS t2(
        |  tid2,
        |  info2
        |)
        |""".stripMargin)
    result.print()
  }
}
- The result below is the full Cartesian product of the two arrays; add a WHERE clause to filter out the combinations you need (see the sketch after the table):
+----+-------------+-------------+--------------------------------+-------------+--------------------------------+
| op | id | tid1 | info1 | tid2 | info2 |
+----+-------------+-------------+--------------------------------+-------------+--------------------------------+
| +I | 1 | 11 | info1test | 12 | info2test |
| +I | 1 | 11 | info1test | 22 | info2test1 |
| +I | 1 | 21 | info1test1 | 12 | info2test |
| +I | 1 | 21 | info1test1 | 22 | info2test1 |
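- For instance, with this sample data the positionally matching pairs happen to differ by exactly 1 in their ids, so a toy predicate like the one below keeps only (11, 12) and (21, 22). The filter is specific to this sample data, purely for illustration; in practice you would filter on whatever relates the two arrays in your payload:

// Trim the Cartesian product with a predicate over both unnested sides.
tEnv.executeSql(
  """
    |select fs.id, t1.tid1, t1.info1, t2.tid2, t2.info2
    |from flinksplit fs
    |CROSS JOIN UNNEST(`arr1`) AS t1(tid1, info1)
    |CROSS JOIN UNNEST(`arr2`) AS t2(tid2, info2)
    |where t2.tid2 = t1.tid1 + 1
    |""".stripMargin).print()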