The DataStream and DataSet APIs obtain their execution environments in the same way:
// DataStream
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// DataSet
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
Dependencies
<!-- Flink core -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.7.2</version>
</dependency>
<!-- Flink streaming (DataStream API) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.7.2</version>
<scope>provided</scope>
</dependency>
1) File-based
readTextFile(path)
Requirement
Count how many times each word appears in a file.
1. Reading from a local file
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class StreamFromFile {
public static void main(String[] args) throws Exception {
// 1. Get the Flink execution environment (DataSet API)
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// 2. Use the execution environment to read the data to be analyzed
String input = "C:\\data\\input\\hello.txt";
DataSource<String> lines = env.readTextFile(input);
// 3、处理数据
//将lines中的每一行元素都施加一个算法
FlatMapOperator<String, Tuple2<String, Integer>> wordAndOne = lines.flatMap(new SplitFun());
// (单词,1)
// b。把相同的单词聚合到一起
UnsortedGrouping<Tuple2<String, Integer>> groupByWord = wordAndOne.groupBy(0);
// c。把聚合到一起的数据累加处理
DataSet<Tuple2<String, Integer>> result = groupByWord.sum(1);
// 4. Output the result
result.print();
// 5. print() on a DataSet already triggers execution, so no env.execute() call is needed here
}
/**
* Splits each incoming line on spaces into individual words,
* then turns every word into a (word, 1) tuple.
*/
static class SplitFun implements FlatMapFunction<String, Tuple2<String, Integer>> {
@Override
public void flatMap(String in, Collector<Tuple2<String, Integer>> out) throws Exception {
// a. Split the line into individual words
String[] words = in.split(" "); //hello you ==> (hello,1) (you,1)
for (String word : words) {
Tuple2<String, Integer> wordAndOne = new Tuple2<>(word, 1);
out.collect(wordAndOne);
}
}
}
}
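Instead of print(), the result can also be written out to a file. A minimal sketch (the output path is a made-up example) that would replace steps 4 and 5 inside main above:
// 4. Write the (word, count) tuples to a text file instead of printing them (hypothetical path)
result.writeAsText("C:\\data\\output\\wordcount");
// 5. Unlike print(), a file sink does not trigger execution by itself, so execute() is required
env.execute("wordcount batch process");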
2. Reading a file from HDFS
Reads a text file line by line according to the TextInputFormat rules and returns its contents.
Tip: reading an HDFS file from a local IDEA project requires:
a. System.setProperty("HADOOP_USER_NAME", "name of the file owner"); (the owner can be found in the HDFS web UI)
b. Configuring Hadoop on Windows (search online for a guide)
c. Dependencies (keep the Scala suffix consistent with the other Flink artifacts):
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-hadoop-compatibility_2.11</artifactId>
<version>1.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
Place the following three files in the resources directory.
First: hdfs-site.xml
<configuration>
<!-- Block replication factor, default 3 -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- Address of the SecondaryNameNode, which assists the NameNode -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>bigdata:9001</value>
</property>
<!-- Local filesystem path where the NameNode stores the fsimage -->
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/opt/media/hadoop-2.6.0/dfs/name</value>
</property>
<!-- Local path(s) where DataNodes store data files; multiple paths can be given, separated by commas -->
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/opt/media/hadoop-2.6.0/dfs/data</value>
</property>
<!-- The following two settings let external HDFS clients reach DataNodes on an internal network -->
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>true</value>
</property>
</configuration>
Second: core-site.xml
<configuration>
<!-- NameNode address -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop100:8020</value>
</property>
<!-- Hadoop data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/module/hadoop-3.1.3/data</value>
</property>
<!-- Static user for the HDFS web UI -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
</configuration>
Third: log4j.properties
# Send DEBUG-level (and above) log messages to the console and file appenders defined below
log4j.rootLogger=DEBUG,console,file
# Console appender settings
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.Threshold=DEBUG
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[%c]-%m%n
# File appender settings
log4j.appender.file = org.apache.log4j.RollingFileAppender
log4j.appender.file.File=./log/logFile.log
log4j.appender.file.MaxFileSize=10mb
log4j.appender.file.Threshold=DEBUG
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=[%p][%d{yy-MM-dd}][%c]%m%n
# Per-logger output levels
log4j.logger.org.mybatis=DEBUG
log4j.logger.java.sql=DEBUG
log4j.logger.java.sql.Statement=DEBUG
log4j.logger.java.sql.ResultSet=DEBUG
log4j.logger.java.sql.PreparedStatement=DEBUG
Example
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamFromFile {
public static void main(String[] args) throws Exception {
// Name of the file owner; it can be found in the HDFS web UI
System.setProperty("HADOOP_USER_NAME","user");
// 9000 is the port used for internal file communication (NameNode RPC)
String inputHdfs = "hdfs://ip:9000/bigdata.txt";
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<String> data = env.readTextFile(inputHdfs);
SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = data.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
for (String word : s.split(" ")) {
collector.collect(Tuple2.of(word, 1));
}
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> result = wordAndOne.keyBy(0).sum(1);
result.print();
env.execute();
}
}
2) Socket-based
socketTextStream
Reads data from a socket; records are separated by a delimiter.
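socketTextStream also has an overload that takes the delimiter explicitly. A minimal sketch, assuming env is an already-created StreamExecutionEnvironment and the host and port are placeholders:
// Read text from a socket, splitting records on "\n" (which is also the default delimiter)
DataStreamSource<String> socketStream = env.socketTextStream("localhost", 7777, "\n");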
Requirement
Count how many times each word appears in the data received over the socket.
Sending data
netcat download: https://blog.csdn.net/qq_38762390/article/details/115789281
nc -lk 7777
Example
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class WordCountStream {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Read the streaming data from the socket
String hostname = "192.168.25.129";
int port = 7777;
DataStreamSource<String> dSource = env.socketTextStream(hostname, port);
SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = dSource.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public void flatMap(String in, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] words = in.split(" ");
for (String word : words) {
Tuple2<String, Integer> wordAndOne = new Tuple2<>(word, 1);
out.collect(wordAndOne);
}
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> result = wordAndOne.keyBy(0).sum(1);
result.print();
// A streaming job only runs once execute() is called
env.execute();
}
}
3) Collection-based
fromCollection(Collection)
fromElements
Creates a data stream from a Java Collection; all elements in the collection must be of the same type.
Flink recognizes a data type as a POJO type (and allows referring to its fields "by name") when all of the following hold (a quick check is sketched after this list):
- The class is public and standalone (no non-static inner class).
- The class has a public no-argument constructor.
- All non-static, non-transient fields in the class (and all superclasses) are either public (and non-final) or have public getters and setters that follow the Java bean naming conventions.
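A quick way to check whether Flink actually treats a class as a POJO is to inspect its TypeInformation; a minimal sketch using the People class from the example below (TypeInformation is in org.apache.flink.api.common.typeinfo, PojoTypeInfo in org.apache.flink.api.java.typeutils):
// TypeInformation.of(...) returns a PojoTypeInfo only when all of the rules above are satisfied
TypeInformation<People> info = TypeInformation.of(People.class);
System.out.println(info instanceof PojoTypeInfo); // true => recognized as a POJO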
Example
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.ArrayList;
public class StreamFromCollection {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// DataStreamSource<String> data = env.fromElements("spark", "flink");
ArrayList<People> peopleList = new ArrayList<People>();
peopleList.add(new People("lucas", 18));
peopleList.add(new People("jack", 30));
peopleList.add(new People("jack", 40));
DataStreamSource<People> data = env.fromCollection(peopleList);
// DataStreamSource<People> data = env.fromElements(new People("lucas", 18), new People("jack", 30), new People("jack", 40));
SingleOutputStreamOperator<People> filtered = data.filter(new FilterFunction<People>() {
public boolean filter(People people) throws Exception {
return people.age > 20;
}
});
filtered.print();
env.execute();
}
public static class People{
public String name;
public Integer age;
// Public no-argument constructor, required for Flink to recognize People as a POJO
public People() {
}
public People(String name, Integer age) {
this.name = name;
this.age = age;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Integer getAge() {
return age;
}
public void setAge(Integer age) {
this.age = age;
}
@Override
public String toString() {
return "People{" +
"name='" + name + '\'' +
", age=" + age +
'}';
}
}
}
4) Custom sources
addSource
Lets you read data from third-party data sources.
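A custom source is written by implementing SourceFunction (org.apache.flink.streaming.api.functions.source.SourceFunction) and handing an instance to addSource. A minimal sketch, with made-up records for illustration:
import org.apache.flink.streaming.api.functions.source.SourceFunction;
public class MySource implements SourceFunction<String> {
private volatile boolean running = true;
@Override
public void run(SourceContext<String> ctx) throws Exception {
// Emit one made-up record per second until cancel() is called
while (running) {
ctx.collect("hello " + System.currentTimeMillis());
Thread.sleep(1000);
}
}
@Override
public void cancel() {
running = false;
}
}
Usage: DataStreamSource<String> data = env.addSource(new MySource());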
Flink also ships with a set of built-in connectors; the main ones are listed below:
Connector | Provides Source support | Provides Sink support
---|---|---
Apache Kafka | yes | yes
ElasticSearch | no | yes
HDFS | no | yes
Twitter Streaming API | yes | no
Example
Kafka connector
a. Start Kafka
./kafka-server-start.sh -daemon ../config/server.properties
b. Create a topic
bin/kafka-topics.sh --create --zookeeper 192.168.25.129:2181 --replication-factor 1 --partitions 1 --topic test
c. Start a console Kafka producer
bin/kafka-console-producer.sh --broker-list 192.168.25.129:9092 --topic test
d. Dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>1.7.2</version>
</dependency>
e. Code:
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import java.util.Properties;
public class StreamFromKafka {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Properties properties = new Properties();
properties.setProperty("bootstrap.servers","192.168.25.129:9092");
// First argument: topic name
// Second argument: deserialization schema, so Flink knows how to deserialize the data read from Kafka
// Third argument: consumer configuration properties
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<String>("test", new SimpleStringSchema(), properties);
DataStreamSource<String> data = env.addSource(consumer);
SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = data.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
for (String word : s.split(" ")) {
collector.collect(Tuple2.of(word, 1));
}
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> result = wordAndOne.keyBy(0).sum(1);
result.print();
env.execute();
}
}
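The table above also lists sink support for Kafka. A minimal sketch of writing the stream back to Kafka with FlinkKafkaProducer (same package as FlinkKafkaConsumer); the "test-out" topic name is a made-up example, and the properties and data variables are reused from the code above:
// SimpleStringSchema also works as the serialization schema on the producer side
FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<String>("test-out", new SimpleStringSchema(), properties);
// Write every raw input line back to the (hypothetical) test-out topic
data.addSink(producer);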