big_data_example
This commit is contained in: parent 03d2c0b6cc, commit c91f81e123
@@ -0,0 +1,58 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.gmail</groupId>
    <artifactId>Collect</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

</project>
@@ -0,0 +1,77 @@
package com.atguigu.gmail.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

/**
 * First-tier collection filter.
 * Cleans out events whose data is incomplete.
 */
public class EtlLogInterceptor implements Interceptor {

    public void initialize() {

    }

    public Event intercept(Event event) {
        // 1. Extract the body
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // 2. Use Alibaba's fastjson to check whether the data is complete
        try {
            // Parse the body: if parsing succeeds, pass the event on;
            // if an exception is caught the data is incomplete, so return null
            JSON.parseObject(body);
        } catch (JSONException e) {
            return null;
        }
        return event;
    }

    public List<Event> intercept(List<Event> events) {

        // Call intercept(event) on every element and drop it when the result is null.
        // The version below is logically right, but you must not remove elements
        // from a collection while iterating it with a for-each loop:
        // for (Event event : events) {
        //     Event intercept = intercept(event);
        //     if (intercept == null) {
        //         events.remove(event);
        //     }
        // }
        // return events;

        // Instead, obtain an iterator and remove elements through the iterator
        Iterator<Event> iterator = events.iterator();
        while (iterator.hasNext()) {
            Event event = iterator.next();
            Event result = intercept(event);
            if (result == null) {
                iterator.remove();
            }
        }
        return events;
    }

    public void close() {

    }

    public static class MyBuilder implements Builder {

        @Override
        public Interceptor build() {
            return new EtlLogInterceptor();
        }

        @Override
        public void configure(Context context) {

        }
    }
}
@@ -0,0 +1,63 @@
package com.atguigu.gmail.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;

/**
 * Adds a header to every event.
 * The header carries the timestamp at which the record was collected;
 * that timestamp already exists inside the event's JSON body.
 */
public class TimeStampInterceptor implements Interceptor {
    @Override
    public void initialize() {

    }

    @Override
    public Event intercept(Event event) {
        // 1. Extract the body, a plain JSON string
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        // 2. Parse the JSON string into an object
        JSONObject jsonObject = JSON.parseObject(body);
        // 3. Read the timestamp field "ts" from the object
        String ts = jsonObject.getString("ts");
        // 4. Put the value of ts into the event's header
        event.getHeaders().put("timestamp", ts);

        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        // Apply intercept to every event
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() {

    }

    public static class MyBuilder implements Builder {
        public TimeStampInterceptor build() {

            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {

        }
    }
}
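A quick way to sanity-check both interceptors outside a running agent is to drive them directly from a main method. The sketch below is a hedged, assumption-laden harness and is not part of the commit: the class name and sample payloads are made up, and it assumes Flume's EventBuilder.withBody(String, Charset) helper from flume-ng-core. In a real agent the nested builders would instead be referenced from the agent configuration as com.atguigu.gmail.interceptor.EtlLogInterceptor$MyBuilder and com.atguigu.gmail.interceptor.TimeStampInterceptor$MyBuilder.

package com.atguigu.gmail.interceptor;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

// Hypothetical local harness for the two interceptors above (not part of this commit).
public class InterceptorSmokeTest {

    public static void main(String[] args) {
        List<Event> events = new ArrayList<>();
        // a complete JSON record containing the collection timestamp "ts"
        events.add(EventBuilder.withBody("{\"ts\":\"1640000000000\",\"page\":\"home\"}", StandardCharsets.UTF_8));
        // a truncated record that the ETL interceptor should drop
        events.add(EventBuilder.withBody("{\"ts\":\"1640000000001\",\"page\"", StandardCharsets.UTF_8));

        // first tier: drop events whose body is not valid JSON
        List<Event> cleaned = new EtlLogInterceptor().intercept(events);
        // second tier: copy the "ts" field into the "timestamp" header
        List<Event> stamped = new TimeStampInterceptor().intercept(cleaned);

        for (Event e : stamped) {
            System.out.println(e.getHeaders().get("timestamp") + " -> "
                    + new String(e.getBody(), StandardCharsets.UTF_8));
        }
    }
}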
@@ -0,0 +1,172 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.flink</groupId>
    <artifactId>Flink</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <flink.version>1.13.0</flink.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.12</scala.binary.version>
        <slf4j.version>1.7.30</slf4j.version>
    </properties>

    <dependencies>

        <!-- Flink version -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Scala library -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-cep_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.21</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-to-slf4j</artifactId>
            <version>2.14.0</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>

        </plugins>
    </build>

</project>
@@ -0,0 +1,120 @@
package day01.java;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Reads text from a socket and processes it.
 * WordCount.
 */
public class Example1 {

    // Remember to declare the thrown exception
    public static void main(String[] args) throws Exception {

        // TODO set up the environment
        // Get the stream-processing runtime environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the number of parallel tasks to 1
        env.setParallelism(1);

        // TODO build the job

        // Read the data source
        // Start `nc -lp 9999` in a terminal first
        DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);

        // 1. map step
        // flatMap is used here
        // map: emits exactly one element for every element in the stream
        // flatMap: emits zero, one or more elements for every element in the stream
        SingleOutputStreamOperator<WordWithCount> mappedStream = stream
                // input type: String; output type: WordWithCount
                .flatMap(new FlatMapFunction<String, WordWithCount>() {

                    @Override
                    public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                        String[] arr = value.split(" ");
                        // use collect to send data downstream
                        for (String e : arr) {
                            out.collect(new WordWithCount(e, 1L));
                        }
                    }
                });

        // 2. group by key: shuffle
        KeyedStream<WordWithCount, String> keyedStream = mappedStream
                // first type parameter: the element type of the stream
                // second type parameter: the key type
                .keyBy(new KeySelector<WordWithCount, String>() {
                    @Override
                    public String getKey(WordWithCount value) throws Exception {
                        return value.word;
                    }
                });

        // 3. reduce step
        // reduce maintains an accumulator
        // when the first element arrives, it becomes the accumulator and is emitted
        // when the next element arrives, it is merged with the accumulator, which is emitted again
        // the accumulator has the same type as the stream elements
        SingleOutputStreamOperator<WordWithCount> result = keyedStream
                .reduce(new ReduceFunction<WordWithCount>() {
                    @Override
                    public WordWithCount reduce(WordWithCount value1, WordWithCount value2) throws Exception {
                        return new WordWithCount(value1.word, value1.count + value2.count);
                    }
                });

        // print the result
        result.print();
        /*
        WordWithCount{word='hello', count=1}
        WordWithCount{word='world', count=1}
        WordWithCount{word='hello', count=2}
        WordWithCount{word='world', count=2}
        */

        // run the job
        env.execute();
    }

    // POJO class
    // 1. must be a public class
    // 2. all fields must be public
    // 3. must have a no-argument constructor
    // roughly plays the role of a Scala case class
    public static class WordWithCount {
        public String word;
        public Long count;

        public WordWithCount() {

        }

        public WordWithCount(String word, Long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
@@ -0,0 +1,122 @@
package day01.java;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Reads a bounded, offline data set and processes it.
 * WordCount.
 */
public class Example2 {

    // Remember to declare the thrown exception
    public static void main(String[] args) throws Exception {

        // TODO set up the environment
        // Get the stream-processing runtime environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the number of parallel tasks to 1,
        // so only one task slot is needed
        env.setParallelism(1);

        // TODO build the job

        // Read the data source
        // For the socket variant, start `nc -lp 9999` in a terminal first
        // DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
        DataStreamSource<String> stream = env.fromElements("hello world", "hello world");

        // 1. map step
        // flatMap is used here
        // map: emits exactly one element for every element in the stream
        // flatMap: emits zero, one or more elements for every element in the stream
        SingleOutputStreamOperator<WordWithCount> mappedStream = stream
                // input type: String; output type: WordWithCount
                .flatMap(new FlatMapFunction<String, WordWithCount>() {

                    @Override
                    public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                        String[] arr = value.split(" ");
                        // use collect to send data downstream
                        for (String e : arr) {
                            out.collect(new WordWithCount(e, 1L));
                        }
                    }
                });

        // 2. group by key: shuffle
        KeyedStream<WordWithCount, String> keyedStream = mappedStream
                // first type parameter: the element type of the stream
                // second type parameter: the key type
                .keyBy(new KeySelector<WordWithCount, String>() {
                    @Override
                    public String getKey(WordWithCount value) throws Exception {
                        return value.word;
                    }
                });

        // 3. reduce step
        // reduce maintains an accumulator
        // when the first element arrives, it becomes the accumulator and is emitted
        // when the next element arrives, it is merged with the accumulator, which is emitted again
        // the accumulator has the same type as the stream elements
        SingleOutputStreamOperator<WordWithCount> result = keyedStream
                .reduce(new ReduceFunction<WordWithCount>() {
                    @Override
                    public WordWithCount reduce(WordWithCount value1, WordWithCount value2) throws Exception {
                        return new WordWithCount(value1.word, value1.count + value2.count);
                    }
                });

        // print the result
        result.print();
        /*
        WordWithCount{word='hello', count=1}
        WordWithCount{word='world', count=1}
        WordWithCount{word='hello', count=2}
        WordWithCount{word='world', count=2}
        */

        // run the job
        env.execute();
    }

    // POJO class
    // 1. must be a public class
    // 2. all fields must be public
    // 3. must have a no-argument constructor
    // roughly plays the role of a Scala case class
    public static class WordWithCount {
        public String word;
        public Long count;

        public WordWithCount() {

        }

        public WordWithCount(String word, Long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
@@ -0,0 +1,97 @@
package day01.java;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Setting parallelism.
 * Parallelism set on an individual operator has higher priority than the global parallelism.
 * This job needs two task slots.
 *
 * Places where parallelism can be set:
 * 1. global parallelism
 * 2. per-operator parallelism
 * 3. the default parallelism in the configuration file
 * 4. on the command line when submitting the job (flink run -p 2)
 * Priority: 2 > 1 > 4 > 3
 */
public class Example3 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // global parallelism set to 1
        env.setParallelism(1);

        // source parallelism set to 1
        DataStreamSource<String> stream = env.fromElements("hello world", "hello world").setParallelism(1);

        // 1. map step
        // flatMap is used here
        // map: emits exactly one element for every element in the stream
        // flatMap: emits zero, one or more elements for every element in the stream
        // operator parallelism set to 2
        SingleOutputStreamOperator<Example2.WordWithCount> mappedStream = stream
                // input type: String; output type: WordWithCount
                .flatMap(new FlatMapFunction<String, Example2.WordWithCount>() {

                    @Override
                    public void flatMap(String value, Collector<Example2.WordWithCount> out) throws Exception {
                        String[] arr = value.split(" ");
                        // use collect to send data downstream
                        for (String e : arr) {
                            out.collect(new Example2.WordWithCount(e, 1L));
                        }
                    }
                }).setParallelism(2);

        // 2. group by key: shuffle
        KeyedStream<Example2.WordWithCount, String> keyedStream = mappedStream
                // first type parameter: the element type of the stream
                // second type parameter: the key type
                .keyBy(new KeySelector<Example2.WordWithCount, String>() {
                    @Override
                    public String getKey(Example2.WordWithCount value) throws Exception {
                        return value.word;
                    }
                });

        // 3. reduce step
        // reduce maintains an accumulator
        // when the first element arrives, it becomes the accumulator and is emitted
        // when the next element arrives, it is merged with the accumulator, which is emitted again
        // the accumulator has the same type as the stream elements
        // operator parallelism set to 2
        SingleOutputStreamOperator<Example2.WordWithCount> result = keyedStream
                .reduce(new ReduceFunction<Example2.WordWithCount>() {
                    @Override
                    public Example2.WordWithCount reduce(Example2.WordWithCount value1, Example2.WordWithCount value2) throws Exception {
                        return new Example2.WordWithCount(value1.word, value1.count + value2.count);
                    }
                }).setParallelism(2);

        // print the result with parallelism 1
        result.print().setParallelism(1);
        /*
        WordWithCount{word='hello', count=1}
        WordWithCount{word='world', count=1}
        WordWithCount{word='hello', count=2}
        WordWithCount{word='world', count=2}
        */

        // run the job
        env.execute();
    }
}
@@ -0,0 +1,38 @@
package day01.scala

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object Example_scala1 {

  def main(args: Array[String]): Unit = {

    // TODO set up the environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // TODO read the data source
    val stream = env.socketTextStream("localhost", 9999)

    // stream.flatMap(
    //   words => {
    //     val word = words.split(" ")
    //
    //     word.map(
    //       word => {
    //         WordWithCount(word, 1L)
    //       }
    //     )
    //
    //   }
    // )

  }

  case class WordWithCount(var word: String, var count: Long)

}
@@ -0,0 +1,99 @@
package day02;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;

/**
 * Custom data source.
 */
public class Example1 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        stream.print();

        env.execute();
    }

    // A SourceFunction can only run with parallelism 1;
    // for a parallel custom source, implement ParallelSourceFunction instead
    public static class ClickSource implements SourceFunction<Event> {

        private boolean running = true;
        private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
        private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
        private Random random = new Random();

        @Override
        public void run(SourceContext<Event> ctx) throws Exception {
            // send data downstream
            while (running) {
                // ctx is the source context;
                // its collect method sends data downstream
                ctx.collect(
                        new Event(
                                userArr[random.nextInt(userArr.length)],
                                urlArr[random.nextInt(urlArr.length)],
                                Calendar.getInstance().getTimeInMillis()
                        )
                );
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }

    /**
     * Custom POJO class.
     */
    public static class Event {
        public String user;
        public String url;
        public Long timestamp;

        public Event() {

        }

        public Event(String user, String url, Long timestamp) {
            this.user = user;
            this.url = url;
            this.timestamp = timestamp;
        }

        @Override
        public String toString() {
            return "Event{" +
                    "user='" + user + '\'' +
                    ", url='" + url + '\'' +
                    ", timestamp=" + new Timestamp(timestamp) +
                    '}';
        }
    }
}
@@ -0,0 +1,165 @@
package day02;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;

/**
 * MAP
 */
public class Example2 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // lambda expression
        env
                .addSource(new SourceFunction<Integer>() {

                    private boolean running = true;
                    private Random random = new Random();

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {

                        while (running) {
                            ctx.collect(random.nextInt(1000));
                            Thread.sleep(1000L);
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                })
                // with a lambda, Java 8 cannot infer the return type
                .map(r -> Tuple2.of(r, r))
                // the type would be erased to Tuple2<Object, Object>,
                // so returns() is needed to declare the map function's output type
                .returns(Types.TUPLE(Types.INT, Types.INT))
                .print();

        // anonymous inner class
        // env
        //         .addSource(new SourceFunction<Integer>() {
        //
        //             private boolean running = true;
        //             private Random random = new Random();
        //
        //             @Override
        //             public void run(SourceContext<Integer> ctx) throws Exception {
        //
        //                 while (running) {
        //                     ctx.collect(random.nextInt(1000));
        //                     Thread.sleep(1000L);
        //                 }
        //             }
        //
        //             @Override
        //             public void cancel() {
        //                 running = false;
        //             }
        //         })
        //         .map(new MapFunction<Integer, Tuple2<Integer, Integer>>() {
        //             @Override
        //             public Tuple2<Integer, Integer> map(Integer value) throws Exception {
        //                 return Tuple2.of(value, value);
        //             }
        //         }).print();

        // standalone (outer) class
        env
                .addSource(new SourceFunction<Integer>() {

                    private boolean running = true;
                    private Random random = new Random();

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {

                        while (running) {
                            ctx.collect(random.nextInt(1000));
                            Thread.sleep(1000L);
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                })
                .map(new MyMap()).print();

        // flatMap variant
        env
                .addSource(new SourceFunction<Integer>() {

                    private boolean running = true;
                    private Random random = new Random();

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {

                        while (running) {
                            ctx.collect(random.nextInt(1000));
                            Thread.sleep(1000L);
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                })
                .flatMap(new FlatMapFunction<Integer, Tuple2<Integer, Integer>>() {
                    @Override
                    public void flatMap(Integer value, Collector<Tuple2<Integer, Integer>> collector) throws Exception {
                        collector.collect(Tuple2.of(value, value));
                    }
                })
                .print();

        env.execute();
    }

    public static class MyMap implements MapFunction<Integer, Tuple2<Integer, Integer>> {

        @Override
        public Tuple2<Integer, Integer> map(Integer value) throws Exception {
            return Tuple2.of(value, value);
        }
    }
}
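The .returns(Types.TUPLE(...)) call above is one way to hand Flink back the type information that lambda erasure loses. As a hedged aside (not part of this commit, and assuming a DataStream<Integer> named source), the same hint can usually also be written with a TypeHint, which reads closer to the generic type itself:

// Sketch: equivalent type hint for the lambda map above.
source
        .map(r -> Tuple2.of(r, r))
        .returns(new TypeHint<Tuple2<Integer, Integer>>() {})   // org.apache.flink.api.common.typeinfo.TypeHint
        .print();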
@@ -0,0 +1,132 @@
package day02;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;

/**
 * FILTER
 */
public class Example3 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Example1.Event> stream = env.addSource(new ClickSource());

        // lambda expression
        stream.filter(r -> r.user.equals("Mary")).print();

        // anonymous inner class
        stream
                .filter(new FilterFunction<Example1.Event>() {
                    @Override
                    public boolean filter(Example1.Event value) throws Exception {
                        return value.user.equals("Mary");
                    }
                }).print();

        // standalone (outer) class
        stream
                .filter(new MyFilter())
                .print();

        // flatMap can express the same filter
        stream
                .flatMap(new FlatMapFunction<Example1.Event, Example1.Event>() {
                    @Override
                    public void flatMap(Example1.Event value, Collector<Example1.Event> collector) throws Exception {
                        if (value.user.equals("Mary")) collector.collect(value);
                    }
                }).print();

        env.execute();
    }

    public static class MyFilter implements FilterFunction<Example1.Event> {

        @Override
        public boolean filter(Example1.Event value) throws Exception {
            return value.user.equals("Mary");
        }
    }

    // A SourceFunction can only run with parallelism 1;
    // for a parallel custom source, implement ParallelSourceFunction instead
    public static class ClickSource implements SourceFunction<day02.Example1.Event> {

        private boolean running = true;
        private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
        private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
        private Random random = new Random();

        @Override
        public void run(SourceContext<day02.Example1.Event> ctx) throws Exception {
            // send data downstream
            while (running) {
                // ctx is the source context;
                // its collect method sends data downstream
                ctx.collect(
                        new day02.Example1.Event(
                                userArr[random.nextInt(userArr.length)],
                                urlArr[random.nextInt(urlArr.length)],
                                Calendar.getInstance().getTimeInMillis()
                        )
                );
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }

    /**
     * Custom POJO class (not used here; Example1.Event is used instead).
     */
    public static class Event {
        public String user;
        public String url;
        public Long timestamp;

        public Event() {

        }

        public Event(String user, String url, Long timestamp) {
            this.user = user;
            this.url = url;
            this.timestamp = timestamp;
        }

        @Override
        public String toString() {
            return "Event{" +
                    "user='" + user + '\'' +
                    ", url='" + url + '\'' +
                    ", timestamp=" + new Timestamp(timestamp) +
                    '}';
        }
    }
}
@@ -0,0 +1,53 @@
package day02;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * FlatMap
 */
public class Example4 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<String> stream = env.fromElements("white", "black", "gray");

        // anonymous inner class: emit "white" once, "black" twice, drop everything else
        stream
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public void flatMap(String value, Collector<String> collector) throws Exception {
                        if (value.equals("white")) {
                            collector.collect(value);
                        } else if (value.equals("black")) {
                            collector.collect(value);
                            collector.collect(value);
                        }
                    }
                })
                .print();

        // lambda version; returns() is needed because of type erasure
        stream.flatMap(
                (String value, Collector<String> collector) -> {
                    if (value.equals("white")) {
                        collector.collect(value);
                    } else if (value.equals("black")) {
                        collector.collect(value);
                        collector.collect(value);
                    }
                }
        )
                .returns(Types.STRING);

        env.execute();
    }
}
@@ -0,0 +1,45 @@
package day02;

import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.scala.DataStream;

/**
 * sum: rolling aggregation.
 */
public class Example5 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Tuple2<Integer, Integer>> stream = env
                .fromElements(
                        Tuple2.of(1, 2),
                        Tuple2.of(1, 3)
                );

        // keyed stream
        KeyedStream<Tuple2<Integer, Integer>, Integer> keyedStream = stream.keyBy(r -> r.f0);

        keyedStream.sum(1).print();

        // reduce is the generalization of the rolling aggregations above
        keyedStream.reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
                return Tuple2.of(value1.f0, value1.f1 + value2.f1);
            }
        })
                .print();

        env.execute();
    }
}
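The comment above calls reduce the generalization of the rolling aggregations. For context, this hedged sketch (not part of the commit) lists the other members of that family on the same keyed stream; each could be rewritten as a reduce in the same way sum(1) was:

// Sketch, assuming the same KeyedStream<Tuple2<Integer, Integer>, Integer> named keyedStream:
keyedStream.min(1).print();    // rolling minimum of field 1
keyedStream.max(1).print();    // rolling maximum of field 1
keyedStream.minBy(1).print();  // the whole element that currently has the smallest field 1
keyedStream.maxBy(1).print();  // the whole element that currently has the largest field 1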
@@ -0,0 +1,66 @@
package day02;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;

/**
 * Computes a running average of random integers.
 */
public class Example6 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env
                .addSource(new SourceFunction<Integer>() {

                    private boolean running = true;
                    private Random random = new Random();

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {
                        while (running) {
                            ctx.collect(random.nextInt(10));
                            Thread.sleep(100L);
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                })
                // pair every integer with a count of 1
                .map(r -> Tuple2.of(r, 1))
                .returns(Types.TUPLE(Types.INT, Types.INT))
                // send everything to the same key so one accumulator sees all elements
                .keyBy(r -> true)
                // accumulate (sum, count)
                .reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
                        return Tuple2.of(value1.f0 + value2.f0, value1.f1 + value2.f1);
                    }
                })
                // average = sum / count
                .map(new MapFunction<Tuple2<Integer, Integer>, Double>() {
                    @Override
                    public Double map(Tuple2<Integer, Integer> value) throws Exception {
                        return (double) value.f0 / value.f1;
                    }
                })
                .print();

        env.execute();
    }

}
@@ -0,0 +1,42 @@
package day02;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * shuffle
 */
public class Example7 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // random distribution to the downstream tasks
        env
                .fromElements(1, 2, 3, 4).setParallelism(1)
                .shuffle()
                .print("shuffle:").setParallelism(2);

        // even distribution: implemented as round-robin underneath
        env
                .fromElements(1, 2, 3, 4).setParallelism(1)
                .rebalance()
                .print("rebalance:").setParallelism(2);

        // broadcast: every partition receives a copy of each element
        env
                .fromElements(1, 2, 3, 4).setParallelism(1)
                .broadcast()
                .print("broadcast:").setParallelism(2);

        env.execute();
    }

}
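Beyond shuffle, rebalance and broadcast, the DataStream API offers a few more physical-partitioning choices. The lines below are a hedged sketch (not part of the commit) of two of them, written as if appended inside the same main method:

// global(): send every element to the first instance of the downstream operator
env
        .fromElements(1, 2, 3, 4).setParallelism(1)
        .global()
        .print("global:").setParallelism(2);

// rescale(): round-robin, but only among the downstream tasks reachable from each upstream task
env
        .fromElements(1, 2, 3, 4).setParallelism(1)
        .rescale()
        .print("rescale:").setParallelism(2);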
@@ -0,0 +1,51 @@
package day03;

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.operators.resettable.SpillingResettableIterator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Rich functions.
 */
public class Example1 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env
                .fromElements(1, 2, 3)
                .map(new RichMapFunction<Integer, Integer>() {

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        super.open(parameters);
                        System.out.println("lifecycle started");
                        System.out.println("index of this subtask: " + getRuntimeContext().getIndexOfThisSubtask());
                    }

                    @Override
                    public Integer map(Integer value) throws Exception {
                        return value * value;
                    }

                    @Override
                    public void close() throws Exception {
                        super.close();
                        System.out.println("lifecycle ended");
                    }
                })
                .print();

        env.execute();
    }

}
@@ -0,0 +1,59 @@
package day03;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class Example2 {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env
                .addSource(new RichParallelSourceFunction<Integer>() {

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        System.out.println("lifecycle started, subtask index: " + getRuntimeContext().getIndexOfThisSubtask());
                    }

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {
                        for (int i = 0; i < 10; i++) {
                            // each subtask only emits the numbers matching its own index
                            if (i % 2 == getRuntimeContext().getIndexOfThisSubtask()) {
                                ctx.collect(i);
                            }
                        }
                    }

                    @Override
                    public void cancel() {

                    }
                })
                .setParallelism(2)
                .print()
                .setParallelism(2);

        /*
        lifecycle started, subtask index: 0
        lifecycle started, subtask index: 1
        2> 1
        2> 3
        2> 5
        2> 7
        2> 9
        1> 0
        1> 2
        1> 4
        1> 6
        1> 8
        */

        env.execute();
    }
}
@@ -0,0 +1,31 @@
package day03;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;

/**
 * Custom sink.
 */
public class Example3 {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env.fromElements(1, 2, 3, 4)
                .addSink(new SinkFunction<Integer>() {

                    // invoke is called once for every record that arrives
                    @Override
                    public void invoke(Integer value, Context context) throws Exception {
                        SinkFunction.super.invoke(value, context);
                        System.out.println(value);
                    }
                });

        env.execute();
    }
}
@@ -0,0 +1,64 @@
package day03;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;

/**
 * A simple KeyedProcessFunction example.
 */
public class Example4 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env
                .socketTextStream("localhost", 9999)
                .keyBy(r -> 1)
                .process(new MyKeyed())
                .print();

        env.execute();
    }

    /**
     * Extends KeyedProcessFunction<KEY, IN, OUT>:
     * KEY: the constant key 1, an Integer
     * IN:  the lines arriving from the socket, String
     * OUT: the emitted records, String
     */
    public static class MyKeyed extends KeyedProcessFunction<Integer, String, String> {

        // called once for every arriving element
        @Override
        public void processElement(String value, Context ctx, Collector<String> out) throws Exception {

            // current machine (processing) time
            long ts = ctx.timerService().currentProcessingTime();
            out.collect("element " + value + " arrived at " + new Timestamp(ts));

            // register a timer for ten seconds from now
            long tenSecLater = ts + 10 * 1000L;
            out.collect("registered a timer for " + new Timestamp(tenSecLater));
            // note: this registers a processing-time (machine-time) timer
            ctx.timerService().registerProcessingTimeTimer(tenSecLater);
        }

        // called when a timer fires; it can also emit data downstream
        // timers are state: every key has its own timers,
        // and for a given key only one timer can exist per timestamp
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            super.onTimer(timestamp, ctx, out);
            out.collect("timer fired! trigger time: " + new Timestamp(timestamp));
        }
    }

}
@@ -0,0 +1,113 @@
package day03;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.util.Random;

/**
 * State variables.
 */
public class Example5 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env
                .addSource(new SourceFunction<Integer>() {
                    private boolean running = true;
                    private Random random = new Random();

                    @Override
                    public void run(SourceContext<Integer> ctx) throws Exception {

                        while (running) {
                            ctx.collect(random.nextInt(10));
                            Thread.sleep(1000);
                        }
                    }

                    @Override
                    public void cancel() {
                        running = false;
                    }
                })
                .keyBy(r -> true)
                .process(new KeyedProcessFunction<Boolean, Integer, Double>() {

                    // declare a state variable used as an accumulator
                    // the scope (visibility) of a state variable is the current key
                    // a state variable is a singleton: it is instantiated only once
                    private ValueState<Tuple2<Integer, Integer>> valueState;
                    // holds the timestamp of the registered timer
                    private ValueState<Long> timerTs;

                    // initialize the state variables
                    @Override
                    public void open(Configuration parameters) throws Exception {
                        super.open(parameters);
                        // instantiate the state variable
                        valueState = getRuntimeContext().getState(
                                // the state descriptor is what checkpoints use to locate this state
                                new ValueStateDescriptor<Tuple2<Integer, Integer>>("sum-count", Types.TUPLE(Types.INT, Types.INT))
                        );
                        timerTs = getRuntimeContext().getState(
                                new ValueStateDescriptor<Long>("timer", Types.LONG)
                        );
                    }

                    @Override
                    public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {

                        // when the first element arrives the state variable is still null
                        // read the state with .value() and write it with .update()
                        if (valueState.value() == null) {
                            valueState.update(Tuple2.of(value, 1));
                        } else {
                            Tuple2<Integer, Integer> tmp = valueState.value();
                            valueState.update(Tuple2.of(tmp.f0 + value, tmp.f1 + 1));
                        }

                        if (timerTs.value() == null) {
                            long tenSecLater = ctx.timerService().currentProcessingTime() + 10 * 1000L;
                            ctx.timerService().registerProcessingTimeTimer(tenSecLater);
                            timerTs.update(tenSecLater);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<Double> out) throws Exception {

                        if (valueState.value() != null) {
                            out.collect((double) valueState.value().f0 / valueState.value().f1);
                            timerTs.clear();
                        }
                    }
                })
                .print();

        env.execute();
    }

}
@@ -0,0 +1,105 @@
package day03;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.util.Random;

/**
 * Detects an integer value that keeps rising for one full second.
 */
public class Example6 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Integer> stream = env.addSource(new SourceFunction<Integer>() {
            private boolean running = true;
            private Random random = new Random();

            @Override
            public void run(SourceContext<Integer> ctx) throws Exception {

                while (running) {
                    ctx.collect(random.nextInt());
                    Thread.sleep(300L);
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        });

        stream
                .keyBy(r -> 1)
                .process(new KeyedProcessFunction<Integer, Integer, String>() {

                    // two state variables
                    private ValueState<Integer> lastInt; // the last integer seen
                    private ValueState<Long> timerTs;    // the registered timer's timestamp

                    @Override
                    public void open(Configuration parameters) throws Exception {

                        lastInt = getRuntimeContext().getState(
                                new ValueStateDescriptor<Integer>("last-integer", Types.INT)
                        );
                        timerTs = getRuntimeContext().getState(
                                new ValueStateDescriptor<Long>("timer", Types.LONG)
                        );
                    }

                    @Override
                    public void processElement(Integer value, Context ctx, Collector<String> out) throws Exception {
                        Integer prevInt = null;
                        if (lastInt.value() != null) {
                            prevInt = lastInt.value();
                        }
                        lastInt.update(value);

                        Long ts = null;
                        if (timerTs != null) {
                            ts = timerTs.value();
                        }

                        if (prevInt == null || value < prevInt) {
                            // the value dropped: cancel the pending processing-time timer, if any
                            if (ts != null) {
                                ctx.timerService().deleteProcessingTimeTimer(ts);
                                timerTs.clear();
                            }
                        } else if (value > prevInt && ts == null) {
                            // the value rose and no timer is pending: register one for one second later
                            long oneSecLater = ctx.timerService().currentProcessingTime() + 1000L;
                            ctx.timerService().registerProcessingTimeTimer(oneSecLater);
                            timerTs.update(oneSecLater);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {

                        out.collect("the integer has been rising for 1 second straight!");
                        timerTs.clear();
                    }
                })
                .print();

        env.execute();
    }
}
@@ -0,0 +1,79 @@
package day03;

import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.util.Random;

/**
 * Computes an average using a list state variable.
 */
public class Example7 {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Integer> stream = env.addSource(new SourceFunction<Integer>() {
            private boolean running = true;
            private Random random = new Random();

            @Override
            public void run(SourceContext<Integer> ctx) throws Exception {

                while (running) {
                    ctx.collect(random.nextInt(10));
                    Thread.sleep(300L);
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        });

        stream
                .keyBy(r -> 1)
                .process(new KeyedProcessFunction<Integer, Integer, Double>() {

                    // a list state keeps every value it has ever been given,
                    // so it can use a lot of memory
                    private ListState<Integer> listState;

                    @Override
                    public void open(Configuration parameters) throws Exception {

                        listState = getRuntimeContext().getListState(
                                new ListStateDescriptor<Integer>("list-state", Types.INT)
                        );
                    }

                    @Override
                    public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {
                        listState.add(value);
                        Integer sum = 0;
                        Integer count = 0;
                        for (Integer i : listState.get()) {
                            sum += i;
                            count += 1;
                        }
                        out.collect((double) sum / count);
                    }
                })
                .print();

        env.execute();
    }
}
@@ -0,0 +1,131 @@
package day03;

import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;

/**
 * Map state variable.
 */
public class Example8 {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        stream
                .keyBy(r -> 1)
                .process(new KeyedProcessFunction<Integer, Event, String>() {
                    private MapState<String, Long> mapState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        mapState = getRuntimeContext().getMapState(
                                new MapStateDescriptor<String, Long>("map", Types.STRING, Types.LONG)
                        );
                    }

                    @Override
                    public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {

                        // per-user page-view count
                        if (mapState.contains(value.user)) {
                            mapState.put(value.user, mapState.get(value.user) + 1L);
                        } else {
                            mapState.put(value.user, 1L);
                        }
                        // compute the average pv across users
                        long userNum = 0L;
                        long pvSum = 0L;
                        for (String user : mapState.keys()) {
                            userNum += 1L;
                            pvSum += mapState.get(user);
                        }
                        out.collect("current average pv is " + (double) pvSum / userNum);
                    }
                })
                .print();

        env.execute();
    }

    // A SourceFunction can only run with parallelism 1;
    // for a parallel custom source, implement ParallelSourceFunction instead
    public static class ClickSource implements SourceFunction<Event> {

        private boolean running = true;
        private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
        private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
        private Random random = new Random();

        @Override
        public void run(SourceContext<Event> ctx) throws Exception {
            // send data downstream
            while (running) {
                // ctx is the source context;
                // its collect method sends data downstream
                ctx.collect(
                        new Event(
                                userArr[random.nextInt(userArr.length)],
                                urlArr[random.nextInt(urlArr.length)],
                                Calendar.getInstance().getTimeInMillis()
                        )
                );
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }

    /**
     * Custom POJO class.
     */
    public static class Event {
        public String user;
        public String url;
        public Long timestamp;

        public Event() {

        }

        public Event(String user, String url, Long timestamp) {
            this.user = user;
            this.url = url;
            this.timestamp = timestamp;
        }

        @Override
        public String toString() {
            return "Event{" +
                    "user='" + user + '\'' +
                    ", url='" + url + '\'' +
                    ", timestamp=" + new Timestamp(timestamp) +
                    '}';
        }
    }
}
@ -0,0 +1,127 @@
|
|||
package day03;
|
||||
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Compute each user's pv within 5-second tumbling windows
|
||||
*/
|
||||
public class Example9 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Event> stream = env.addSource(new ClickSource());
|
||||
|
||||
stream
|
||||
.keyBy(r -> r.user)
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
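//processing-time tumbling windows of 5 s; window boundaries are aligned to the epoch, not to the first element of a key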
|
||||
.process(new WindowResult())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 继承ProcessWindowFunction<IN,OUT,KEY,window>
|
||||
*/
|
||||
public static class WindowResult extends ProcessWindowFunction<Event, String, String, TimeWindow> {
|
||||
|
||||
|
||||
//invoked when the window fires (closes)
//note: this approach buffers every element of the window, which is slow; the incremental-accumulator approach is the usual optimization
|
||||
@Override
|
||||
public void process(String key, Context context, Iterable<Event> iterable, Collector<String> collector) throws Exception {
|
||||
//迭代器参数中包含了窗口中所有的元素
|
||||
long windowStart =context.window().getStart();
|
||||
long windowEnd = context.window().getEnd();
|
||||
long count = iterable.spliterator().getExactSizeIfKnown(); //迭代器里面共多少条元素
|
||||
collector.collect("用户:"+key+"在窗口"
|
||||
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
|
||||
+""+"中的pv次数是:"+count);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,115 @@
|
|||
package day03.selftry;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* 状态变量
|
||||
*/
|
||||
public class Example5_try {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.addSource(new SourceFunction<Integer>() {
|
||||
private boolean running = true;
|
||||
private Random random =new Random();
|
||||
@Override
|
||||
public void run(SourceContext<Integer> ctx) throws Exception {
|
||||
|
||||
while(running){
|
||||
|
||||
ctx.collect(random.nextInt(10));
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running =false;
|
||||
|
||||
}
|
||||
})
|
||||
.keyBy( r -> true)
|
||||
.process(new KeyedProcessFunction<Boolean, Integer, Double>() {
|
||||
|
||||
//声明一个状态变量作为累加器
|
||||
//状态变量的可见范围(作用域)的当前key
|
||||
//状态变量是单例,只能被实例化一次
|
||||
private ValueState<Tuple2<Integer,Integer>> valueState;
|
||||
//保存定时器的时间戳
|
||||
private ValueState<Long> timerTs;
|
||||
|
||||
//初始化状态变量
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
super.open(parameters);
|
||||
//实例化状态变量
|
||||
valueState = getRuntimeContext().getState(
|
||||
|
||||
|
||||
//在checkpoint中用状态描述符去找他
|
||||
//ValueStateDescriptor状态描述符
|
||||
new ValueStateDescriptor<Tuple2<Integer, Integer>>("sum-count", Types.TUPLE(Types.INT,Types.INT))
|
||||
);
|
||||
timerTs=getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Long>("timer",Types.LONG)
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {
|
||||
|
||||
//当第一条数据到来时,状态变量的值为null
|
||||
//使用.value()方法读取状态变量的值,使用.update()方法更新状态变量的值
|
||||
if(valueState.value() == null){
|
||||
valueState.update(Tuple2.of(value,1));
|
||||
}else {
|
||||
Tuple2<Integer, Integer> tmp = valueState.value();
|
||||
valueState.update(Tuple2.of(tmp.f0+value,tmp.f1+1));
|
||||
}
|
||||
|
||||
|
||||
if(timerTs.value() == null){
|
||||
long tenSecLater =ctx.timerService().currentProcessingTime()+10*1000L;
|
||||
ctx.timerService().registerProcessingTimeTimer(tenSecLater);
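//remember the pending timer so only one timer is registered at a time for this key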
|
||||
timerTs.update(tenSecLater);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<Double> out) throws Exception {
|
||||
|
||||
if(valueState.value() != null){
|
||||
out.collect((double)valueState.value().f0/valueState.value().f1);
|
||||
timerTs.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,154 @@
|
|||
package day03.selftry;
|
||||
|
||||
import day03.Example9;
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Time;
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Compute each user's pv within 5-second windows using a KeyedProcessFunction and timers
* NOTE: this version still has known issues
|
||||
*/
|
||||
public class Example9_try {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Event> stream = env.addSource(new ClickSource());
|
||||
|
||||
stream
|
||||
.keyBy(r -> r.user)
|
||||
.process(new KeyedProcessFunction<String, Event, String>() {
|
||||
|
||||
private ValueState<Tuple2<String, Integer>> count;
|
||||
private ValueState<Long> timeTs;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
count = getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Tuple2<String, Integer>>("count", Types.TUPLE(Types.STRING, Types.INT)));
|
||||
timeTs = getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Long>("timer", Types.LONG)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(Event result, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if (count.value() == null) {
|
||||
count.update(Tuple2.of(result.user, 1));
|
||||
} else {
|
||||
Tuple2<String, Integer> value = count.value();
|
||||
count.update(Tuple2.of(result.user, value.f1 + 1));
|
||||
}
|
||||
if (timeTs.value() == null) {
|
||||
long fiveSecLater = ctx.timerService().currentProcessingTime() + 5 * 1000L - 1L;
ctx.timerService().registerProcessingTimeTimer(fiveSecLater);
timeTs.update(fiveSecLater);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
if (count.value() != null) {
|
||||
out.collect("用户" + count.value().f0 + "在" + new Timestamp(timeTs.value()) + "到" + new Timestamp(timeTs.value()+ 5 * 1000L) + "的PV次数为:" + count.value().f1);
|
||||
timeTs.clear();
|
||||
count.clear();
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,146 @@
|
|||
package day04;
|
||||
|
||||
|
||||
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* 增量聚合函数
|
||||
* 实现每个用户每5秒钟窗口的pv
|
||||
* 但是无法获取窗口信息
|
||||
*/
|
||||
public class Example1 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Event> stream = env.addSource(new ClickSource());
|
||||
|
||||
stream
|
||||
.keyBy(r -> r.user)
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
|
||||
.aggregate(new CountAgg())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 实现AggregateFunction<IN,累加器,out>接口
|
||||
*/
|
||||
public static class CountAgg implements AggregateFunction<Event,Integer,Integer> {
|
||||
|
||||
|
||||
//创建累加器
|
||||
@Override
|
||||
public Integer createAccumulator() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//定义累加规则
|
||||
@Override
|
||||
public Integer add(Event event, Integer accumulator) {
|
||||
return accumulator+1;
|
||||
}
|
||||
|
||||
//在窗口关闭时返回结果
|
||||
@Override
|
||||
public Integer getResult(Integer accumulator) {
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
//merge is only called for merging windows (e.g. session windows); it is unused for tumbling windows, so no real implementation is provided here
|
||||
@Override
|
||||
public Integer merge(Integer integer, Integer acc1) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,156 @@
|
|||
package day04;
|
||||
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* 将增量窗口函数和全量窗口函数结合在一起使用
|
||||
* 每个用户5秒窗口的pv
|
||||
*/
|
||||
public class Example2 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Event> stream = env.addSource(new ClickSource());
|
||||
|
||||
stream
|
||||
.keyBy(r -> r.user)
|
||||
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
|
||||
//给增量聚合函数包裹一层窗口信息
|
||||
.aggregate(new CountAgg(),new WindowResult())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
//输入的泛型是增量聚合函数的输出的类型
|
||||
public static class WindowResult extends ProcessWindowFunction<Integer,String,String, TimeWindow>{
|
||||
|
||||
@Override
|
||||
public void process(String key, Context context, Iterable<Integer> iterable, Collector<String> collector) throws Exception {
|
||||
//窗口关闭时触发调用
|
||||
//迭代器参数中只包含了一个元素,就是增量聚合函数发送过来的聚合结果
|
||||
long windowStart =context.window().getStart();
|
||||
long windowEnd = context.window().getEnd();
|
||||
long count = iterable.iterator().next(); //take the single element emitted by the incremental aggregate function
|
||||
collector.collect("用户:"+key+"在窗口"
|
||||
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
|
||||
+""+"中的pv次数是:"+count);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 实现AggregateFunction<IN,累加器,out>接口
|
||||
*/
|
||||
public static class CountAgg implements AggregateFunction<Event,Integer,Integer> {
|
||||
|
||||
|
||||
//创建累加器
|
||||
@Override
|
||||
public Integer createAccumulator() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//定义累加规则
|
||||
@Override
|
||||
public Integer add(Event event, Integer accumulator) {
|
||||
return accumulator+1;
|
||||
}
|
||||
|
||||
//在窗口关闭时返回结果
|
||||
@Override
|
||||
public Integer getResult(Integer accumulator) {
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
//merge is only called for merging windows (e.g. session windows); it is unused for tumbling windows, so no real implementation is provided here
|
||||
@Override
|
||||
public Integer merge(Integer integer, Integer acc1) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,168 @@
|
|||
package day04;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.state.MapState;
|
||||
import org.apache.flink.api.common.state.MapStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Use a KeyedProcessFunction to emulate a 5-second tumbling window, mimicking the combination of an incremental aggregate function with a full-window function
|
||||
*/
|
||||
public class Example3 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.addSource(new ClickSource())
|
||||
.keyBy(r ->r.user)
|
||||
.process( new FakeWindow())
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class FakeWindow extends KeyedProcessFunction<String,Event,String>{
|
||||
//用map模拟窗口
|
||||
//key是窗口的开始时间,value是窗口中的pv数值(累加器)
|
||||
private MapState<Long,Integer> mapState;
|
||||
//窗口大小
|
||||
private Long windowSize = 5000L;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
mapState = getRuntimeContext().getMapState(
|
||||
new MapStateDescriptor<Long, Integer>("windowStart-pvCount",Types.LONG, Types.INT)
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(Event event, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
//计算当前元素所属的窗口的开始时间
|
||||
long currTime =ctx.timerService().currentProcessingTime();
|
||||
//计算窗口开始时间的公式
|
||||
long windowStart = currTime - currTime % windowSize;
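//e.g. with windowSize = 5000 ms, an element arriving at t = 12003 ms falls into the window [10000, 15000)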
|
||||
long windowEnd = windowStart +windowSize;
|
||||
|
||||
if(mapState.contains(windowStart)){
|
||||
//之前已经来过数据了
|
||||
mapState.put(windowStart,mapState.get(windowStart)+1);
|
||||
}else {
|
||||
//之前没有来过元素
|
||||
mapState.put(windowStart,1);
|
||||
}
|
||||
|
||||
//注册一个定时器
|
||||
ctx.timerService().registerProcessingTimeTimer(windowEnd-1L);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
//timestamp就是触发这个方法的时间,即windowEnd-1L
|
||||
long windowEnd = timestamp +1L;
|
||||
long windowStart = windowEnd - windowSize;
|
||||
int count = mapState.get(windowStart);
|
||||
out.collect("用户:"+ctx.getCurrentKey()+"在窗口"
|
||||
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
|
||||
+""+"中的pv次数是:"+count);
|
||||
mapState.remove(windowStart);
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,76 @@
|
|||
package day04;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 水位线测试
|
||||
*/
|
||||
public class Example4 {
|
||||
|
||||
public static void main(String[] args) throws Exception{
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
// 'a 1'
|
||||
.socketTextStream("localhost",9999)
|
||||
// (a , 1000L)
|
||||
.map(new MapFunction<String, Tuple2<String,Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String value) throws Exception {
|
||||
String[] arr = value.split(" ");
|
||||
return Tuple2.of(arr[0],Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
//抽取时间戳,分配水位线
|
||||
//默认每隔200ms的机器时间,插入一次水位线
|
||||
.assignTimestampsAndWatermarks(
|
||||
//maximum out-of-orderness (bounded delay) set to 5 seconds
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
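//the periodically emitted watermark is (max event timestamp seen so far) - 5000 ms - 1 ms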
|
||||
//时间戳字段
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
|
||||
return element.f1; //告诉flink事件时间是哪一个字段
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r ->r.f0)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5))) //5秒的时间滚动窗口
|
||||
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
|
||||
@Override
|
||||
public void process(String key, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
|
||||
long windowStart =context.window().getStart();
|
||||
long windowEnd = context.window().getEnd();
|
||||
long count = iterable.spliterator().getExactSizeIfKnown(); //迭代器里面共多少条元素
|
||||
collector.collect("用户:"+key+"在窗口"
|
||||
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
|
||||
+""+"中的pv次数是:"+count);
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,66 @@
|
|||
package day04;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 水位线测试
|
||||
*/
|
||||
public class Example5 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.socketTextStream("localhost",9999)
|
||||
.map(r -> Tuple2.of(r.split(" ")[0],Long.parseLong(r.split(" ")[1])*1000L))
|
||||
.returns(Types.TUPLE(Types.STRING,Types.LONG))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r -> r.f0)
|
||||
.process(new KeyedProcessFunction<String, Tuple2<String, Long>, String>() {
|
||||
@Override
|
||||
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
collector.collect("当前的水位线是:"+ctx.timerService().currentWatermark());
|
||||
ctx.timerService().registerEventTimeTimer(value.f1+5000L);
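//this event-time timer fires once the watermark reaches value.f1 + 5000 ms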
|
||||
collector.collect("注册了一个时间戳是:"+new Timestamp(value.f1+5000L)+"的定时器");
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
out.collect("定时器触发了!");
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,79 @@
|
|||
package day04;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 水位线测试
|
||||
*/
|
||||
public class Example6 {
|
||||
|
||||
public static void main(String[] args) throws Exception{
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
//emit a watermark once every minute instead of the 200 ms default
|
||||
env.getConfig().setAutoWatermarkInterval(60*1000L);
|
||||
|
||||
env
|
||||
// 'a 1'
|
||||
.socketTextStream("localhost",9999)
|
||||
// (a , 1000L)
|
||||
.map(new MapFunction<String, Tuple2<String,Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String value) throws Exception {
|
||||
String[] arr = value.split(" ");
|
||||
return Tuple2.of(arr[0],Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
//抽取时间戳,分配水位线
|
||||
//默认每隔200ms的机器时间,插入一次水位线
|
||||
.assignTimestampsAndWatermarks(
|
||||
//maximum out-of-orderness set to 0 seconds here, i.e. no out-of-order tolerance
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
|
||||
//时间戳字段
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
|
||||
return element.f1; //告诉flink事件时间是哪一个字段
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r ->r.f0)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5))) //5秒的时间滚动窗口
|
||||
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
|
||||
@Override
|
||||
public void process(String key, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
|
||||
long windowStart =context.window().getStart();
|
||||
long windowEnd = context.window().getEnd();
|
||||
long count = iterable.spliterator().getExactSizeIfKnown(); //迭代器里面共多少条元素
|
||||
collector.collect("用户:"+key+"在窗口"
|
||||
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
|
||||
+""+"中的pv次数是:"+count);
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,240 @@
|
|||
package day04;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.common.state.ListState;
|
||||
import org.apache.flink.api.common.state.ListStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* 每个窗口中最热门的商品是什么
|
||||
*/
|
||||
public class Example7 {
|
||||
|
||||
public static void main(String[] args) throws Exception{
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
|
||||
.map(new MapFunction<String, UserBehavior>() {
|
||||
@Override
|
||||
public UserBehavior map(String value) throws Exception {
|
||||
|
||||
String[] arr = value.split(",");
|
||||
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
|
||||
|
||||
}
|
||||
})
|
||||
.filter(r -> r.behavior.equals("pv"))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
|
||||
@Override
|
||||
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
|
||||
return element.timeStamp;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy( r -> r.itemId)
|
||||
.window(SlidingEventTimeWindows.of(Time.hours(1),Time.minutes(5)))
|
||||
.aggregate(new CountAgg(),new WindowResult())
|
||||
.keyBy(r ->r.windowEnd)
|
||||
.process(new TopN(3))
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 一段时间内的TopN商品排序函数
|
||||
*/
|
||||
public static class TopN extends KeyedProcessFunction<Long,ItemViewCount,String>{
|
||||
|
||||
private ListState<ItemViewCount> listState;
|
||||
private Integer n;
|
||||
|
||||
public TopN(Integer n) {
|
||||
this.n = n;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
listState = getRuntimeContext().getListState(
|
||||
new ListStateDescriptor<ItemViewCount>("list-state", Types.POJO(ItemViewCount.class)));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(ItemViewCount value, Context ctx, Collector<String> collector) throws Exception {
|
||||
listState.add(value);
|
||||
ctx.timerService().registerEventTimeTimer(value.windowEnd+100L);
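//fire 100 ms after the window end, by which time every ItemViewCount result for this window has been received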
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
ArrayList<ItemViewCount> itemViewCountArrayList =new ArrayList<>();
|
||||
for (ItemViewCount ivc : listState.get()) {
|
||||
itemViewCountArrayList.add(ivc);
|
||||
}
|
||||
listState.clear();
|
||||
|
||||
itemViewCountArrayList.sort(new Comparator<ItemViewCount>() {
|
||||
@Override
|
||||
public int compare(ItemViewCount t1, ItemViewCount t2) {
|
||||
return t2.count.intValue() -t1.count.intValue();
|
||||
|
||||
}
|
||||
});
|
||||
StringBuilder result = new StringBuilder();
|
||||
result
|
||||
.append("=====================================\n")
|
||||
.append("窗口结束时间:"+new Timestamp(timestamp-1L))
|
||||
.append("\n");
|
||||
for (int i = 0; i < Math.min(n, itemViewCountArrayList.size()); i++) {
|
||||
|
||||
ItemViewCount curr = itemViewCountArrayList.get(i);
|
||||
result
|
||||
.append("第"+(i+1)+"名的商品id是:"+curr.itemId)
|
||||
.append(",浏览次数是:"+curr.count)
|
||||
.append("\n");
|
||||
|
||||
}
|
||||
result
|
||||
.append("=====================================\n\n");
|
||||
out.collect(result.toString());
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 全量聚合函数
|
||||
*/
|
||||
public static class WindowResult extends ProcessWindowFunction<Long,ItemViewCount,String, TimeWindow>{
|
||||
|
||||
@Override
|
||||
public void process(String key, Context context, Iterable<Long> elements, Collector<ItemViewCount> collector) throws Exception {
|
||||
collector.collect(new ItemViewCount(key,elements.iterator().next(),context.window().getStart(),context.window().getEnd()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 增量聚合函数
|
||||
*/
|
||||
public static class CountAgg implements AggregateFunction<UserBehavior,Long,Long>{
|
||||
|
||||
@Override
|
||||
public Long createAccumulator() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long add(UserBehavior value, Long accumulator) {
|
||||
return accumulator+1L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getResult(Long accumulator) {
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
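//merge is only needed for merging (session) windows; it is never called for the sliding windows used here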
@Override
|
||||
public Long merge(Long aLong, Long acc1) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 商品视图POJO类
|
||||
* 每个商品在每个窗口中的浏览次数
|
||||
*/
|
||||
public static class ItemViewCount{
|
||||
|
||||
public String itemId;
|
||||
public Long count;
|
||||
public Long windowStart;
|
||||
public Long windowEnd;
|
||||
|
||||
public ItemViewCount() {
|
||||
}
|
||||
|
||||
public ItemViewCount(String itemId, Long count, Long windowStart, Long windowEnd) {
|
||||
this.itemId = itemId;
|
||||
this.count = count;
|
||||
this.windowStart = windowStart;
|
||||
this.windowEnd = windowEnd;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ItemViewCount{" +
|
||||
"itemId='" + itemId + '\'' +
|
||||
", count=" + count +
|
||||
", windowStart=" + new Timestamp(windowStart) +
|
||||
", windowEnd=" + new Timestamp(windowEnd) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 用户行为POJO类
|
||||
*/
|
||||
public static class UserBehavior{
|
||||
public String userId;
|
||||
public String itemId;
|
||||
public String categoryId;
|
||||
public String behavior;
|
||||
public Long timeStamp;
|
||||
|
||||
public UserBehavior(){
|
||||
|
||||
}
|
||||
|
||||
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
|
||||
this.userId = userId;
|
||||
this.itemId = itemId;
|
||||
this.categoryId = categoryId;
|
||||
this.behavior = behavior;
|
||||
this.timeStamp = timeStamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UserBehavior{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", itemId='" + itemId + '\'' +
|
||||
", categoryId='" + categoryId + '\'' +
|
||||
", behavior='" + behavior + '\'' +
|
||||
", timeStamp=" + new Timestamp(timeStamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,147 @@
|
|||
package day04.selftry;
|
||||
|
||||
import day04.Example3;
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* 不用mapState实现增量和全量窗口结合
|
||||
*/
|
||||
public class Example3_try {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.addSource(new ClickSource())
|
||||
.keyBy(r -> r.user)
|
||||
.process(new KeyedProcessFunction<String, Event, String>() {
|
||||
|
||||
private ValueState<Integer> valueState;
|
||||
private Long windowSize = 5000L;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
valueState = getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Integer>("count", Types.INT)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(Event event, Context context, Collector<String> collector) throws Exception {
|
||||
|
||||
//获取现在的时间戳及其窗口时间
|
||||
long currTime = context.timerService().currentProcessingTime();
|
||||
long startWindow = currTime - currTime % windowSize;
|
||||
long endWindow = startWindow+windowSize;
|
||||
if(valueState.value() == null){
|
||||
valueState.update(1);
|
||||
}else {
|
||||
valueState.update(valueState.value()+1);
|
||||
}
|
||||
//注册定时器
|
||||
context.timerService().registerProcessingTimeTimer(endWindow-1L);
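//registering the same timer timestamp repeatedly is deduplicated, so at most one timer exists per emulated window and key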
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
long startWindow = timestamp - windowSize + 1L;
|
||||
long endWindow = timestamp + 1L;
|
||||
Integer count = valueState.value();
|
||||
out.collect("用户:"+ctx.getCurrentKey()+"窗口"+new Timestamp(startWindow)+"~"+new Timestamp(endWindow)
|
||||
+"中的pv次数是"+count);
|
||||
valueState.clear();
|
||||
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
|
||||
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while (running) {
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event {
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,74 @@
|
|||
package day05;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkGeneratorSupplier;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.ProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
|
||||
/**
|
||||
* 什么是迟到元素
|
||||
*/
|
||||
public class Example1 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
|
||||
|
||||
stream
|
||||
.map(new MapFunction<String, Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps() //最大延迟时间是0
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long l) {
|
||||
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
|
||||
)
|
||||
.process(
|
||||
new ProcessFunction<Tuple2<String, Long>, String>() {
|
||||
@Override
|
||||
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
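//an element is late if its timestamp is below the operator's current watermark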
if (value.f1 < ctx.timerService().currentWatermark()) {
|
||||
collector.collect("迟到元素迟到了:" + value);
|
||||
} else {
|
||||
collector.collect(value + "元素没有迟到");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
)
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,71 @@
|
|||
package day05;
|
||||
|
||||
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.ProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.watermark.Watermark;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.apache.flink.util.OutputTag;
|
||||
|
||||
/**
|
||||
* 迟到数据发送到侧输出流中去
|
||||
* 重定向到侧输出流
|
||||
*/
|
||||
public class Example2 {
|
||||
|
||||
//定义侧输出流的名字:侧输出标签
|
||||
private static OutputTag<String> lateElement = new OutputTag<String>("late-element"){};
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<String> result = env
|
||||
//自定义一个数据源
|
||||
.addSource(new SourceFunction<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public void run(SourceContext<Tuple2<String, Long>> ctx) throws Exception {
|
||||
//指定时间戳发送数据
|
||||
ctx.collectWithTimestamp(Tuple2.of("hello world", 1000L), 1000L);
|
||||
//发送水位线
|
||||
ctx.emitWatermark(new Watermark(999L));
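//a watermark of 999 ms asserts that no element with timestamp <= 999 ms will arrive afterwards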
|
||||
|
||||
ctx.collectWithTimestamp(Tuple2.of("hello flink", 2000L), 2000L);
|
||||
ctx.emitWatermark(new Watermark(1999L));
|
||||
|
||||
ctx.collectWithTimestamp(Tuple2.of("hello late", 1000L), 1000L);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
|
||||
}
|
||||
})
|
||||
.process(new ProcessFunction<Tuple2<String, Long>, String>() {
|
||||
@Override
|
||||
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if (value.f1 < ctx.timerService().currentWatermark()) {
|
||||
//发送到侧输出流
|
||||
ctx.output(lateElement, "迟到元素发送到侧输出流" + value);
|
||||
} else {
|
||||
collector.collect("正常到达的元素:" + value);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
result.print("主流:");
|
||||
|
||||
//打印侧输出流
|
||||
result.getSideOutput(lateElement).print("侧输出流:");
|
||||
|
||||
|
||||
env.execute();
|
||||
}
|
||||
}
|
||||
|
|
@@ -0,0 +1,77 @@
|
|||
package day05;
|
||||
|
||||
|
||||
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.watermark.Watermark;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.apache.flink.util.OutputTag;
|
||||
|
||||
|
||||
/**
|
||||
* 开了窗口之后,如何把迟到元素发送到侧输出流
|
||||
*/
|
||||
public class Example3 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<String> result = env
|
||||
.addSource(new SourceFunction<String>() {
|
||||
@Override
|
||||
public void run(SourceContext<String> ctx) throws Exception {
|
||||
|
||||
ctx.collectWithTimestamp("a", 1000L);
|
||||
ctx.emitWatermark(new Watermark(999L));
|
||||
|
||||
ctx.collectWithTimestamp("a", 2000L);
|
||||
ctx.emitWatermark(new Watermark(1999L));
|
||||
|
||||
ctx.collectWithTimestamp("a", 4000L);
|
||||
ctx.emitWatermark(new Watermark(4999L));
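//watermark 4999 reaches the end of the [0, 5000) event-time window, so that window fires and is purged; the following element with timestamp 3000 is therefore late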
|
||||
|
||||
//0-5秒数据窗口已关闭
|
||||
ctx.collectWithTimestamp("a", 3000L); //迟到元素
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
|
||||
}
|
||||
})
|
||||
.keyBy(r -> 1)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
|
||||
//redirect late elements whose window has already fired and been purged to the side output
|
||||
.sideOutputLateData(new OutputTag<String>("late") {
|
||||
})
|
||||
.process(new ProcessWindowFunction<String, String, Integer, TimeWindow>() {
|
||||
@Override
|
||||
public void process(Integer integer, Context context, Iterable<String> element, Collector<String> collector) throws Exception {
|
||||
collector.collect("窗口中共有:" + element.spliterator().getExactSizeIfKnown() + "条数据");
|
||||
}
|
||||
});
|
||||
|
||||
result.print("主输出流:");
|
||||
|
||||
//侧输出标签通过 id 保证是单例模式
|
||||
result.getSideOutput(new OutputTag<String>("late"){}).print("侧输出流:");
|
||||
|
||||
/*
|
||||
主输出流:> 窗口中共有:3条数据
|
||||
侧输出流:> a
|
||||
*/
|
||||
env.execute();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,90 @@
|
|||
package day05;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.apache.flink.util.OutputTag;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 使用迟到数据更新窗口计算结果
|
||||
*/
|
||||
public class Example4 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
|
||||
|
||||
SingleOutputStreamOperator<String> result = stream
|
||||
.map(new MapFunction<String, Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r -> r.f0)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
|
||||
.allowedLateness(Time.seconds(5)) //允许等待迟到事件5秒
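//window state is kept for 5 extra seconds after the watermark passes the window end; each late element within that slack re-triggers the window function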
|
||||
.sideOutputLateData(new OutputTag<Tuple2<String, Long>>("late") {
|
||||
}) //5秒以后被销毁的数据被发送到的位置
|
||||
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
|
||||
@Override
|
||||
public void process(String s, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
|
||||
|
||||
//初始化一个窗口状态变量,注意:窗口状态变量的可见范围是当前窗口
|
||||
ValueState<Boolean> firstCalculate = context.windowState().getState(new ValueStateDescriptor<Boolean>("first", Types.BOOLEAN));
|
||||
|
||||
if (firstCalculate.value() == null) {
|
||||
collector.collect("窗口第一次触发计算了!水位线是:" + context.currentWatermark() + "窗口中共有" + iterable.spliterator().getExactSizeIfKnown() + "条数据");
|
||||
firstCalculate.update(true); //第一次触发process执行以后,更新为true
|
||||
} else {
|
||||
|
||||
collector.collect("迟到数据到了,更新以后的计算结果是:" + iterable.spliterator().getExactSizeIfKnown());
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
result.print("主输出流:");
|
||||
|
||||
result.getSideOutput(new OutputTag<Tuple2<String,Long>>("late"){}).print("侧输出流:");
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@@ -0,0 +1,81 @@
|
|||
package day05;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.*;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
/**
|
||||
* 自定义水位线的产生
|
||||
*/
|
||||
public class Example5 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
|
||||
|
||||
stream
|
||||
.map(new MapFunction<String, Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(new CustomWatermarkGenerator())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class CustomWatermarkGenerator implements WatermarkStrategy<Tuple2<String, Long>> {
|
||||
|
||||
//用来告诉时间戳是哪一个字段
|
||||
@Override
|
||||
public TimestampAssigner<Tuple2<String, Long>> createTimestampAssigner(TimestampAssignerSupplier.Context context) {
|
||||
|
||||
return new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long l) {
|
||||
return element.f1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public WatermarkGenerator<Tuple2<String, Long>> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
|
||||
|
||||
return new WatermarkGenerator<Tuple2<String, Long>>() {
|
||||
//最大延迟时间
|
||||
private Long bound = 500L;
|
||||
//最大时间戳,防止溢出
|
||||
private Long maxTs = -Long.MAX_VALUE + bound + 1L;
|
||||
|
||||
//更新时间戳
|
||||
@Override
|
||||
public void onEvent(Tuple2<String, Long> event, long l, WatermarkOutput watermarkOutput) {
|
||||
|
||||
maxTs =Math.max(maxTs,event.f1); //更新观察到的最大事件时间
|
||||
}
|
||||
|
||||
//周期性产生水位线
|
||||
@Override
|
||||
public void onPeriodicEmit(WatermarkOutput watermarkOutput) {
|
||||
//发送水位线,注意水位线的计算公式
|
||||
watermarkOutput.emitWatermark(new Watermark(maxTs-bound-1L));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
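The emitted value maxTs - bound - 1 mirrors the built-in bounded-out-of-orderness strategy: it asserts that no further events with a timestamp at or below that value are expected. For example, after seeing "a 7" (7000 ms) the next periodic emit sends a watermark of 7000 - 500 - 1 = 6499. A small hedged sketch of tuning how often onPeriodicEmit runs (setAutoWatermarkInterval is an existing ExecutionConfig setting; 100 ms is just an illustrative value):

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.getConfig().setAutoWatermarkInterval(100L); //how often onPeriodicEmit is invoked (default is 200 ms)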
@@ -0,0 +1,35 @@
package day05;
|
||||
|
||||
import org.apache.flink.streaming.api.datastream.DataStream;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
/**
|
||||
* 多流合并算子Union
|
||||
* 有两个要求:
|
||||
* 1.多条流的合并
|
||||
* 2.所有流中的时间类型必须是一样的
|
||||
*/
|
||||
public class Example6 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Integer> stream1 = env.fromElements(1, 2);
|
||||
DataStreamSource<Integer> stream2 = env.fromElements(3, 4);
|
||||
DataStreamSource<Integer> stream3 = env.fromElements(5, 6);
|
||||
|
||||
DataStream<Integer> result = stream1.union(stream2, stream3);
|
||||
|
||||
result.print(); //3 4 5 6 1 2
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
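union only accepts streams whose element type matches exactly, and the relative order of elements coming from different sources is not guaranteed. A hedged sketch of handling a stream of a different type (stream4 and the extra MapFunction import are illustrative assumptions, not part of the original file):

        DataStreamSource<String> stream4 = env.fromElements("7", "8");
        DataStream<Integer> unioned = stream1.union(
                stream2,
                stream3,
                stream4.map(new MapFunction<String, Integer>() {
                    @Override
                    public Integer map(String value) throws Exception {
                        //convert to the common element type before the union
                        return Integer.valueOf(value);
                    }
                })
        );
        unioned.print();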
@@ -0,0 +1,64 @@
package day05;
|
||||
|
||||
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
/**
|
||||
* 分流水位线传递测试
|
||||
*/
|
||||
public class Example7 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
env
|
||||
.socketTextStream("localhost",9999)
|
||||
.map(new MapFunction<String, Tuple2<String,Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0],Long.parseLong(arr[1])*1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long l) {
|
||||
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r -> r.f0)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
|
||||
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
|
||||
@Override
|
||||
public void process(String s, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
|
||||
|
||||
collector.collect("key:"+s+"的窗口触发了"+"当前的水位线是:"+context.currentWatermark());
|
||||
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
env.execute();
|
||||
}
|
||||
|
||||
}
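A hedged walk-through of why several keys can fire at once: the watermark is broadcast to every parallel instance after keyBy, so event time advances for all keys together even if only one key received new data (assuming parallelism 1 as set above):

        // "a 1", "b 2"  -> two [0, 5s) windows are created, one per key; nothing fires yet
        // "a 6"         -> the watermark reaches 5999 ms and BOTH the key "a" and the key "b" windows fire,
        //                  each printing the same currentWatermark() value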
@@ -0,0 +1,82 @@
package day05;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.ProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
/**
|
||||
* 合流水位线传递规则
|
||||
* 传递小的水位线
|
||||
*/
|
||||
public class Example8 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
//第一条流
|
||||
SingleOutputStreamOperator<Tuple2<String, Long>> stream1 = env
|
||||
.socketTextStream("localhost", 9999)
|
||||
.map(new MapFunction<String, Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long l) {
|
||||
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
//第二条流
|
||||
SingleOutputStreamOperator<Tuple2<String, Long>> stream2 = env
|
||||
.socketTextStream("localhost", 9998)
|
||||
.map(new MapFunction<String, Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public Tuple2<String, Long> map(String s) throws Exception {
|
||||
String[] arr = s.split(" ");
|
||||
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
|
||||
}
|
||||
})
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Long> element, long l) {
|
||||
|
||||
return element.f1;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
stream1.union(stream2)
|
||||
.process(new ProcessFunction<Tuple2<String, Long>, String>() {
|
||||
@Override
|
||||
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
collector.collect("当前水位线是:"+ctx.timerService().currentWatermark());
|
||||
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
env.execute();
|
||||
}
|
||||
}
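When streams are combined, the downstream event-time clock is the minimum of the input watermarks, so the slower stream holds back timers and windows. A hedged trace with the two socket ports (values are approximate and depend on the periodic watermark interval):

        // "a 10" on port 9999 only -> the printed watermark stays at Long.MIN_VALUE (-9223372036854775808),
        //                             because port 9998 has not produced a watermark yet
        // "a 20" on port 9998      -> the combined watermark rises to roughly 9999 ms,
        //                             i.e. the smaller of the two input watermarks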
@@ -0,0 +1,134 @@
package day05;
|
||||
|
||||
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.Calendar;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* 双流join模式另一个算子connect
|
||||
* connect来连接两条流
|
||||
* 与union的不同:
|
||||
* 1.只能连接两条流
|
||||
* 2.两条流中元素类型可以不同
|
||||
*
|
||||
* 一般用于:
|
||||
* 1. 两条流keyBy
|
||||
* 2. 一条流keyby,一条流广播
|
||||
*/
|
||||
public class Example9 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Event> clickStream = env.addSource(new ClickSource());
|
||||
DataStreamSource<String> queryStream = env.socketTextStream("localhost", 9999).setParallelism(1);
|
||||
|
||||
|
||||
clickStream
|
||||
.keyBy(r ->r.user)
|
||||
.connect(queryStream.broadcast())
|
||||
//new CoFlatMapFunction<第一条流的数据类型, 第二条流的数据类型, 输出类型>
|
||||
.flatMap(new CoFlatMapFunction<Event, String, Event>() {
|
||||
|
||||
//TODO 当第一条流元素来时进入flatMap1,第二条流元素来时进入flatMap2
|
||||
|
||||
private String query = "";
|
||||
|
||||
@Override
|
||||
public void flatMap1(Event value, Collector<Event> collector) throws Exception {
|
||||
if(value.url.equals(query)){
|
||||
collector.collect(value);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flatMap2(String value, Collector<Event> collector) throws Exception {
|
||||
query = value;
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
//Event{user='Mary', url='./cart', timestamp=2022-01-06 14:27:05.647}
|
||||
//Event{user='liz', url='./cart', timestamp=2022-01-06 14:27:22.655}
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
|
||||
//sourceFunction并行度只能为1
|
||||
//自定义并行化版本的数据源,需要使用ParallelSourceFunction
|
||||
public static class ClickSource implements SourceFunction<Event> {
|
||||
|
||||
private boolean running = true;
|
||||
private String[] userArr ={"Mary","Bob","Alice","liz"};
|
||||
private String[] urlArr = {"./home","./cart","./fav","./prod?id=1","./prod?id=2"};
|
||||
private Random random = new Random();
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
//向下游发送数据
|
||||
while(running){
|
||||
//ctx上下文对象
|
||||
//collect方法,向下游发送数据
|
||||
ctx.collect(
|
||||
new Event(
|
||||
userArr[random.nextInt(userArr.length)],
|
||||
urlArr[random.nextInt(urlArr.length)],
|
||||
Calendar.getInstance().getTimeInMillis()
|
||||
)
|
||||
|
||||
);
|
||||
Thread.sleep(1000L);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
running = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 自定义POJO类
|
||||
*/
|
||||
public static class Event{
|
||||
public String user;
|
||||
public String url;
|
||||
public Long timestamp;
|
||||
|
||||
public Event(){
|
||||
|
||||
}
|
||||
|
||||
public Event(String user, String url, Long timestamp) {
|
||||
this.user = user;
|
||||
this.url = url;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", url='" + url + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
}
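The query field above lives separately in every parallel instance, which is why the rule stream has to be broadcast. Flink also offers broadcast state for this pattern; the sketch below is a hedged illustration (the descriptor name and the extra imports MapStateDescriptor, Types, BroadcastStream and BroadcastProcessFunction are assumptions, not part of the original file):

        MapStateDescriptor<String, String> ruleDescriptor =
                new MapStateDescriptor<>("query-rule", Types.STRING, Types.STRING);
        BroadcastStream<String> ruleStream = queryStream.broadcast(ruleDescriptor);
        clickStream
                .connect(ruleStream)
                .process(new BroadcastProcessFunction<Event, String, Event>() {
                    @Override
                    public void processElement(Event value, ReadOnlyContext ctx, Collector<Event> out) throws Exception {
                        //read side: look up the query string that was last broadcast
                        String query = ctx.getBroadcastState(ruleDescriptor).get("q");
                        if (query != null && value.url.equals(query)) {
                            out.collect(value);
                        }
                    }

                    @Override
                    public void processBroadcastElement(String value, Context ctx, Collector<Event> out) throws Exception {
                        //write side: every parallel instance stores the new query
                        ctx.getBroadcastState(ruleDescriptor).put("q", value);
                    }
                })
                .print();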
@@ -0,0 +1,88 @@
package day06;
|
||||
|
||||
import org.apache.flink.api.common.state.ListState;
|
||||
import org.apache.flink.api.common.state.ListStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
//在FlinkSQL里面有SELECT * FROM A INNER JOIN B WHERE A.id=B.id;
|
||||
//其中存在笛卡尔积,这种在flink的流处理里面是如何实现的呢?
|
||||
|
||||
/**
|
||||
* CoProcessFunction的使用
|
||||
* 通过API实现等值内连接
|
||||
*/
|
||||
public class Example1 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
DataStreamSource<Tuple2<String, Integer>> stream1 = env
|
||||
.fromElements(
|
||||
Tuple2.of("a", 1),
|
||||
Tuple2.of("b", 2),
|
||||
Tuple2.of("a", 2)
|
||||
);
|
||||
DataStreamSource<Tuple2<String, String>> stream2 = env
|
||||
.fromElements(
|
||||
Tuple2.of("a", "a"),
|
||||
Tuple2.of("b", "b"),
|
||||
Tuple2.of("a", "aaaa")
|
||||
);
|
||||
|
||||
stream1
|
||||
.keyBy(r -> r.f0)
|
||||
.connect(stream2.keyBy(r -> r.f0))
|
||||
//CoProcessFunction<第一条流的泛型, 第二条流的泛型, 输出>
|
||||
.process(new CoProcessFunction<Tuple2<String, Integer>, Tuple2<String, String>, String>() {
|
||||
|
||||
//分别保存两条流的数据
|
||||
private ListState<Tuple2<String, Integer>> listState1;
|
||||
private ListState<Tuple2<String, String>> listState2;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
listState1 = getRuntimeContext().getListState(
|
||||
new ListStateDescriptor<Tuple2<String, Integer>>("list1", Types.TUPLE(Types.STRING, Types.INT))
|
||||
);
|
||||
|
||||
listState2 = getRuntimeContext().getListState(
|
||||
new ListStateDescriptor<Tuple2<String, String>>("list2", Types.TUPLE(Types.STRING, Types.STRING))
|
||||
);
|
||||
}
|
||||
|
||||
//用来处理第一条流的数据
|
||||
@Override
|
||||
public void processElement1(Tuple2<String, Integer> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
listState1.add(value);
|
||||
for (Tuple2<String, String> e : listState2.get()) {
|
||||
collector.collect(value + "=>" + e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//用来处理第二条流的数据
|
||||
@Override
|
||||
public void processElement2(Tuple2<String, String> value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
listState2.add(value);
|
||||
for (Tuple2<String, Integer> e : listState1.get()) {
|
||||
collector.collect(e + "=>" + value);
|
||||
}
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
env.execute();
|
||||
}
|
||||
|
||||
}
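listState1 and listState2 grow without bound, so a long-running job would keep every record of both streams forever. A hedged sketch of limiting that with state TTL inside open() (StateTtlConfig is an existing Flink API, its import is assumed, and the one-hour value is illustrative):

            StateTtlConfig ttlConfig = StateTtlConfig
                    .newBuilder(org.apache.flink.api.common.time.Time.hours(1))
                    .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
                    .build();
            ListStateDescriptor<Tuple2<String, Integer>> descriptor1 =
                    new ListStateDescriptor<>("list1", Types.TUPLE(Types.STRING, Types.INT));
            descriptor1.enableTimeToLive(ttlConfig); //entries expire one hour after they are written
            listState1 = getRuntimeContext().getListState(descriptor1);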
@@ -0,0 +1,161 @@
package day06;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
/**
|
||||
* 实时对账
|
||||
*/
|
||||
public class Example2 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<Event> orderStream = env.fromElements(
|
||||
Event.of("order-1", "order", 1000L),
|
||||
Event.of("order-2", "order", 2000L)
|
||||
).assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event element, long l) {
|
||||
|
||||
return element.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
SingleOutputStreamOperator<Event> weixinStream = env.fromElements(
|
||||
Event.of("order-1", "weixin", 30000L),
|
||||
Event.of("order-3", "weixin", 4000L)
|
||||
).assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event element, long l) {
|
||||
|
||||
return element.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
orderStream.keyBy(r -> r.orderId)
|
||||
.connect(weixinStream.keyBy(r -> r.orderId))
|
||||
.process(new MatchFunction())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class MatchFunction extends CoProcessFunction<Event,Event,String>{
|
||||
|
||||
//初始化两个状态变量,一个用来保存下订单事件,一个用来保存微信的支付时间
|
||||
private ValueState<Event> orderState;
|
||||
private ValueState<Event> weixinState;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
orderState =getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Event>("orderState", Types.POJO(Event.class))
|
||||
);
|
||||
weixinState =getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Event>("weixinState", Types.POJO(Event.class))
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement1(Event value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if(weixinState.value() == null){
|
||||
|
||||
//下订单order事件先到达,因为如果weixin事件先到达,那么就不为空了
|
||||
orderState.update(value);
|
||||
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
|
||||
|
||||
}else {
|
||||
|
||||
//如果不为空,且到达了这里,证明对账成功,直接输出
|
||||
collector.collect("订单ID是"+value.orderId+"对账成功,微信事件先到达");
|
||||
weixinState.clear();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement2(Event value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if(orderState.value() == null){
|
||||
weixinState.update(value);
|
||||
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
|
||||
}else {
|
||||
collector.collect("订单ID是:"+value.orderId+"对账成功,order事件先到达");
|
||||
orderState.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
if(orderState.value() !=null){
|
||||
out.collect("订单ID"+orderState.value().orderId+"对账失败,微信事件5s内未到达");
|
||||
orderState.clear();
|
||||
}
|
||||
if(weixinState.value() !=null){
|
||||
out.collect("订单ID"+weixinState.value().orderId+"对账失败,订单事件5s内未到达");
|
||||
weixinState.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static class Event{
|
||||
|
||||
public String orderId;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String orderId, String eventType, Long timestamp) {
|
||||
this.orderId = orderId;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
public static Event of(String orderId, String eventType, Long timestamp){
|
||||
return new Event(orderId,eventType,timestamp);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"orderId='" + orderId + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + timestamp +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
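When the two events are matched, the timer registered by the earlier event is still pending; it fires later, finds both states already cleared and outputs nothing, which is harmless but wasteful. A hedged refinement for the else-branch of processElement1 (the mirror change applies to processElement2):

                //cancel the timer that the weixin event registered before clearing its state
                ctx.timerService().deleteEventTimeTimer(weixinState.value().timestamp + 5000L);
                weixinState.clear();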
@@ -0,0 +1,173 @@
package day06;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.time.Time;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.streaming.api.watermark.Watermark;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 实时对账 ——模拟对账不成功
|
||||
*/
|
||||
public class Example3 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<Event> orderStream = env
|
||||
.addSource(new SourceFunction<Event>() {
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
ctx.collectWithTimestamp(Event.of("order-1", "order", 1000L), 1000L);
|
||||
ctx.emitWatermark(new Watermark(999L));
|
||||
ctx.collectWithTimestamp(Event.of("order-2", "order", 3000L), 3000L);
|
||||
ctx.emitWatermark(new Watermark(8001L));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
SingleOutputStreamOperator<Event> weixinStream = env
|
||||
.addSource(new SourceFunction<Event>() {
|
||||
@Override
|
||||
public void run(SourceContext<Event> ctx) throws Exception {
|
||||
ctx.collectWithTimestamp(Event.of("order-1", "weixin", 4000L), 4000L);
|
||||
ctx.emitWatermark(new Watermark(3999L));
|
||||
ctx.emitWatermark(new Watermark(8001L));
|
||||
ctx.collectWithTimestamp(Event.of("order-2", "weixin", 9000L), 9000L);
|
||||
//ctx.emitWatermark(new Watermark(4000L));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
|
||||
orderStream.keyBy(r -> r.orderId)
|
||||
.connect(weixinStream.keyBy(r -> r.orderId))
|
||||
.process(new MatchFunction())
|
||||
.print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class MatchFunction extends CoProcessFunction<Event,Event,String>{
|
||||
|
||||
//初始化两个状态变量,一个用来保存下订单事件,一个用来保存微信的支付时间
|
||||
private ValueState<Event> orderState;
|
||||
private ValueState<Event> weixinState;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
orderState =getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Event>("orderState", Types.POJO(Event.class))
|
||||
);
|
||||
weixinState =getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<Event>("weixinState", Types.POJO(Event.class))
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement1(Event value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if(weixinState.value() == null){
|
||||
|
||||
//下订单order事件先到达,因为如果weixin事件先到达,那么就不为空了
|
||||
orderState.update(value);
|
||||
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
|
||||
|
||||
}else {
|
||||
|
||||
//如果不为空,且到达了这里,证明对账成功,直接输出
|
||||
collector.collect("订单ID是"+value.orderId+"对账成功,微信事件先到达");
|
||||
weixinState.clear();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement2(Event value, Context ctx, Collector<String> collector) throws Exception {
|
||||
|
||||
if(orderState.value() == null){
|
||||
weixinState.update(value);
|
||||
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
|
||||
}else {
|
||||
collector.collect("订单ID是:"+value.orderId+"对账成功,order事件先到达");
|
||||
orderState.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
|
||||
|
||||
if(orderState.value() !=null){
|
||||
out.collect("订单ID"+orderState.value().orderId+"对账失败,微信事件5s内未到达");
|
||||
orderState.clear();
|
||||
}
|
||||
if(weixinState.value() !=null){
|
||||
out.collect("订单ID"+weixinState.value().orderId+"对账失败,订单事件5s内未到达");
|
||||
weixinState.clear();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static class Event{
|
||||
|
||||
public String orderId;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String orderId, String eventType, Long timestamp) {
|
||||
this.orderId = orderId;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
public static Event of(String orderId, String eventType, Long timestamp){
|
||||
return new Event(orderId,eventType,timestamp);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"orderId='" + orderId + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + timestamp +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
@@ -0,0 +1,117 @@
package day06;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
|
||||
/**
|
||||
* 基于间隔的join
|
||||
*/
|
||||
public class Example4 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<Event> orderStream = env
|
||||
.fromElements(
|
||||
Event.of("user-1", "order", 20 * 60 * 1000L)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event event, long l) {
|
||||
|
||||
return event.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
SingleOutputStreamOperator<Event> pvStream = env.fromElements(
|
||||
Event.of("user-1", "pv", 5 * 60 * 1000L),
|
||||
Event.of("user-1", "pv", 10 * 60 * 1000L),
|
||||
Event.of("user-1", "pv", 12 * 60 * 1000L),
|
||||
Event.of("user-1", "pv", 22 * 60 * 1000L)
|
||||
).assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event event, long l) {
|
||||
|
||||
return event.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
orderStream.keyBy(r ->r.userId)
|
||||
.intervalJoin(pvStream.keyBy(r -> r.userId))
|
||||
//第一条流(orderStream)和第二条流(pvStream)的哪一段join
//order事件时间的前10分钟到后5分钟以内的pv事件
.between(Time.minutes(-10),Time.minutes(5))
|
||||
.process(new ProcessJoinFunction<Event, Event, String>() {
|
||||
@Override
|
||||
public void processElement(Event left, Event right, Context ctx, Collector<String> collector) throws Exception {
|
||||
collector.collect(left + "=>"+right);
|
||||
}
|
||||
})
|
||||
.print("orderStream join pvStream");
|
||||
|
||||
pvStream.keyBy(r ->r.userId)
|
||||
.intervalJoin(orderStream.keyBy(r -> r.userId))
|
||||
//第一条流(pvStream)和第二条流(orderStream)的哪一段join
//pv事件时间的前5分钟到后10分钟以内的order事件
.between(Time.minutes(-5),Time.minutes(10))
|
||||
.process(new ProcessJoinFunction<Event, Event, String>() {
|
||||
@Override
|
||||
public void processElement(Event left, Event right, Context ctx, Collector<String> collector) throws Exception {
|
||||
collector.collect(right + "=>"+left);
|
||||
}
|
||||
})
|
||||
.print("pvStream join orderStream");
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static class Event{
|
||||
|
||||
public String userId;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String userId, String eventType, Long timestamp) {
|
||||
this.userId = userId;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
public static Event of(String userId, String eventType, Long timestamp){
|
||||
return new Event(userId,eventType,timestamp);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + new Timestamp(timestamp) +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
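between(lower, upper) keeps a pair when left.timestamp + lower <= right.timestamp <= left.timestamp + upper, with both bounds inclusive. A worked check against the data above:

        // order at minute 20 with between(-10 min, +5 min) -> joins pv events in [minute 10, minute 25]
        //   pv events at minutes 10, 12 and 22 match; the pv event at minute 5 falls outside the interval
        // the second join (pv as the left side, between(-5 min, +10 min)) describes the same interval
        //   from the other direction, so both print statements emit the same three pairs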
@@ -0,0 +1,72 @@
package day06;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
|
||||
import org.apache.flink.api.common.functions.JoinFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
|
||||
|
||||
/**
|
||||
* 基于窗口的join
|
||||
*/
|
||||
public class Example5 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<Tuple2<String, Integer>> stream1 = env.fromElements(
|
||||
Tuple2.of("a", 1), Tuple2.of("b", 1)
|
||||
).assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Integer>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Integer>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Integer> stringIntegerTuple2, long l) {
|
||||
|
||||
return stringIntegerTuple2.f1;
|
||||
}
|
||||
})
|
||||
);
|
||||
SingleOutputStreamOperator<Tuple2<String, Integer>> stream2 = env.fromElements(
|
||||
Tuple2.of("a", 2), Tuple2.of("b", 2),Tuple2.of("b", 3)
|
||||
).assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple2<String, Integer>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Integer>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple2<String, Integer> stringIntegerTuple2, long l) {
|
||||
|
||||
return stringIntegerTuple2.f1;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
stream1
|
||||
.join(stream2)
|
||||
.where(r -> r.f0)
|
||||
.equalTo(r -> r.f0)
|
||||
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
|
||||
.apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
|
||||
@Override
|
||||
public String join(Tuple2<String, Integer> first, Tuple2<String, Integer> second) throws Exception {
|
||||
return first +"=>"+second;
|
||||
}
|
||||
})
|
||||
.print();
|
||||
//基于相同窗口的笛卡尔积:
|
||||
//(a,1)=>(a,2)
|
||||
//(b,1)=>(b,2)
|
||||
//(b,1)=>(b,3)
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
}
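A window join only emits matched (inner-join) pairs; keys that appear on just one side produce no output. A hedged coGroup sketch that also exposes the unmatched side (the CoGroupFunction and Collector imports are assumed, not in the original file):

        stream1
                .coGroup(stream2)
                .where(r -> r.f0)
                .equalTo(r -> r.f0)
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .apply(new CoGroupFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
                    @Override
                    public void coGroup(Iterable<Tuple2<String, Integer>> first,
                                        Iterable<Tuple2<String, Integer>> second,
                                        Collector<String> out) throws Exception {
                        //both buffers are handed over, even when one of them is empty
                        out.collect("left=" + first + " right=" + second);
                    }
                })
                .print();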
@@ -0,0 +1,29 @@
package day06;
|
||||
|
||||
import day05.Example9;
|
||||
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
/**
|
||||
* 设置检查点的保存位置
|
||||
*/
|
||||
public class Example6 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
//保存检查点到指定文件
|
||||
env.setStateBackend(new FsStateBackend("file:///E:\\Big_data_example\\Flink\\src\\main\\resources\\ckpt",false));
|
||||
//隔多久保存一次
|
||||
env.enableCheckpointing(10*1000L);
|
||||
env
|
||||
.addSource(new Example9.ClickSource())
|
||||
.print();
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
}
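A few options that are commonly set together with the checkpoint interval, as a hedged sketch (CheckpointingMode needs an extra import; note that FsStateBackend is the legacy API, which newer Flink versions split into a state backend plus checkpoint storage):

        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500L); //gap between the end of one checkpoint and the start of the next
        env.getCheckpointConfig().setCheckpointTimeout(60 * 1000L);    //abort a checkpoint that takes longer than one minute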
@@ -0,0 +1,29 @@
package day06;
|
||||
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
/**
|
||||
* 一致性检查点
|
||||
*/
|
||||
public class Example7 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.fromElements(1,2,3,4,5)
|
||||
.keyBy(r -> r%2)
|
||||
.sum(0)
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
@@ -0,0 +1,38 @@
package day07;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.serialization.SimpleStringSchema;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* flink写入kafka
|
||||
*/
|
||||
public class Example1 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
Properties properties =new Properties();
|
||||
properties.put("bootstrap.servers","Ding202:9092");
|
||||
|
||||
env
|
||||
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
|
||||
.addSink(new FlinkKafkaProducer<String>(
|
||||
//Topic
|
||||
"dingjiawen1",
|
||||
//写入的数据类型
|
||||
new SimpleStringSchema(),
|
||||
//kafka producer的相关位置等配置信息
|
||||
properties
|
||||
));
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
}
@@ -0,0 +1,162 @@
package day07;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.common.serialization.SimpleStringSchema;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* 读取kafka中的数据
|
||||
*/
|
||||
public class Example2 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
Properties properties = new Properties();
|
||||
//kafka消费者的位置
|
||||
properties.setProperty("bootstrap.servers", "Ding202:9092");
|
||||
//消费者组
|
||||
properties.setProperty("group.id", "consumer-group");
|
||||
//key和value的反序列化机制
|
||||
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
|
||||
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
|
||||
//没有已提交偏移量时从何处开始消费(latest表示从最新数据开始)
properties.setProperty("auto.offset.reset", "latest");
|
||||
|
||||
env
|
||||
.addSource(
|
||||
new FlinkKafkaConsumer<String>(
|
||||
"dingjiawen1",
|
||||
new SimpleStringSchema(),
|
||||
properties
|
||||
)
|
||||
)
|
||||
.map(new MapFunction<String, UserBehavior>() {
|
||||
@Override
|
||||
public UserBehavior map(String s) throws Exception {
|
||||
|
||||
String[] arr = s.split(",");
|
||||
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
|
||||
|
||||
}
|
||||
})
|
||||
.filter(r -> r.behavior.equals("pv"))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
|
||||
@Override
|
||||
public long extractTimestamp(UserBehavior userBehavior, long l) {
|
||||
|
||||
return userBehavior.timeStamp;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy(r -> true)
|
||||
.window(SlidingEventTimeWindows.of(Time.hours(1),Time.minutes(5)))
|
||||
//注意:keyBy(r -> true)会把全部数据发到同一个并行子任务,失去了分布式计算的能力
//先按itemId做keyBy、再按窗口结束时间keyBy的实现,可以把不同分组分散到不同的slot并行计算
//而这种把整窗数据缓存在ProcessWindowFunction里的写法,数据量大时容易内存溢出
|
||||
.process(new ProcessWindowFunction<UserBehavior, String, Boolean, TimeWindow>() {
|
||||
@Override
|
||||
public void process(Boolean aBoolean, Context context, Iterable<UserBehavior> iterable, Collector<String> collector) throws Exception {
|
||||
|
||||
//使用一个map来存储不同商品及其点击次数
|
||||
HashMap<String, Long> hashMap = new HashMap<>();
|
||||
for (UserBehavior e : iterable) {
|
||||
if(hashMap.containsKey(e.itemId)){
|
||||
hashMap.put(e.itemId,hashMap.get(e.itemId)+1L);
|
||||
}else {
|
||||
hashMap.put(e.itemId,1L);
|
||||
}
|
||||
}
|
||||
//构造一个list来排序
|
||||
ArrayList<Tuple2<String,Long>> arrayList =new ArrayList<Tuple2<String, Long>>();
|
||||
for (String key : hashMap.keySet()) {
|
||||
arrayList.add(Tuple2.of(key,hashMap.get(key)));
|
||||
}
|
||||
|
||||
arrayList.sort(new Comparator<Tuple2<String, Long>>() {
|
||||
@Override
|
||||
public int compare(Tuple2<String, Long> t1, Tuple2<String, Long> t2) {
|
||||
return t2.f1.intValue()-t1.f1.intValue();
|
||||
}
|
||||
});
|
||||
|
||||
StringBuilder result = new StringBuilder();
|
||||
result
|
||||
.append("========================\n")
|
||||
.append("窗口"+new Timestamp(context.window().getStart())+"~"+new Timestamp(context.window().getEnd()))
|
||||
.append("\n");
|
||||
|
||||
//防止窗口内商品种类不足3个时数组越界
for (int i = 0; i < Math.min(3, arrayList.size()); i++) {
|
||||
Tuple2<String, Long> currElement = arrayList.get(i);
|
||||
result.append("第"+(i+1)+"名的商品ID是"+currElement.f0+";浏览次数是:"+currElement.f1)
|
||||
.append("\n");
|
||||
|
||||
}
|
||||
collector.collect(result.toString());
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户行为POJO类
|
||||
*/
|
||||
public static class UserBehavior{
|
||||
public String userId;
|
||||
public String itemId;
|
||||
public String categoryId;
|
||||
public String behavior;
|
||||
public Long timeStamp;
|
||||
|
||||
public UserBehavior(){
|
||||
|
||||
}
|
||||
|
||||
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
|
||||
this.userId = userId;
|
||||
this.itemId = itemId;
|
||||
this.categoryId = categoryId;
|
||||
this.behavior = behavior;
|
||||
this.timeStamp = timeStamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UserBehavior{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", itemId='" + itemId + '\'' +
|
||||
", categoryId='" + categoryId + '\'' +
|
||||
", behavior='" + behavior + '\'' +
|
||||
", timeStamp=" + new Timestamp(timeStamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
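As the comment inside process() notes, buffering a whole sliding window inside one ProcessWindowFunction is memory-hungry. The incremental alternative is to keyBy(itemId), count with an AggregateFunction, and rank the per-item counts in a second step keyed by window end. A hedged sketch of the counting step only (stream stands for the DataStream<UserBehavior> built above, and the AggregateFunction import is assumed):

        stream
                .keyBy(r -> r.itemId)
                .window(SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(5)))
                .aggregate(new AggregateFunction<UserBehavior, Long, Long>() {
                    @Override public Long createAccumulator() { return 0L; }
                    @Override public Long add(UserBehavior value, Long acc) { return acc + 1L; } //one count per click
                    @Override public Long getResult(Long acc) { return acc; }
                    @Override public Long merge(Long a, Long b) { return a + b; }
                });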
@@ -0,0 +1,141 @@
package day08;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
import java.util.HashSet;
|
||||
|
||||
|
||||
/**
|
||||
* 独立访客数量UV
|
||||
* 即按用户ID去重之后的访问量
|
||||
*/
|
||||
public class Example1 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
|
||||
.map(new MapFunction<String, UserBehavior>() {
|
||||
@Override
|
||||
public UserBehavior map(String value) throws Exception {
|
||||
|
||||
String[] arr = value.split(",");
|
||||
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
|
||||
|
||||
}
|
||||
})
|
||||
.filter(r -> r.behavior.equals("pv"))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
|
||||
@Override
|
||||
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
|
||||
return element.timeStamp;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy( r ->true)
|
||||
.window(TumblingEventTimeWindows.of(Time.hours(1)))
|
||||
.aggregate(new CountAgg(),new WindowResult())
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class WindowResult extends ProcessWindowFunction<Long,String,Boolean, TimeWindow>{
|
||||
|
||||
@Override
|
||||
public void process(Boolean aBoolean, Context context, Iterable<Long> iterable, Collector<String> collector) throws Exception {
|
||||
String windowStart = new Timestamp(context.window().getStart()).toString();
|
||||
String windowStop = new Timestamp(context.window().getEnd()).toString();
|
||||
Long count = iterable.iterator().next();
|
||||
|
||||
collector.collect("窗口"+windowStart+"~"+windowStop+"的独立访客的数量为:"+count);
|
||||
}
|
||||
}
|
||||
|
||||
//实现去重
|
||||
public static class CountAgg implements AggregateFunction<UserBehavior, HashSet<String>,Long>{
|
||||
|
||||
//HashSet的实现方式会把每一个访客ID都保存在内存里,访客数量很大时内存占用会非常高,需要考虑优化
//优化方式:使用布隆过滤器(见本包下的Example2)
|
||||
@Override
|
||||
public HashSet<String> createAccumulator() {
|
||||
return new HashSet<String>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HashSet<String> add(UserBehavior userBehavior, HashSet<String> accumulator) {
|
||||
accumulator.add(userBehavior.userId);
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getResult(HashSet<String> strings) {
|
||||
return (long)strings.size();
|
||||
}
|
||||
|
||||
//merge只有在需要合并窗口(例如会话窗口)时才会被调用,这里使用滚动窗口,所以不做实现
@Override
public HashSet<String> merge(HashSet<String> strings, HashSet<String> acc1) {
return null;
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 用户行为POJO类
|
||||
*/
|
||||
public static class UserBehavior{
|
||||
public String userId;
|
||||
public String itemId;
|
||||
public String categoryId;
|
||||
public String behavior;
|
||||
public Long timeStamp;
|
||||
|
||||
public UserBehavior(){
|
||||
|
||||
}
|
||||
|
||||
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
|
||||
this.userId = userId;
|
||||
this.itemId = itemId;
|
||||
this.categoryId = categoryId;
|
||||
this.behavior = behavior;
|
||||
this.timeStamp = timeStamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UserBehavior{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", itemId='" + itemId + '\'' +
|
||||
", categoryId='" + categoryId + '\'' +
|
||||
", behavior='" + behavior + '\'' +
|
||||
", timeStamp=" + new Timestamp(timeStamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
}
@@ -0,0 +1,129 @@
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.table.api.EnvironmentSettings;
|
||||
import org.apache.flink.table.api.Table;
|
||||
import org.apache.flink.table.api.TableEnvironment;
|
||||
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
|
||||
import static org.apache.flink.table.api.Expressions.$;
|
||||
|
||||
/**
|
||||
* 使用Flink SQL实现实时TOPN
|
||||
*/
|
||||
public class Example10 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception{
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<UserBehavior> stream = env
|
||||
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
|
||||
.map(
|
||||
new MapFunction<String, UserBehavior>() {
|
||||
@Override
|
||||
public UserBehavior map(String s) throws Exception {
|
||||
String[] arr = s.split(",");
|
||||
|
||||
return new UserBehavior(arr[0], arr[1], arr[2], arr[3], Long.parseLong(arr[4]));
|
||||
}
|
||||
}
|
||||
)
|
||||
.filter(r -> r.behavior.equals("pv"))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
|
||||
@Override
|
||||
public long extractTimestamp(UserBehavior userBehavior, long l) {
|
||||
|
||||
return userBehavior.timeStamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
//注册表环境
|
||||
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
|
||||
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
|
||||
|
||||
//将数据流转化为动态表
|
||||
Table table = tableEnvironment
|
||||
.fromDataStream(
|
||||
stream,
|
||||
$("userId"),
|
||||
$("itemId"),
|
||||
$("categoryId"),
|
||||
$("behavior"),
|
||||
$("timeStamp").rowtime().as("ts")
|
||||
);
|
||||
|
||||
tableEnvironment.createTemporaryView("userBehavior",table);
|
||||
|
||||
//按照itemId和滑动窗口进行分组
|
||||
String innerSQL = "select itemId,COUNT(itemId) as cnt ,HOP_END(ts , INTERVAL '5' MINUTE ,INTERVAL '1' HOUR) as windowEnd " +
|
||||
"from userBehavior group by itemId,HOP(ts,INTERVAL '5' MINUTE ,INTERVAL '1' HOUR)";
|
||||
|
||||
//聚合结果在分组-over
|
||||
//按照窗口分组,降序排列
|
||||
String midSQL = "select * ,ROW_NUMBER() OVER(PARTITION BY windowEnd ORDER BY cnt DESC) as row_num "+
|
||||
"FROM ("+innerSQL+")";
|
||||
|
||||
//取出前三名
|
||||
String outerSQL = "select * from ("+midSQL+") WHERE row_num <= 3";
|
||||
|
||||
|
||||
Table itemViewCount = tableEnvironment
|
||||
.sqlQuery(outerSQL);
|
||||
|
||||
tableEnvironment.toChangelogStream(itemViewCount).print();
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户行为POJO类
|
||||
*/
|
||||
public static class UserBehavior{
|
||||
public String userId;
|
||||
public String itemId;
|
||||
public String categoryId;
|
||||
public String behavior;
|
||||
public Long timeStamp;
|
||||
|
||||
public UserBehavior(){
|
||||
|
||||
}
|
||||
|
||||
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
|
||||
this.userId = userId;
|
||||
this.itemId = itemId;
|
||||
this.categoryId = categoryId;
|
||||
this.behavior = behavior;
|
||||
this.timeStamp = timeStamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UserBehavior{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", itemId='" + itemId + '\'' +
|
||||
", categoryId='" + categoryId + '\'' +
|
||||
", behavior='" + behavior + '\'' +
|
||||
", timeStamp=" + new Timestamp(timeStamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
}
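With a 5 minute slide and a 1 hour size, HOP assigns every row to 60 / 5 = 12 overlapping windows; an event at 12:03, for instance, is counted in the windows ending 12:05, 12:10, ..., 13:00. For readability, the three nested queries above assemble into:

        // SELECT * FROM (
        //     SELECT *, ROW_NUMBER() OVER (PARTITION BY windowEnd ORDER BY cnt DESC) AS row_num
        //     FROM (
        //         SELECT itemId, COUNT(itemId) AS cnt,
        //                HOP_END(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR) AS windowEnd
        //         FROM userBehavior
        //         GROUP BY itemId, HOP(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR)
        //     )
        // ) WHERE row_num <= 3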
@@ -0,0 +1,223 @@
package day08;
|
||||
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* 实现一个链表
|
||||
*/
|
||||
public class Example11 {
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
//TODO 链表
|
||||
ListNode node1 = new ListNode(12, null);
|
||||
ListNode node2 = new ListNode(97, null);
|
||||
ListNode node3 = new ListNode(34, null);
|
||||
|
||||
|
||||
node1.next = node2;
|
||||
node2.next = node3;
|
||||
|
||||
ListNode head =node1;
|
||||
|
||||
//链表的遍历
|
||||
while (head != null){
|
||||
|
||||
System.out.println(head.val);
|
||||
head = head.next;
|
||||
}
|
||||
|
||||
System.out.println("===================");
|
||||
|
||||
|
||||
//TODO 树
|
||||
TreeNode root = new TreeNode(5);
|
||||
root.left = new TreeNode(3);
|
||||
root.right = new TreeNode(6);
|
||||
root.left.left = new TreeNode(1);
|
||||
root.left.right = new TreeNode(4);
|
||||
|
||||
//TODO 树形结构:
|
||||
// 5
|
||||
// / \
|
||||
// 3 6
|
||||
// / \
|
||||
// 1 4
|
||||
|
||||
|
||||
//先序遍历
|
||||
preOrderTraversal(root); // 5 3 1 4 6
|
||||
System.out.println("======================");
|
||||
|
||||
//中序遍历
|
||||
midOrderTraversal(root); // 1 3 4 5 6
|
||||
System.out.println("======================");
|
||||
|
||||
//后序遍历
|
||||
postOrderTraversal(root); //1 4 3 6 5
|
||||
System.out.println("======================");
|
||||
|
||||
//查找树
|
||||
Boolean result = treeSearch(root, 2);
|
||||
System.out.println(result);
|
||||
|
||||
//有向有环图
|
||||
GraphNode nodeA = new GraphNode(1);
|
||||
GraphNode nodeB = new GraphNode(2);
|
||||
GraphNode nodeC = new GraphNode(3);
|
||||
|
||||
nodeA.nabeur.add(nodeB);
|
||||
nodeB.nabeur.add(nodeC);
|
||||
nodeC.nabeur.add(nodeA);
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 对于二叉查找树 - 任意节点的左边一定小于该节点,右边一定大于该节点
|
||||
* 利用这一性质,可以递归地判断树中是否包含某个值
|
||||
* @param root
|
||||
* @param val
|
||||
* @return
|
||||
*/
|
||||
public static Boolean treeSearch(TreeNode root , int val){
|
||||
|
||||
if(root == null){
|
||||
return false;
|
||||
}else {
|
||||
if (root.val == val){
|
||||
return true;
|
||||
}else if(root.val < val){
|
||||
return treeSearch(root.right,val);
|
||||
}else {
|
||||
return treeSearch(root.left, val);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 先序遍历
|
||||
* 遍历方式:
|
||||
* 1.遍历根节点;
|
||||
* 2.对左子树进行先序遍历;
|
||||
* 3.对右子数进行先序遍历
|
||||
*/
|
||||
public static void preOrderTraversal(TreeNode root){
|
||||
|
||||
if(root != null){
|
||||
System.out.println(root.val);
|
||||
preOrderTraversal(root.left);
|
||||
preOrderTraversal(root.right);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 中序遍历
|
||||
* 遍历方式:
|
||||
* 1.对左子树进行中序遍历;
|
||||
* 2.遍历根节点;
|
||||
* 3.对右子数进行中序遍历
|
||||
*/
|
||||
public static void midOrderTraversal(TreeNode root){
|
||||
|
||||
if(root != null){
|
||||
midOrderTraversal(root.left);
|
||||
System.out.println(root.val);
|
||||
midOrderTraversal(root.right);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 后序遍历
|
||||
* 遍历方式:
|
||||
* 1.对左子树进行后序遍历;
|
||||
* 2.对右子数进行后序遍历
|
||||
* 3.遍历根节点;
|
||||
*/
|
||||
public static void postOrderTraversal(TreeNode root){
|
||||
|
||||
if(root != null){
|
||||
postOrderTraversal(root.left);
|
||||
postOrderTraversal(root.right);
|
||||
System.out.println(root.val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//扩展成一个树
|
||||
public static class TreeNode{
|
||||
|
||||
public int val;
|
||||
public TreeNode left;
|
||||
public TreeNode right;
|
||||
|
||||
public TreeNode() {
|
||||
}
|
||||
|
||||
public TreeNode(int val) {
|
||||
this.val = val;
|
||||
}
|
||||
|
||||
public TreeNode(int val, TreeNode left, TreeNode right) {
|
||||
this.val = val;
|
||||
this.left = left;
|
||||
this.right = right;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//链表
|
||||
public static class ListNode{
|
||||
|
||||
public int val;
|
||||
public ListNode next;
|
||||
|
||||
public ListNode(){
|
||||
|
||||
}
|
||||
|
||||
public ListNode(int val) {
|
||||
this.val = val;
|
||||
}
|
||||
|
||||
public ListNode(int val, ListNode next) {
|
||||
this.val = val;
|
||||
this.next = next;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 图节点(邻接表表示;上面的示例构造的是一个有环的有向图)
|
||||
*/
|
||||
public static class GraphNode{
|
||||
|
||||
public int val;
|
||||
public ArrayList<GraphNode> nabeur =new ArrayList<>();
|
||||
|
||||
public GraphNode() {
|
||||
}
|
||||
|
||||
public GraphNode(int val) {
|
||||
this.val = val;
|
||||
}
|
||||
|
||||
public GraphNode(int val, ArrayList<GraphNode> nabeur) {
|
||||
this.val = val;
|
||||
this.nabeur = nabeur;
|
||||
}
|
||||
}
|
||||
}
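The graph part only builds the A -> B -> C -> A cycle and never walks it. A hedged breadth-first traversal sketch that could sit next to the tree traversals above (the visited set is what keeps the cycle from looping forever):

    public static void bfs(GraphNode start) {
        java.util.HashSet<GraphNode> visited = new java.util.HashSet<>();
        java.util.ArrayDeque<GraphNode> queue = new java.util.ArrayDeque<>();
        queue.add(start);
        visited.add(start);
        while (!queue.isEmpty()) {
            GraphNode curr = queue.poll();
            System.out.println(curr.val);
            for (GraphNode next : curr.nabeur) {
                if (!visited.contains(next)) { //skip nodes that were already enqueued
                    visited.add(next);
                    queue.add(next);
                }
            }
        }
    }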
@@ -0,0 +1,159 @@
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.functions.AggregateFunction;
|
||||
import org.apache.flink.api.common.functions.MapFunction;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.calcite.shaded.com.google.common.base.Charsets;
|
||||
|
||||
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
|
||||
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
|
||||
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.time.Duration;
|
||||
|
||||
/**
|
||||
* 用布隆过滤器实现去重
|
||||
*/
|
||||
public class Example2 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
env
|
||||
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
|
||||
.map(new MapFunction<String, UserBehavior>() {
|
||||
@Override
|
||||
public UserBehavior map(String value) throws Exception {
|
||||
|
||||
String[] arr = value.split(",");
|
||||
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
|
||||
|
||||
}
|
||||
})
|
||||
.filter(r -> r.behavior.equals("pv"))
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
|
||||
@Override
|
||||
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
|
||||
return element.timeStamp;
|
||||
}
|
||||
})
|
||||
)
|
||||
.keyBy( r ->true)
|
||||
.window(TumblingEventTimeWindows.of(Time.hours(1)))
|
||||
.aggregate(new CountAgg(),new WindowResult())
|
||||
.print();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class WindowResult extends ProcessWindowFunction<Long,String,Boolean, TimeWindow> {
|
||||
|
||||
@Override
|
||||
public void process(Boolean aBoolean, Context context, Iterable<Long> iterable, Collector<String> collector) throws Exception {
|
||||
String windowStart = new Timestamp(context.window().getStart()).toString();
|
||||
String windowStop = new Timestamp(context.window().getEnd()).toString();
|
||||
Long count = iterable.iterator().next();
|
||||
|
||||
collector.collect("窗口"+windowStart+"~"+windowStop+"的独立访客的数量为:"+count);
|
||||
}
|
||||
}
|
||||
|
||||
//布隆过滤器实现去重
|
||||
public static class CountAgg implements AggregateFunction<UserBehavior, Tuple2<Long,BloomFilter<String>>,Long>{
|
||||
|
||||
|
||||
@Override
|
||||
public Tuple2<Long, BloomFilter<String>> createAccumulator() {
|
||||
//BloomFilter.create(输入数据类型,期望插入数据数量,误判率)
|
||||
return Tuple2.of(0L,BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8),100000,0.01));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<Long, BloomFilter<String>> add(UserBehavior userBehavior, Tuple2<Long, BloomFilter<String>> accumulator) {
|
||||
//如果布隆过滤器绝对不包含这个ID
|
||||
if(!accumulator.f1.mightContain(userBehavior.userId)){
|
||||
accumulator.f1.put(userBehavior.userId); //将对应位置置为一
|
||||
accumulator.f0 +=1L;
|
||||
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getResult(Tuple2<Long, BloomFilter<String>> accumulator) {
|
||||
return accumulator.f0;
|
||||
}
|
||||
|
||||
//merge只有在需要合并窗口(例如会话窗口)时才会被调用,这里使用滚动窗口,所以不做实现
@Override
public Tuple2<Long, BloomFilter<String>> merge(Tuple2<Long, BloomFilter<String>> longBloomFilterTuple2, Tuple2<Long, BloomFilter<String>> acc1) {
return null;
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
    /**
     * User behavior POJO
     */
|
||||
public static class UserBehavior{
|
||||
public String userId;
|
||||
public String itemId;
|
||||
public String categoryId;
|
||||
public String behavior;
|
||||
public Long timeStamp;
|
||||
|
||||
public UserBehavior(){
|
||||
|
||||
}
|
||||
|
||||
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
|
||||
this.userId = userId;
|
||||
this.itemId = itemId;
|
||||
this.categoryId = categoryId;
|
||||
this.behavior = behavior;
|
||||
this.timeStamp = timeStamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "UserBehavior{" +
|
||||
"userId='" + userId + '\'' +
|
||||
", itemId='" + itemId + '\'' +
|
||||
", categoryId='" + categoryId + '\'' +
|
||||
", behavior='" + behavior + '\'' +
|
||||
", timeStamp=" + new Timestamp(timeStamp) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
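The accumulator above relies on Guava's BloomFilter. As a minimal standalone sketch (assuming a Guava dependency is available on the classpath; not part of the commit), the calls used in CountAgg behave like this:

import com.google.common.base.Charsets;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomFilterSketch {
    public static void main(String[] args) {
        // expected 100,000 insertions, 1% false-positive probability
        BloomFilter<String> filter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 100000, 0.01);

        long count = 0L;
        for (String userId : new String[]{"u1", "u2", "u1"}) {
            // mightContain == false guarantees the id was never added, so the count
            // can only over-count by the false-positive rate, never under-count
            if (!filter.mightContain(userId)) {
                filter.put(userId);
                count++;
            }
        }
        System.out.println(count); // prints 2
    }
}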
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.cep.CEP;
|
||||
import org.apache.flink.cep.PatternSelectFunction;
|
||||
import org.apache.flink.cep.PatternStream;
|
||||
import org.apache.flink.cep.pattern.Pattern;
|
||||
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Detect three consecutive failed logins with Flink CEP
 */
|
||||
public class Example3 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
SingleOutputStreamOperator<Event> stream = env
|
||||
.fromElements(
|
||||
new Event("user-1", "fail", 1000L),
|
||||
new Event("user-1", "fail", 2000L),
|
||||
new Event("user-1", "fail", 3000L),
|
||||
new Event("user-2", "success", 3000L),
|
||||
new Event("user-1", "fail", 4000L)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event event, long l) {
|
||||
|
||||
return event.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
        //Define the pattern
        Pattern<Event, Event> pattern = Pattern
                .<Event>begin("first") //name the first matched event
|
||||
.where(new SimpleCondition<Event>() {
|
||||
@Override
|
||||
public boolean filter(Event event) throws Exception {
|
||||
|
||||
return event.eventType.equals("fail");
|
||||
|
||||
}
|
||||
})
|
||||
.next("second") //next表示严格紧邻
|
||||
.where(new SimpleCondition<Event>() {
|
||||
@Override
|
||||
public boolean filter(Event event) throws Exception {
|
||||
|
||||
return event.eventType.equals("fail");
|
||||
}
|
||||
})
|
||||
.next("third")
|
||||
.where(new SimpleCondition<Event>() {
|
||||
@Override
|
||||
public boolean filter(Event event) throws Exception {
|
||||
|
||||
return event.eventType.equals("fail");
|
||||
}
|
||||
});
|
||||
|
||||
        //Apply the pattern to the keyed stream to obtain a PatternStream
        PatternStream<Event> patternStream = CEP.pattern(stream.keyBy(r -> r.user), pattern);

        //Use select to extract the matched events
|
||||
patternStream
|
||||
.select(new PatternSelectFunction<Event, String>() {
|
||||
@Override
|
||||
public String select(Map<String, List<Event>> map) throws Exception {
|
||||
                        //the Map keys are the names given to the pattern steps
                        //each value is the list of events matched under that name
|
||||
Event first = map.get("first").get(0);
|
||||
Event second = map.get("second").get(0);
|
||||
Event third = map.get("third").get(0);
|
||||
String result = "用户:"+first.user+"在事件:"+first.timestamp+";"
|
||||
+second.timestamp+";"+third.timestamp+"登录失败了!";
|
||||
return result;
|
||||
}
|
||||
}).print();
|
||||
//用户:user-1在事件:1000;2000;3000登录失败了!
|
||||
//用户:user-1在事件:2000;3000;4000登录失败了!
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
    /**
     * Login event POJO
     */
|
||||
public static class Event{
|
||||
|
||||
public String user;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String user, String eventType, Long timestamp) {
|
||||
this.user = user;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + timestamp +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
}
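Example3 uses next(), which demands that the failures arrive back to back. A hedged variant (a sketch reusing the same Event POJO, not part of the original code): replacing next() with followedBy() relaxes the contiguity so unrelated events may sit between the failures.

        // followedBy() allows other events between the two fails;
        // followedByAny() would additionally allow already-matched events to be reused
        Pattern<Event, Event> relaxed = Pattern
                .<Event>begin("first")
                .where(new SimpleCondition<Event>() {
                    @Override
                    public boolean filter(Event event) throws Exception {
                        return event.eventType.equals("fail");
                    }
                })
                .followedBy("second")   // relaxed contiguity instead of next()
                .where(new SimpleCondition<Event>() {
                    @Override
                    public boolean filter(Event event) throws Exception {
                        return event.eventType.equals("fail");
                    }
                });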
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.cep.CEP;
|
||||
import org.apache.flink.cep.PatternSelectFunction;
|
||||
import org.apache.flink.cep.PatternStream;
|
||||
import org.apache.flink.cep.pattern.Pattern;
|
||||
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Detect three consecutive failed logins with Flink CEP, using the times() quantifier
 */
|
||||
public class Example4 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
SingleOutputStreamOperator<Event> stream = env
|
||||
.fromElements(
|
||||
new Event("user-1", "fail", 1000L),
|
||||
new Event("user-1", "fail", 2000L),
|
||||
new Event("user-1", "fail", 3000L),
|
||||
new Event("user-2", "success", 3000L),
|
||||
new Event("user-1", "fail", 4000L)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event event, long l) {
|
||||
|
||||
return event.timestamp;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
        //Define the pattern
        //Shorter version: one condition repeated three times with times(3)
        Pattern<Event, Event> pattern = Pattern
                .<Event>begin("fail") //name the matched events
|
||||
.where(new SimpleCondition<Event>() {
|
||||
@Override
|
||||
public boolean filter(Event event) throws Exception {
|
||||
|
||||
return event.eventType.equals("fail");
|
||||
|
||||
}
|
||||
})
|
||||
.times(3);
|
||||
|
||||
|
||||
//在流上匹配模板-获取到匹配到的流
|
||||
PatternStream<Event> patternStream = CEP.pattern(stream.keyBy(r -> r.user), pattern);
|
||||
|
||||
//使用select方法将匹配到的事件取出
|
||||
patternStream
|
||||
.select(new PatternSelectFunction<Event, String>() {
|
||||
@Override
|
||||
public String select(Map<String, List<Event>> map) throws Exception {
|
||||
//Map的key是给事件起的名字
|
||||
//列表是名字对应的事件所构成的列表
|
||||
Event first = map.get("fail").get(0);
|
||||
Event second = map.get("fail").get(1);
|
||||
Event third = map.get("fail").get(2);
|
||||
String result = "用户:"+first.user+"在事件:"+first.timestamp+";"
|
||||
+second.timestamp+";"+third.timestamp+"登录失败了!";
|
||||
return result;
|
||||
}
|
||||
}).print();
|
||||
//用户:user-1在事件:1000;2000;3000登录失败了!
|
||||
//用户:user-1在事件:2000;3000;4000登录失败了!
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
    /**
     * Login event POJO
     */
|
||||
public static class Event{
|
||||
|
||||
public String user;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String user, String eventType, Long timestamp) {
|
||||
this.user = user;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + timestamp +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
}
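Note that times(3) on its own uses relaxed contiguity, so other events could sit between the three failures; with this input every user-1 event is a failure, so the output matches Example3. A hedged sketch (an illustrative variant, not in the original commit) of forcing strict contiguity:

        // .consecutive() makes times(3) behave like the chained next() calls in Example3
        Pattern<Event, Event> strict = Pattern
                .<Event>begin("fail")
                .where(new SimpleCondition<Event>() {
                    @Override
                    public boolean filter(Event event) throws Exception {
                        return event.eventType.equals("fail");
                    }
                })
                .times(3)
                .consecutive();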
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.common.state.ValueState;
|
||||
import org.apache.flink.api.common.state.ValueStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.Types;
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
|
||||
import org.apache.flink.util.Collector;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Detect three consecutive failed logins with an explicit finite state machine
 */
|
||||
public class Example5 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<Event> stream = env
|
||||
.fromElements(
|
||||
new Event("user-1", "fail", 1000L),
|
||||
new Event("user-1", "fail", 2000L),
|
||||
new Event("user-1", "fail", 3000L),
|
||||
new Event("user-2", "success", 3000L),
|
||||
new Event("user-1", "fail", 4000L)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Event>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
|
||||
@Override
|
||||
public long extractTimestamp(Event event, long l) {
|
||||
|
||||
return event.timestamp;
|
||||
}
|
||||
}));
|
||||
|
||||
stream
|
||||
.keyBy(r -> r.user)
|
||||
.process(new KeyedProcessFunction<String, Event, String>() {
|
||||
|
||||
                    //finite state machine: the transition table
                    private HashMap<Tuple2<String,String>,String> stateMachine = new HashMap<>();
                    //keyed state holding the current state for each user
                    private ValueState<String> currentState;
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws Exception {
|
||||
|
||||
                        //state transition table
                        //key   : Tuple2<current state, incoming event type>
                        //value : next state
|
||||
stateMachine.put(Tuple2.of("INITIAL","success"),"SUCCESS");
|
||||
stateMachine.put(Tuple2.of("INITIAL","fail"),"S1");
|
||||
stateMachine.put(Tuple2.of("S1","fail"),"S2");
|
||||
stateMachine.put(Tuple2.of("S2","fail"),"FAIL");
|
||||
stateMachine.put(Tuple2.of("S1","success"),"SUCCESS");
|
||||
stateMachine.put(Tuple2.of("S2","success"),"SUCCESS");
|
||||
|
||||
currentState=getRuntimeContext().getState(
|
||||
new ValueStateDescriptor<String>("current-State", Types.STRING)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(Event value, Context context, Collector<String> collector) throws Exception {
|
||||
|
||||
if(currentState.value() == null){
|
||||
currentState.update("INITIAL");
|
||||
}
|
||||
|
||||
                        //look up the state to transition to; assumes eventType is always "success" or "fail", otherwise nextState would be null
                        String nextState = stateMachine.get(Tuple2.of(currentState.value(), value.eventType));
|
||||
|
||||
if(nextState.equals("FAIL")){
|
||||
collector.collect("用户"+value.user+"连续三次登陆失败了");
|
||||
currentState.update("S2");
|
||||
}else if(nextState.equals("SUCCESS")){
|
||||
currentState.clear();
|
||||
|
||||
}else {
|
||||
currentState.update(nextState);
|
||||
}
|
||||
|
||||
}
|
||||
})
|
||||
.print();
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
    /**
     * Login event POJO
     */
|
||||
public static class Event{
|
||||
|
||||
public String user;
|
||||
public String eventType;
|
||||
public Long timestamp;
|
||||
|
||||
public Event() {
|
||||
}
|
||||
|
||||
public Event(String user, String eventType, Long timestamp) {
|
||||
this.user = user;
|
||||
this.eventType = eventType;
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Event{" +
|
||||
"user='" + user + '\'' +
|
||||
", eventType='" + eventType + '\'' +
|
||||
", timestamp=" + timestamp +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
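Because the transition table is a plain HashMap, it can be exercised outside Flink. A minimal standalone sketch (hypothetical test code, assuming only Flink's Tuple2 on the classpath) that walks the table over a sequence of event types:

import org.apache.flink.api.java.tuple.Tuple2;
import java.util.HashMap;

public class StateMachineSketch {
    public static void main(String[] args) {
        HashMap<Tuple2<String, String>, String> stateMachine = new HashMap<>();
        stateMachine.put(Tuple2.of("INITIAL", "success"), "SUCCESS");
        stateMachine.put(Tuple2.of("INITIAL", "fail"), "S1");
        stateMachine.put(Tuple2.of("S1", "fail"), "S2");
        stateMachine.put(Tuple2.of("S2", "fail"), "FAIL");
        stateMachine.put(Tuple2.of("S1", "success"), "SUCCESS");
        stateMachine.put(Tuple2.of("S2", "success"), "SUCCESS");

        // Tuple2 implements equals/hashCode, so it works as a HashMap key
        String state = "INITIAL";
        for (String eventType : new String[]{"fail", "fail", "fail"}) {
            state = stateMachine.get(Tuple2.of(state, eventType));
        }
        System.out.println(state); // prints FAIL after three consecutive failures
    }
}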
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
package day08;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.cep.CEP;
|
||||
import org.apache.flink.cep.PatternFlatSelectFunction;
|
||||
import org.apache.flink.cep.PatternFlatTimeoutFunction;
|
||||
import org.apache.flink.cep.PatternStream;
|
||||
import org.apache.flink.cep.pattern.Pattern;
|
||||
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.windowing.time.Time;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.apache.flink.util.OutputTag;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
 * Order payment timeout detection
 */
|
||||
public class Example6 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
SingleOutputStreamOperator<OrderEvent> stream = env
|
||||
.fromElements(
|
||||
new OrderEvent("order-1", "create", 1000L),
|
||||
new OrderEvent("order-2", "create", 2000L),
|
||||
new OrderEvent("order-1", "pay", 3000L)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<OrderEvent>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<OrderEvent>() {
|
||||
@Override
|
||||
public long extractTimestamp(OrderEvent orderEvent, long l) {
|
||||
|
||||
return orderEvent.eventTime;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Pattern<OrderEvent, OrderEvent> pattern = Pattern
|
||||
.<OrderEvent>begin("create")
|
||||
.where(new SimpleCondition<OrderEvent>() {
|
||||
@Override
|
||||
public boolean filter(OrderEvent orderEvent) throws Exception {
|
||||
|
||||
return orderEvent.eventType.equals("create");
|
||||
}
|
||||
})
|
||||
.next("pay")
|
||||
.where(new SimpleCondition<OrderEvent>() {
|
||||
@Override
|
||||
public boolean filter(OrderEvent orderEvent) throws Exception {
|
||||
|
||||
return orderEvent.eventType.equals("pay");
|
||||
}
|
||||
})
|
||||
                .within(Time.seconds(5));//the create and pay events must both occur within 5 seconds
|
||||
|
||||
PatternStream<OrderEvent> patternStream = CEP.pattern(stream.keyBy(r -> r.orderId), pattern);
|
||||
|
||||
        //emit successfully matched (paid) orders on the main stream and timed-out (unpaid) orders on a side output
|
||||
SingleOutputStreamOperator<String> result = patternStream
|
||||
.flatSelect(
|
||||
new OutputTag<String>("timeout") {
|
||||
                        }, //timed-out matches are sent to this side output
|
||||
new PatternFlatTimeoutFunction<OrderEvent, String>() {
|
||||
@Override
|
||||
public void timeout(Map<String, List<OrderEvent>> map, long l, Collector<String> collector) throws Exception {
|
||||
                            //handles partial matches that timed out (a create without a pay within 5 seconds)
                            OrderEvent create = map.get("create").get(0);
                            //emit the message to the side output
                            collector.collect("订单:" + create.orderId + "超时了");
|
||||
}
|
||||
},
|
||||
new PatternFlatSelectFunction<OrderEvent, String>() {
|
||||
@Override
|
||||
public void flatSelect(Map<String, List<OrderEvent>> map, Collector<String> collector) throws Exception {
|
||||
OrderEvent pay = map.get("pay").get(0);
|
||||
collector.collect("订单:" + pay.orderId + "已支付");
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
result.print("主输出流:");
|
||||
result.getSideOutput(new OutputTag<String>("timeout"){}).print("侧输出流:");
|
||||
|
||||
//主输出流:> 订单:order-1已支付
|
||||
//侧输出流:> 订单:order-2超时了
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static class OrderEvent {
|
||||
|
||||
public String orderId;
|
||||
public String eventType;
|
||||
public Long eventTime;
|
||||
|
||||
public OrderEvent() {
|
||||
}
|
||||
|
||||
public OrderEvent(String orderId, String eventType, Long eventTime) {
|
||||
this.orderId = orderId;
|
||||
this.eventType = eventType;
|
||||
this.eventTime = eventTime;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OrderEvent{" +
|
||||
"orderId='" + orderId + '\'' +
|
||||
", EventType='" + eventType + '\'' +
|
||||
", eventTime=" + new Timestamp(eventTime) +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
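The OutputTag is constructed twice above, once for flatSelect and once for getSideOutput; this appears to work because the side output is matched on the tag's id and type, but a single shared constant avoids the duplication. A small sketch of that design choice (an illustrative variant, not the original code):

    // define the tag once and reuse it for both flatSelect(...) and getSideOutput(...)
    private static final OutputTag<String> TIMEOUT_TAG = new OutputTag<String>("timeout") {};

    // inside main():
    // SingleOutputStreamOperator<String> result = patternStream.flatSelect(TIMEOUT_TAG, timeoutFn, selectFn);
    // result.getSideOutput(TIMEOUT_TAG).print("侧输出流:");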
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
package day08;
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.java.tuple.Tuple3;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.table.api.EnvironmentSettings;
|
||||
import org.apache.flink.table.api.Table;
|
||||
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
|
||||
|
||||
import static org.apache.flink.table.api.Expressions.$;
|
||||
|
||||
/**
 * Convert a DataStream into a dynamic table
 */
|
||||
public class Example7 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
//TODO 设置数据源和水位线
|
||||
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = env
|
||||
.fromElements(
|
||||
Tuple3.of("Mary", "./home", 12 * 60 * 60 * 1000L),
|
||||
Tuple3.of("Bob", "./cart", 12 * 60 * 60 * 1000L),
|
||||
Tuple3.of("Mary", "./prod?id=1", 12 * 60 * 60 * 1000L + 5 * 1000L),
|
||||
Tuple3.of("liz", "./home", 12 * 60 * 60 * 1000L + 60 * 100L),
|
||||
Tuple3.of("Bob", "./prod?id=3", 12 * 60 * 60 * 1000L + 90 * 1000L),
|
||||
Tuple3.of("Mary", "./prod?id=7", 12 * 60 * 60 * 1000L + 105 * 1000l)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple3<String, String, Long> stringStringLongTuple3, long l) {
|
||||
|
||||
return stringStringLongTuple3.f2;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
//TODO 创建表环境
|
||||
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
|
||||
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
|
||||
|
||||
|
||||
//TODO 数据流转化为动态表 -会随着流的到来而不断增大
|
||||
Table table = tableEnvironment
|
||||
.fromDataStream(
|
||||
stream,
|
||||
$("f0").as("user"),
|
||||
$("f1").as("url"),
|
||||
                        //the rowtime() call marks f2 as the event-time attribute
|
||||
$("f2").rowtime().as("cTime")
|
||||
);
|
||||
|
||||
        //To print the result, convert the dynamic table back into a DataStream
        //TODO dynamic table -> data stream
        tableEnvironment.toDataStream(table).print();
|
||||
/*
|
||||
+I[Mary, ./home, 1970-01-01 12:00:00.0]
|
||||
+I[Bob, ./cart, 1970-01-01 12:00:00.0]
|
||||
+I[Mary, ./prod?id=1, 1970-01-01 12:00:05.0]
|
||||
+I[liz, ./home, 1970-01-01 12:00:06.0]
|
||||
+I[Bob, ./prod?id=3, 1970-01-01 12:01:30.0]
|
||||
+I[Mary, ./prod?id=7, 1970-01-01 12:01:45.0]
|
||||
*/
|
||||
|
||||
env.execute();
|
||||
}
|
||||
|
||||
|
||||
}
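To confirm that f2 was indeed registered as an event-time (rowtime) attribute, the table schema can be printed; a small hedged addition, not in the original code:

        // prints the schema, where cTime should appear as TIMESTAMP(3) *ROWTIME*
        table.printSchema();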
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.java.tuple.Tuple2;
|
||||
import org.apache.flink.streaming.api.datastream.DataStreamSource;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.table.api.EnvironmentSettings;
|
||||
import org.apache.flink.table.api.Table;
|
||||
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
|
||||
|
||||
import static org.apache.flink.table.api.Expressions.$;
|
||||
|
||||
/**
 * Continuous query over a dynamic table with Flink SQL
 */
|
||||
public class Example8 {
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
//创建数据流
|
||||
DataStreamSource<Tuple2<String, String>> stream = env
|
||||
.fromElements(
|
||||
Tuple2.of("Mary", "./home"),
|
||||
Tuple2.of("Bob", "./cart"),
|
||||
Tuple2.of("Mary", "./prod?id=1"),
|
||||
Tuple2.of("liz", "./home")
|
||||
);
|
||||
|
||||
//创建表环境
|
||||
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
|
||||
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
|
||||
|
||||
|
||||
//创建动态表
|
||||
Table table = tableEnvironment
|
||||
.fromDataStream(
|
||||
stream,
|
||||
$("f0").as("user"),
|
||||
$("f1").as("url")
|
||||
);
|
||||
|
||||
//注册临时视图
|
||||
tableEnvironment.createTemporaryView("clicks",table);
|
||||
|
||||
//sql查询
|
||||
Table result = tableEnvironment
|
||||
.sqlQuery(
|
||||
"select user,COUNT(url) as cnt FROM clicks GROUP BY user"
|
||||
);
|
||||
|
||||
        //convert the query result back into a data stream
        //as a changelog stream (needed because the query contains an aggregation)
        tableEnvironment.toChangelogStream(result).print();
|
||||
|
||||
/*
|
||||
+I[Mary, 1]
|
||||
+I[Bob, 1]
|
||||
        -U[Mary, 1] //the earlier result [Mary, 1] was already emitted, so -U retracts it downstream (a logical delete)
        +U[Mary, 2] //+U then emits the updated result [Mary, 2]
|
||||
+I[liz, 1]
|
||||
|
||||
|
||||
|
||||
*/
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
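The same continuous aggregation can also be written with the Table API instead of SQL; a minimal sketch, assuming the surrounding environment is unchanged:

        // Table API equivalent of the GROUP BY query; it also produces an update (changelog) stream
        Table resultTableApi = table
                .groupBy($("user"))
                .select($("user"), $("url").count().as("cnt"));

        tableEnvironment.toChangelogStream(resultTableApi).print();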
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
package day08;
|
||||
|
||||
|
||||
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
|
||||
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
|
||||
import org.apache.flink.api.java.tuple.Tuple3;
|
||||
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.table.api.EnvironmentSettings;
|
||||
import org.apache.flink.table.api.Table;
|
||||
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
|
||||
|
||||
import static org.apache.flink.table.api.Expressions.$;
|
||||
|
||||
/**
 * Group-window aggregation in Flink SQL
 */
|
||||
public class Example9 {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
env.setParallelism(1);
|
||||
|
||||
|
||||
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = env
|
||||
.fromElements(
|
||||
Tuple3.of("Mary", "./home", 12 * 60 * 60 * 1000L),
|
||||
Tuple3.of("Bob", "./cart", 12 * 60 * 60 * 1000L),
|
||||
Tuple3.of("Mary", "./prod?id=1", 12 * 60 * 60 * 1000L + 2 * 60 * 1000L),
|
||||
Tuple3.of("Mary", "./prod?id=4", 12 * 60 * 60 * 1000L + 55 * 60 * 1000L),
|
||||
Tuple3.of("Bob", "./prod?id=5", 13 * 60 * 60 * 1000L + 60 * 1000L),
|
||||
Tuple3.of("liz", "./home", 13 * 60 * 60 * 1000L + 30 * 60 * 100L),
|
||||
Tuple3.of("liz", "./prod?id=7", 13 * 60 * 60 * 1000L + 59 * 60 * 100L),
|
||||
Tuple3.of("Mary", "./cart", 14 * 60 * 60 * 1000L),
|
||||
Tuple3.of("liz", "./home", 14 * 60 * 60 * 1000L + 2 * 60 * 1000L),
|
||||
Tuple3.of("Bob", "./prod?id=3", 14 * 60 * 60 * 1000L + 30 * 60 * 1000L),
|
||||
Tuple3.of("Bob", "./home", 14 * 60 * 60 * 1000L + 40 * 60 * 1000l)
|
||||
)
|
||||
.assignTimestampsAndWatermarks(
|
||||
WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
|
||||
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
|
||||
@Override
|
||||
public long extractTimestamp(Tuple3<String, String, Long> stringStringLongTuple3, long l) {
|
||||
|
||||
return stringStringLongTuple3.f2;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
//TODO 创建表环境
|
||||
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
|
||||
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
|
||||
|
||||
|
||||
//TODO 数据流转化为动态表 -会随着流的到来而不断增大
|
||||
Table table = tableEnvironment
|
||||
.fromDataStream(
|
||||
stream,
|
||||
$("f0").as("user"),
|
||||
$("f1").as("url"),
|
||||
                        //the rowtime() call marks f2 as the event-time attribute
|
||||
$("f2").rowtime().as("cTime")
|
||||
);
|
||||
|
||||
tableEnvironment.createTemporaryView("clicks",table);
|
||||
|
||||
        //group by a one-hour tumbling window over the event-time attribute cTime
|
||||
Table result = tableEnvironment
|
||||
.sqlQuery(
|
||||
"select user ,COUNT(url) as cnt ,TUMBLE_END(cTime,INTERVAL '1' HOUR) AS endT " +
|
||||
"from clicks group by user,TUMBLE(cTime,INTERVAL '1' HOUR)"
|
||||
);
|
||||
|
||||
tableEnvironment.toChangelogStream(result).print();
|
||||
|
||||
/*
|
||||
+I[Mary, 3, 1970-01-01T13:00]
|
||||
+I[Bob, 1, 1970-01-01T13:00]
|
||||
+I[liz, 2, 1970-01-01T14:00]
|
||||
+I[Bob, 1, 1970-01-01T14:00]
|
||||
+I[Bob, 2, 1970-01-01T15:00]
|
||||
+I[Mary, 1, 1970-01-01T15:00]
|
||||
+I[liz, 1, 1970-01-01T15:00]
|
||||
|
||||
|
||||
*/
|
||||
|
||||
|
||||
env.execute();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
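For comparison, the same per-user count over a sliding window can be expressed with HOP; a hedged SQL sketch (one-hour windows sliding every 30 minutes, not part of the original commit):

        Table hopResult = tableEnvironment.sqlQuery(
                "SELECT user, COUNT(url) AS cnt, HOP_END(cTime, INTERVAL '30' MINUTE, INTERVAL '1' HOUR) AS endT " +
                "FROM clicks GROUP BY user, HOP(cTime, INTERVAL '30' MINUTE, INTERVAL '1' HOUR)"
        );
        tableEnvironment.toChangelogStream(hopResult).print();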
|
||||
File diff suppressed because it is too large
Binary file not shown.
|
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>com.atguigu.hive</groupId>
|
||||
<artifactId>Gulivideo</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<version>3.1.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package com.atguigu.gulivideo.etl;
|
||||
|
||||
/**
 * Utility class holding the data-cleaning (ETL) methods
 */
|
||||
public class ETLUtils {
|
||||
|
||||
|
||||
    /**
     * Clean one line of video data
     *
     * Rules:
     * 1. A record must contain at least 9 fields
     * 2. Remove the spaces inside the category field
     * 3. Join the related-video ids with '&'
     *
     * @param line raw input line
     * @return the cleaned line if the record is valid,
     *         null if the record is invalid
     *
     * Sample record:
     * RX24KLBhwMI lemonette 697 People & Blogs 512 24149 4.22 315 474 t60tW0WevkE WZgoejVDZlo Xa_op4MhSkg MwynZ8qTwXA sfG2rtAkAcg j72VLPwzd_c 24Qfs69Al3U EGWutOjVx4M KVkseZR5coU R6OaRcsfnY4 dGM3k_4cNhE ai-cSq6APLQ 73M0y-iD9WE 3uKOSjE79YA 9BBu5N0iFBg 7f9zwx52xgA ncEV0tSC7xM H-J8Kbx9o68 s8xf4QX1UvA 2cKd9ERh5-8
     */
|
||||
public static String etlGulivideoData(String line){
|
||||
StringBuffer sbs =new StringBuffer();
|
||||
        //1. Split the record on tabs
        String[] splits = line.split("\t");
        //2. Rule 1: drop records with fewer than 9 fields
        if(splits.length<9){
            return null;
        }
        //3. Rule 2: remove spaces inside the category field
        splits[3] = splits[3].replaceAll(" ", "");
|
||||
|
||||
        //4. Rule 3: rebuild the record, joining related-video ids with '&'
        for (int i = 0; i < splits.length; i++) {
            //fields 0-8 are separated by '\t'; related-video ids (index 9 and above) are separated by '&'
|
||||
if(i<=8){
|
||||
if(i==splits.length-1){
|
||||
sbs.append(splits[i]);
|
||||
}else {
|
||||
sbs.append(splits[i]).append("\t");
|
||||
}
|
||||
}else {
|
||||
if(i==splits.length-1){
|
||||
sbs.append(splits[i]);
|
||||
}else {
|
||||
sbs.append(splits[i]).append("&");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return sbs.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String line = "RX24KLBhwMI\tlemonette\t697\tPeople & Blogs\t512\t24149\t4.22\t315";
|
||||
String result=etlGulivideoData(line);
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
}
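The existing main() only covers a record without related videos. A small additional check (hypothetical test input, not part of the commit) showing rule 3, where the related-video ids get joined with '&':

        // hypothetical extra check for rule 3 (could be added to main)
        String withRelated = "RX24KLBhwMI\tlemonette\t697\tPeople & Blogs\t512\t24149\t4.22\t315\t474\tAAA\tBBB";
        // prints ...\t474\tAAA&BBB : the two related-video ids are joined with '&'
        System.out.println(ETLUtils.etlGulivideoData(withRelated));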
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
package com.atguigu.gulivideo.etl;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.NullWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class GulivideoETLDriver {
|
||||
|
||||
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
|
||||
|
||||
Configuration conf =new Configuration();
|
||||
Job job =Job.getInstance(conf);
|
||||
|
||||
job.setJarByClass(GulivideoETLDriver.class);
|
||||
job.setMapperClass(GulivideoETLMapper.class);
|
||||
job.setMapOutputKeyClass(Text.class);
|
||||
job.setMapOutputValueClass(NullWritable.class);
|
||||
job.setOutputKeyClass(Text.class);
|
||||
job.setOutputValueClass(NullWritable.class);
|
||||
        //set the number of reduce tasks to 0 so the job is map-only and the mapper output is written directly
|
||||
job.setNumReduceTasks(0);
|
||||
|
||||
FileInputFormat.setInputPaths(job,new Path(args[0]));
|
||||
FileOutputFormat.setOutputPath(job,new Path(args[1]));
|
||||
job.waitForCompletion(true);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
package com.atguigu.gulivideo.etl;
|
||||
|
||||
import org.apache.hadoop.io.LongWritable;
|
||||
import org.apache.hadoop.io.NullWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class GulivideoETLMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
|
||||
|
||||
Text outk= new Text();
|
||||
@Override
|
||||
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
|
||||
|
||||
String line = value.toString();
|
||||
String result = ETLUtils.etlGulivideoData(line);
|
||||
if(result==null){
|
||||
return ;
|
||||
}
|
||||
outk.set(result);
|
||||
context.write(outk,NullWritable.get());
|
||||
}
|
||||
}
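To see how many records the ETL drops, Hadoop counters can be added to the mapper; a hedged sketch (the counter group and names are illustrative, not part of the commit):

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String result = ETLUtils.etlGulivideoData(value.toString());
        if (result == null) {
            // track invalid records in the job counters (visible in the job history UI)
            context.getCounter("ETL", "bad-records").increment(1);
            return;
        }
        context.getCounter("ETL", "good-records").increment(1);
        outk.set(result);
        context.write(outk, NullWritable.get());
    }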
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>com.atguigu.hbase</groupId>
|
||||
<artifactId>HBase</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hbase</groupId>
|
||||
<artifactId>hbase-server</artifactId>
|
||||
<version>2.0.5</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hbase</groupId>
|
||||
<artifactId>hbase-client</artifactId>
|
||||
<version>2.0.5</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,278 @@
|
|||
package com.atguigu.hbase;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.*;
|
||||
import org.apache.hadoop.hbase.client.*;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * IDE tips: Ctrl+P shows the expected parameters, Ctrl+O shows member methods, Ctrl+H shows the class hierarchy
 */
|
||||
|
||||
/**
 * Connection : obtained from ConnectionFactory; heavyweight, so open it once and reuse it
 * Table      : handles DML operations; lightweight, open per use and close afterwards
 * Admin      : handles DDL operations; lightweight, open per use and close afterwards
 */
|
||||
public class HBaseDemo {
|
||||
|
||||
private static Connection connection;
|
||||
|
||||
static {
|
||||
        //create a Hadoop Configuration via HBaseConfiguration.create()
        Configuration conf = HBaseConfiguration.create();
        //point the client at the ZooKeeper quorum used by HBase (the same value configured in hbase-site.xml)
        conf.set("hbase.zookeeper.quorum","Ding202,Ding203,Ding204");
|
||||
try {
|
||||
connection= ConnectionFactory.createConnection(conf);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
//createTable("","t1","info1","info2");
|
||||
// dropTable("","t1");
|
||||
//putData("","stu","1003","info","name","wangwu");
|
||||
//deleteData("","stu","1003","info","name");
|
||||
//getData("","stu","1001","info","name");
|
||||
//scanData("","stu","1001","1003");
|
||||
createTableWithRegions("","staff4","info");
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断表是否存在
|
||||
*/
|
||||
public static boolean existTable(String nameSpaceName,String tableName) throws IOException {
|
||||
Admin admin=connection.getAdmin();
|
||||
return admin.tableExists(TableName.valueOf(nameSpaceName,tableName));
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建table,带预分区
|
||||
*/
|
||||
public static void createTableWithRegions(String nameSpaceName,String tableName,String ... cfs) throws IOException {
|
||||
|
||||
if(existTable(nameSpaceName,tableName)){
|
||||
System.err.println((nameSpaceName == null ||nameSpaceName.equals("")? "default" : nameSpaceName)+":"+tableName+"已经存在");
|
||||
return;
|
||||
}
|
||||
Admin admin = connection.getAdmin();
|
||||
        //if unsure of the API, build the descriptors up step by step from the builders
|
||||
TableDescriptorBuilder tableDescriptorBuilder =
|
||||
TableDescriptorBuilder.newBuilder(TableName.valueOf(nameSpaceName,tableName));
|
||||
|
||||
if(cfs == null || cfs.length < 1){
|
||||
System.err.println("至少指定一个列组");
|
||||
return;
|
||||
}
|
||||
for (String cf : cfs) {
|
||||
ColumnFamilyDescriptorBuilder columnFamilyDescriptorBuilder = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(cf));
|
||||
ColumnFamilyDescriptor columnFamilyDescriptor = columnFamilyDescriptorBuilder.build();
|
||||
//setColumnFamily需要传入一个ColumnFamilyDescriptor对象
|
||||
tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
|
||||
}
|
||||
TableDescriptor tableDescriptor = tableDescriptorBuilder.build();
|
||||
        //createTable needs a TableDescriptor plus the split keys
        //the split keys are passed as a two-dimensional byte array: one byte[] per split point
        byte[][] splitkeys = new byte[4][];
|
||||
splitkeys[0]=Bytes.toBytes("1000");
|
||||
splitkeys[1]=Bytes.toBytes("2000");
|
||||
splitkeys[2]=Bytes.toBytes("3000");
|
||||
splitkeys[3]=Bytes.toBytes("4000");
|
||||
admin.createTable(tableDescriptor,splitkeys);
|
||||
admin.close();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* scan
|
||||
*/
|
||||
public static void scanData(String nameSpaceName,String tableName,String startRow,String stopRow) throws IOException {
|
||||
|
||||
Table table = connection.getTable(TableName.valueOf(nameSpaceName, tableName));
|
||||
|
||||
        Scan scan = new Scan();
        //configure the row range before asking the table for a scanner
        //scan.withStartRow(Bytes.toBytes(startRow));
        //scan.withStopRow(Bytes.toBytes(stopRow));
        //the two calls above can be chained:
        scan.withStartRow(Bytes.toBytes(startRow)).withStopRow(Bytes.toBytes(stopRow));
        //getScanner must be called after the range is set, otherwise the range is ignored
        ResultScanner scanner = table.getScanner(scan);
|
||||
|
||||
|
||||
for (Result result : scanner) {
|
||||
//一个result就是一条数据
|
||||
//通过每一个数据在获取他的cells
|
||||
//cells中相当于数据中每一个字段的值
|
||||
Cell[] cells = result.rawCells();
|
||||
for (Cell cell : cells) {
|
||||
String cellString = Bytes.toString(CellUtil.cloneRow(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneFamily(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneQualifier(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneValue(cell));
|
||||
System.out.println(cellString);
|
||||
}
|
||||
System.out.println("--------------------------------------------");
|
||||
|
||||
}
|
||||
|
||||
table.close();
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get
|
||||
*/
|
||||
public static void getData(String nameSpaceName,String tableName,String rowkey,String cf,String cl) throws IOException {
|
||||
|
||||
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
|
||||
|
||||
Get get = new Get(Bytes.toBytes(rowkey));
|
||||
        //get.addFamily(Bytes.toBytes(cf)); //restrict the result to one column family
        get.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl)); //restrict the result to one column; with neither call the whole row is returned
|
||||
//需要传入一个get对象
|
||||
Result result = table.get(get);
|
||||
Cell[] cells = result.rawCells();
|
||||
for (Cell cell : cells) {
|
||||
String cellString = Bytes.toString(CellUtil.cloneRow(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneFamily(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneQualifier(cell))+":"+
|
||||
Bytes.toString(CellUtil.cloneValue(cell));
|
||||
System.out.println(cellString);
|
||||
|
||||
}
|
||||
table.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* delete
|
||||
*/
|
||||
public static void deleteData(String nameSpaceName,String tableName,String rowkey,String cf,String cl ) throws IOException {
|
||||
|
||||
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
|
||||
|
||||
        Delete delete = new Delete(Bytes.toBytes(rowkey)); //with only the rowkey, the whole row is deleted

        //delete.addFamily(Bytes.toBytes(cf)); //delete one column family, cell type: DeleteFamily
        //delete.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl)); //delete only the latest version, cell type: Delete
        delete.addColumns(Bytes.toBytes(cf),Bytes.toBytes(cl)); //delete all versions of the column, cell type: DeleteColumn
|
||||
//需要传入一个delete对象
|
||||
table.delete(delete);
|
||||
|
||||
table.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 添加或修改数据
|
||||
* put
|
||||
*/
|
||||
public static void putData(String nameSpaceName,String tableName,String rowkey,String cf,String cl ,String value) throws IOException {
|
||||
|
||||
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
|
||||
|
||||
Put put = new Put(Bytes.toBytes(rowkey));
|
||||
put.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl),Bytes.toBytes(value));
|
||||
//需要准备一个put对象
|
||||
table.put(put);
|
||||
table.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 删除表
|
||||
*/
|
||||
public static void dropTable(String nameSpaceName,String tableName) throws IOException {
|
||||
|
||||
        if(!existTable(nameSpaceName,tableName)){
            System.err.println("表不存在");
            return;
        }
|
||||
|
||||
Admin admin = connection.getAdmin();
|
||||
TableName tn = TableName.valueOf(nameSpaceName, tableName);
|
||||
admin.disableTable(tn);
|
||||
admin.deleteTable(tn);
|
||||
admin.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建table
|
||||
*/
|
||||
public static void createTable(String nameSpaceName,String tableName,String ... cfs) throws IOException {
|
||||
|
||||
if(existTable(nameSpaceName,tableName)){
|
||||
System.err.println((nameSpaceName == null ||nameSpaceName.equals("")? "default" : nameSpaceName)+":"+tableName+"已经存在");
|
||||
return;
|
||||
}
|
||||
Admin admin = connection.getAdmin();
|
||||
        //if unsure of the API, build the descriptors up step by step from the builders
|
||||
TableDescriptorBuilder tableDescriptorBuilder =
|
||||
TableDescriptorBuilder.newBuilder(TableName.valueOf(nameSpaceName,tableName));
|
||||
|
||||
if(cfs == null || cfs.length < 1){
|
||||
System.err.println("至少指定一个列组");
|
||||
return;
|
||||
}
|
||||
for (String cf : cfs) {
|
||||
ColumnFamilyDescriptorBuilder columnFamilyDescriptorBuilder = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(cf));
|
||||
ColumnFamilyDescriptor columnFamilyDescriptor = columnFamilyDescriptorBuilder.build();
|
||||
//setColumnFamily需要传入一个ColumnFamilyDescriptor对象
|
||||
tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
|
||||
}
|
||||
TableDescriptor tableDescriptor = tableDescriptorBuilder.build();
|
||||
//创建表,需要传入一个TableDescriptor对象
|
||||
admin.createTable(tableDescriptor);
|
||||
admin.close();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* 创建NameSpace
|
||||
*/
|
||||
public static void createNameSpace(String nameSpace) throws IOException {
|
||||
        //1. Basic null/empty check
        if(nameSpace==null||nameSpace.equals("")){
            System.err.println("nameSpace名字不能为空");
            return;
        }
|
||||
//2.获取Admin对象
|
||||
Admin admin = connection.getAdmin();
|
||||
//查看源码发现NamespaceDescriptor需要使用以下方式获取
|
||||
NamespaceDescriptor.Builder builder = NamespaceDescriptor.create(nameSpace);
|
||||
NamespaceDescriptor namespaceDescriptor = builder.build();
|
||||
try{
|
||||
//调用方法,需要传入一个namespaceDescriptor对象
|
||||
admin.createNamespace(namespaceDescriptor);
|
||||
System.out.println(nameSpace+"创建成功");
|
||||
}catch (NamespaceExistException e){
|
||||
System.err.println(nameSpace+"已存在");
|
||||
}finally {
|
||||
admin.close();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
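Since the shared Connection is created in a static block and never closed, one option is to register a JVM shutdown hook; a minimal hedged sketch (could be appended to the static block, not part of the commit):

        // close the heavyweight Connection once when the JVM exits
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            try {
                if (connection != null) {
                    connection.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }));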
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>com.atguigu.hdfs</groupId>
|
||||
<artifactId>HdfsClient</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j-impl</artifactId>
|
||||
<version>2.12.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<version>3.1.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
package com.atguigu.hdfs;
|
||||
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
||||
/**
|
||||
* 1.和HDFS建立连接
|
||||
* 2.调用API完成具体功能
|
||||
* 3.关闭连接
|
||||
*/
|
||||
|
||||
public class HdfsClientTest {
|
||||
|
||||
|
||||
private FileSystem fs;
|
||||
|
||||
|
||||
    /**
     * Upload a file
     * Configuration priority: in-code Configuration > hdfs-site.xml > hdfs-default.xml
     * 1st parameter delSrc    whether to delete the local source file after uploading
     * 2nd parameter overwrite whether to overwrite a file with the same name at the destination
     * 3rd parameter src       local path of the file to upload
     * 4th parameter dst       destination path on HDFS
     */
|
||||
@Test
|
||||
public void testCopyFromLocal() throws IOException {
|
||||
|
||||
fs.copyFromLocalFile(false,true,
|
||||
new Path("E:\\尚硅谷 大数据\\2021年大数据\\07.Hadoop\\01.笔记\\hello.txt"),
|
||||
new Path("/client_test"));
|
||||
}
|
||||
|
||||
    /**
     * Download a file
     * 1st parameter delSrc                whether to delete the source file on HDFS after downloading
     * 2nd parameter src                   path of the file on HDFS
     * 3rd parameter dst                   local destination path
     * 4th parameter useRawLocalFileSystem if true, skip writing the local .crc checksum file used to detect corruption
     */
|
||||
@Test
|
||||
public void testcopyToLOCAL() throws IOException {
|
||||
|
||||
fs.copyToLocalFile(false,
|
||||
new Path("/client_test/hello.txt"),
|
||||
new Path("E:\\尚硅谷 大数据\\2021年大数据\\07.Hadoop\\02.资料"),
|
||||
true);
|
||||
}
|
||||
|
||||
    /**
     * Delete a file or directory
     * 1st parameter path      path on HDFS to delete
     * 2nd parameter recursive whether to delete recursively, i.e. delete everything when the directory is not empty
     */
|
||||
@Test
|
||||
public void testDelete() throws IOException {
|
||||
|
||||
fs.delete(new Path("/client_test/hello.txt"),
|
||||
true);
|
||||
}
|
||||
|
||||
    /**
     * Rename or move a file
     * 1st parameter src source path on HDFS
     * 2nd parameter dst destination path on HDFS
     */
|
||||
@Test
|
||||
public void testRename() throws IOException {
|
||||
|
||||
//移动文件
|
||||
// fs.rename(new Path("/sanguo/zhangfei.txt"),new Path("/client_test/"));
|
||||
|
||||
//更名文件
|
||||
fs.rename(new Path("/client_test/zhangfei.txt"),
|
||||
new Path("/client_test/sunshangxaing.txt"));
|
||||
}
|
||||
|
||||
    /**
     * List file details
     * 1st parameter path      path on HDFS to list
     * 2nd parameter recursive whether to list recursively into sub-directories
     */
|
||||
@Test
|
||||
public void testListFiles() throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
|
||||
while(listFiles.hasNext()){
|
||||
LocatedFileStatus fileStatus = listFiles.next();
|
||||
System.out.println("文件名称:"+fileStatus.getPath().getName());
|
||||
System.out.println("块大小:"+fileStatus.getBlockSize());
|
||||
System.out.println("副本数:"+fileStatus.getReplication());
|
||||
System.out.println("权限信息:"+fileStatus.getPermission());
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Determine whether a path is a file or a directory
     * 1st parameter path the path to check
     */
|
||||
@Test
|
||||
public void testListStatus() throws IOException {
|
||||
|
||||
FileStatus[] listStatus = fs.listStatus(new Path("/"));
|
||||
for (FileStatus status : listStatus) {
|
||||
if(status.isDirectory()){
|
||||
System.out.println("DIR:"+status.getPath().getName());
|
||||
}else{
|
||||
System.out.println("FILE:"+status.getPath().getName());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
    /**
     * Obtain the FileSystem object
     * @throws IOException
     * @throws InterruptedException
     */
|
||||
@Before
|
||||
public void init() throws IOException, InterruptedException {
|
||||
        // HDFS URI: hdfs://Ding202:9820
        URI uri = URI.create("hdfs://Ding202:9820");
        // configuration object
        Configuration conf = new Configuration();
        conf.set("dfs.replication","6");
        // the user that operates on HDFS
        String user = "dingjiawen";
        // obtain the HDFS client (FileSystem) object
        fs = FileSystem.get(uri, conf, user);
|
||||
}
|
||||
|
||||
|
||||
    /**
     * Release the FileSystem resource
     * @throws IOException
     */
|
||||
@After
|
||||
public void close() throws IOException {
|
||||
fs.close();
|
||||
}
|
||||
|
||||
    /**
     * Obtain the HDFS client (FileSystem) object
     * uri  : HDFS URI, e.g. hdfs://Ding202:9820
     * conf : configuration object
     * user : the user that operates on HDFS
     */
|
||||
@Test
|
||||
public void testCreateHdfsClient() throws IOException, InterruptedException {
|
||||
        // HDFS URI: hdfs://Ding202:9820
        URI uri = URI.create("hdfs://Ding202:9820");
        // configuration object
        Configuration conf = new Configuration();
        // the user that operates on HDFS
        String user = "dingjiawen";
        // obtain the HDFS client (FileSystem) object
        FileSystem fileSystem = FileSystem.get(uri, conf, user);
        System.out.println(fileSystem.getClass().getName());
        // release the resource
        fileSystem.close();
|
||||
}
|
||||
|
||||
}
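The init() method sets dfs.replication to 6 to demonstrate configuration priority; a small hedged check (illustrative, not in the original code, assuming hello.txt has already been uploaded by testCopyFromLocal) that reads the replication factor back:

    @Test
    public void testReplicationPriority() throws IOException {
        Path uploaded = new Path("/client_test/hello.txt");
        // expected to print 6, because the in-code Configuration overrides hdfs-site.xml and hdfs-default.xml
        System.out.println("replication: " + fs.getFileStatus(uploaded).getReplication());
    }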
|
||||
File diff suppressed because it is too large
|
|
@ -0,0 +1,644 @@
|
|||
<?xml version="1.0"?>
|
||||
<fsimage>
|
||||
<version>
|
||||
<layoutVersion>-64</layoutVersion>
|
||||
<onDiskVersion>1</onDiskVersion>
|
||||
<oivRevision>ba631c436b806728f8ec2f54ab1e289526c90579</oivRevision>
|
||||
</version>
|
||||
<NameSection>
|
||||
<namespaceId>1750466601</namespaceId>
|
||||
<genstampV1>1000</genstampV1>
|
||||
<genstampV2>1030</genstampV2>
|
||||
<genstampV1Limit>0</genstampV1Limit>
|
||||
<lastAllocatedBlockId>1073741853</lastAllocatedBlockId>
|
||||
<txid>291</txid>
|
||||
</NameSection>
|
||||
<ErasureCodingSection>
|
||||
<erasureCodingPolicy>
|
||||
<policyId>1</policyId>
|
||||
<policyName>RS-6-3-1024k</policyName>
|
||||
<cellSize>1048576</cellSize>
|
||||
<policyState>DISABLED</policyState>
|
||||
<ecSchema>
|
||||
<codecName>rs</codecName>
|
||||
<dataUnits>6</dataUnits>
|
||||
<parityUnits>3</parityUnits>
|
||||
</ecSchema>
|
||||
</erasureCodingPolicy>
|
||||
|
||||
<erasureCodingPolicy>
|
||||
<policyId>2</policyId>
|
||||
<policyName>RS-3-2-1024k</policyName>
|
||||
<cellSize>1048576</cellSize>
|
||||
<policyState>DISABLED</policyState>
|
||||
<ecSchema>
|
||||
<codecName>rs</codecName>
|
||||
<dataUnits>3</dataUnits>
|
||||
<parityUnits>2</parityUnits>
|
||||
</ecSchema>
|
||||
</erasureCodingPolicy>
|
||||
|
||||
<erasureCodingPolicy>
|
||||
<policyId>3</policyId>
|
||||
<policyName>RS-LEGACY-6-3-1024k</policyName>
|
||||
<cellSize>1048576</cellSize>
|
||||
<policyState>DISABLED</policyState>
|
||||
<ecSchema>
|
||||
<codecName>rs-legacy</codecName>
|
||||
<dataUnits>6</dataUnits>
|
||||
<parityUnits>3</parityUnits>
|
||||
</ecSchema>
|
||||
</erasureCodingPolicy>
|
||||
|
||||
<erasureCodingPolicy>
|
||||
<policyId>4</policyId>
|
||||
<policyName>XOR-2-1-1024k</policyName>
|
||||
<cellSize>1048576</cellSize>
|
||||
<policyState>DISABLED</policyState>
|
||||
<ecSchema>
|
||||
<codecName>xor</codecName>
|
||||
<dataUnits>2</dataUnits>
|
||||
<parityUnits>1</parityUnits>
|
||||
</ecSchema>
|
||||
</erasureCodingPolicy>
|
||||
|
||||
<erasureCodingPolicy>
|
||||
<policyId>5</policyId>
|
||||
<policyName>RS-10-4-1024k</policyName>
|
||||
<cellSize>1048576</cellSize>
|
||||
<policyState>DISABLED</policyState>
|
||||
<ecSchema>
|
||||
<codecName>rs</codecName>
|
||||
<dataUnits>10</dataUnits>
|
||||
<parityUnits>4</parityUnits>
|
||||
</ecSchema>
|
||||
</erasureCodingPolicy>
|
||||
|
||||
</ErasureCodingSection>
|
||||
|
||||
<INodeSection>
|
||||
<lastInodeId>16455</lastInodeId>
|
||||
<numInodes>37</numInodes>
|
||||
<inode>
|
||||
<id>16385</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name></name>
|
||||
<mtime>1634992639432</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>9223372036854775807</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16386</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>wcinput</name>
|
||||
<mtime>1634898562368</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16387</id>
|
||||
<type>FILE</type>
|
||||
<name>hello.txt</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634898562869</mtime>
|
||||
<atime>1634902318394</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741825</id>
|
||||
<genstamp>1001</genstamp>
|
||||
<numBytes>92</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16388</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>tmp</name>
|
||||
<mtime>1634902310022</mtime>
|
||||
<permission>dingjiawen:supergroup:0700</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16389</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>hadoop-yarn</name>
|
||||
<mtime>1634898718216</mtime>
|
||||
<permission>dingjiawen:supergroup:0700</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16390</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>staging</name>
|
||||
<mtime>1634898723502</mtime>
|
||||
<permission>dingjiawen:supergroup:0700</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16391</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>dingjiawen</name>
|
||||
<mtime>1634898718216</mtime>
|
||||
<permission>dingjiawen:supergroup:0700</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16392</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>.staging</name>
|
||||
<mtime>1634902324657</mtime>
|
||||
<permission>dingjiawen:supergroup:0700</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16398</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>history</name>
|
||||
<mtime>1634899864728</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16399</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>done_intermediate</name>
|
||||
<mtime>1634898723520</mtime>
|
||||
<permission>dingjiawen:supergroup:1777</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16400</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>dingjiawen</name>
|
||||
<mtime>1634902422435</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16401</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>wcoutput</name>
|
||||
<mtime>1634898742662</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16408</id>
|
||||
<type>FILE</type>
|
||||
<name>part-r-00000</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634898742564</mtime>
|
||||
<atime>1634898742434</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741832</id>
|
||||
<genstamp>1008</genstamp>
|
||||
<numBytes>78</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16410</id>
|
||||
<type>FILE</type>
|
||||
<name>_SUCCESS</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634898742664</mtime>
|
||||
<atime>1634898742662</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16413</id>
|
||||
<type>FILE</type>
|
||||
<name>
|
||||
job_1634897835344_0001-1634898720164-dingjiawen-word+count-1634898743224-1-1-SUCCEEDED-default-1634898727228.jhist
|
||||
</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634898742783</mtime>
|
||||
<atime>1634898742760</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741834</id>
|
||||
<genstamp>1010</genstamp>
|
||||
<numBytes>22368</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16414</id>
|
||||
<type>FILE</type>
|
||||
<name>job_1634897835344_0001_conf.xml</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634898742822</mtime>
|
||||
<atime>1634898742799</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741835</id>
|
||||
<genstamp>1011</genstamp>
|
||||
<numBytes>214785</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16415</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>done</name>
|
||||
<mtime>1634899899078</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16416</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>2021</name>
|
||||
<mtime>1634899899078</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16417</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>10</name>
|
||||
<mtime>1634899899078</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16418</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>22</name>
|
||||
<mtime>1634899899078</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16419</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>000000</name>
|
||||
<mtime>1634902422435</mtime>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16425</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>logs</name>
|
||||
<mtime>1634902310063</mtime>
|
||||
<permission>dingjiawen:dingjiawen:1777</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16426</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>dingjiawen</name>
|
||||
<mtime>1634902310069</mtime>
|
||||
<permission>dingjiawen:dingjiawen:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16427</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>logs-tfile</name>
|
||||
<mtime>1634902310074</mtime>
|
||||
<permission>dingjiawen:dingjiawen:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16428</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>application_1634902054411_0001</name>
|
||||
<mtime>1634902331189</mtime>
|
||||
<permission>dingjiawen:dingjiawen:0770</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16430</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>wcoutput2</name>
|
||||
<mtime>1634902323450</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16436</id>
|
||||
<type>FILE</type>
|
||||
<name>part-r-00000</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634902323357</mtime>
|
||||
<atime>1634902323238</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741842</id>
|
||||
<genstamp>1018</genstamp>
|
||||
<numBytes>78</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16438</id>
|
||||
<type>FILE</type>
|
||||
<name>_SUCCESS</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634902323452</mtime>
|
||||
<atime>1634902323450</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16441</id>
|
||||
<type>FILE</type>
|
||||
<name>
|
||||
job_1634902054411_0001-1634902309668-dingjiawen-word+count-1634902324404-1-1-SUCCEEDED-default-1634902314351.jhist
|
||||
</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634902323558</mtime>
|
||||
<atime>1634902323524</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741844</id>
|
||||
<genstamp>1020</genstamp>
|
||||
<numBytes>22336</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16442</id>
|
||||
<type>FILE</type>
|
||||
<name>job_1634902054411_0001_conf.xml</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634902323591</mtime>
|
||||
<atime>1634902323566</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0770</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741845</id>
|
||||
<genstamp>1021</genstamp>
|
||||
<numBytes>214958</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16443</id>
|
||||
<type>FILE</type>
|
||||
<name>Ding203_43285</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634902331185</mtime>
|
||||
<atime>1634906546779</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:dingjiawen:0640</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741846</id>
|
||||
<genstamp>1022</genstamp>
|
||||
<numBytes>133945</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16444</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>sanguo</name>
|
||||
<mtime>1634994579596</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16445</id>
|
||||
<type>FILE</type>
|
||||
<name>shuguo.txt</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634910351043</mtime>
|
||||
<atime>1634988132639</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741847</id>
|
||||
<genstamp>1023</genstamp>
|
||||
<numBytes>19</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16447</id>
|
||||
<type>FILE</type>
|
||||
<name>sunshangxaing.txt</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634910725360</mtime>
|
||||
<atime>1634988132668</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741849</id>
|
||||
<genstamp>1026</genstamp>
|
||||
<numBytes>16</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16448</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>xiyou</name>
|
||||
<mtime>1634988717255</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16451</id>
|
||||
<type>FILE</type>
|
||||
<name>zhangfei.txt</name>
|
||||
<replication>3</replication>
|
||||
<mtime>1634988432270</mtime>
|
||||
<atime>1634988431098</atime>
|
||||
<preferredBlockSize>134217728</preferredBlockSize>
|
||||
<permission>dingjiawen:supergroup:0644</permission>
|
||||
<blocks>
|
||||
<block>
|
||||
<id>1073741850</id>
|
||||
<genstamp>1027</genstamp>
|
||||
<numBytes>16</numBytes>
|
||||
</block>
|
||||
</blocks>
|
||||
<storagePolicyId>0</storagePolicyId>
|
||||
</inode>
|
||||
<inode>
|
||||
<id>16452</id>
|
||||
<type>DIRECTORY</type>
|
||||
<name>client_test</name>
|
||||
<mtime>1634994678194</mtime>
|
||||
<permission>dingjiawen:supergroup:0755</permission>
|
||||
<nsquota>-1</nsquota>
|
||||
<dsquota>-1</dsquota>
|
||||
</inode>
|
||||
</INodeSection>
|
||||
<INodeReferenceSection></INodeReferenceSection>
|
||||
<SnapshotSection>
|
||||
<snapshotCounter>0</snapshotCounter>
|
||||
<numSnapshots>0</numSnapshots>
|
||||
</SnapshotSection>
|
||||
<INodeDirectorySection>
|
||||
<directory>
|
||||
<parent>16385</parent>
|
||||
<child>16452</child>
|
||||
<child>16444</child>
|
||||
<child>16388</child>
|
||||
<child>16386</child>
|
||||
<child>16401</child>
|
||||
<child>16430</child>
|
||||
<child>16448</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16386</parent>
|
||||
<child>16387</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16388</parent>
|
||||
<child>16389</child>
|
||||
<child>16425</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16389</parent>
|
||||
<child>16390</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16390</parent>
|
||||
<child>16391</child>
|
||||
<child>16398</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16391</parent>
|
||||
<child>16392</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16398</parent>
|
||||
<child>16415</child>
|
||||
<child>16399</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16399</parent>
|
||||
<child>16400</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16401</parent>
|
||||
<child>16410</child>
|
||||
<child>16408</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16415</parent>
|
||||
<child>16416</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16416</parent>
|
||||
<child>16417</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16417</parent>
|
||||
<child>16418</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16418</parent>
|
||||
<child>16419</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16419</parent>
|
||||
<child>16413</child>
|
||||
<child>16414</child>
|
||||
<child>16441</child>
|
||||
<child>16442</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16425</parent>
|
||||
<child>16426</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16426</parent>
|
||||
<child>16427</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16427</parent>
|
||||
<child>16428</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16428</parent>
|
||||
<child>16443</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16430</parent>
|
||||
<child>16438</child>
|
||||
<child>16436</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16444</parent>
|
||||
<child>16445</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16448</parent>
|
||||
<child>16451</child>
|
||||
</directory>
|
||||
<directory>
|
||||
<parent>16452</parent>
|
||||
<child>16447</child>
|
||||
</directory>
|
||||
</INodeDirectorySection>
|
||||
<FileUnderConstructionSection></FileUnderConstructionSection>
|
||||
<SecretManagerSection>
|
||||
<currentId>0</currentId>
|
||||
<tokenSequenceNumber>0</tokenSequenceNumber>
|
||||
<numDelegationKeys>0</numDelegationKeys>
|
||||
<numTokens>0</numTokens>
|
||||
</SecretManagerSection>
|
||||
<CacheManagerSection>
|
||||
<nextDirectiveId>1</nextDirectiveId>
|
||||
<numDirectives>0</numDirectives>
|
||||
<numPools>0</numPools>
|
||||
</CacheManagerSection>
|
||||
</fsimage>
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||
|
||||
<configuration>
|
||||
<property>
|
||||
<name>dfs.replication</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
||||
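This hdfs-site.xml only overrides dfs.replication to 1. A minimal sketch of how a Java HDFS client picks the setting up (the NameNode address below is a placeholder, not taken from this commit; the user name and path mirror the fsimage dump above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class HdfsClientSketch {
    public static void main(String[] args) throws Exception {
        // An hdfs-site.xml on the classpath is loaded automatically;
        // setting the value here has the same effect, but for this client only
        Configuration conf = new Configuration();
        conf.set("dfs.replication", "1");

        // NameNode URI is a placeholder for illustration
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop102:8020"), conf, "dingjiawen");
        fs.mkdirs(new Path("/client_test"));
        fs.close();
    }
}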
|
|
@ -0,0 +1,24 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration status="error" strict="true" name="XMLConfig">
|
||||
<Appenders>
|
||||
<!-- Type is Console; the name attribute is required -->
|
||||
<Appender type="Console" name="STDOUT">
|
||||
<!-- The layout is PatternLayout;
|
||||
the output looks like [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
|
||||
<Layout type="PatternLayout"
|
||||
pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
|
||||
</Appender>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<!-- additivity is false: events logged here are not also passed to the root logger -->
|
||||
<Logger name="test" level="info" additivity="false">
|
||||
<AppenderRef ref="STDOUT" />
|
||||
</Logger>
|
||||
|
||||
<!-- root LoggerConfig settings -->
|
||||
<Root level="info">
|
||||
<AppenderRef ref="STDOUT" />
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
||||
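For reference, this is how the configuration above plays out from code: a logger obtained under the name "test" matches the <Logger name="test"> entry and, because additivity is false, writes exactly once to the STDOUT appender with the pattern defined above. The class below is an illustrative sketch, not part of this commit.

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

public class Log4j2Demo {
    // Matches the <Logger name="test"> entry in the configuration above
    private static final Logger LOG = LogManager.getLogger("test");

    public static void main(String[] args) {
        // Emitted once to the console, formatted by the PatternLayout,
        // e.g. [INFO] [2021-10-22 17:34:01][test]hello log4j2
        LOG.info("hello log4j2");
    }
}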
|
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<!--Coordinates of the current Maven module-->
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>Hello</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
|
||||
<!--Add dependencies-->
|
||||
<dependencies>
|
||||
|
||||
<!--JUnit dependency coordinates-->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
package com.atguigu.maven;
|
||||
|
||||
public class Hello {
|
||||
public static void main(String[] args) {
|
||||
String stu = "SHI";
|
||||
System.out.println(stu);
|
||||
}
|
||||
|
||||
public String sayHello(String name){
|
||||
return "Hello "+ name +"!";
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
package com.atguigu.maven;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class Hello {
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
package com.atguigu.maven;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class HelloTest {
|
||||
@Test
|
||||
public void testHello(){
|
||||
Hello hello=new Hello();
|
||||
String maven =hello.sayHello("maven");
|
||||
System.out.println(maven);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<!--Inherit from the parent POM-->
|
||||
<parent>
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>parent</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<!--Relative path to the parent POM-->
|
||||
<relativePath>../parent/pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>HelloFriend</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<!--Custom property for managing version numbers in one place; wherever the same version is needed, reference it as ${spring_version} (here it holds the JUnit version)-->
|
||||
<properties>
|
||||
<spring_version>4.12</spring_version>
|
||||
|
||||
</properties>
|
||||
|
||||
<!--Dependencies of the current project-->
|
||||
<dependencies>
|
||||
<!--Dependency on the Hello module-->
|
||||
<dependency>
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>Hello</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>${spring_version}</version>
|
||||
<!--The scope tag controls where the dependency is visible (main vs. test); it defaults to compile when omitted-->
|
||||
<scope>compile</scope>
|
||||
|
||||
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
package com.atguigu.maven;
|
||||
|
||||
public class HelloFriend {
|
||||
|
||||
public String sayHelloToFriend(String name){
|
||||
Hello hello = new Hello();
|
||||
String str = hello.sayHello(name)+" I am "+this.getMyName();
|
||||
return str;
|
||||
}
|
||||
|
||||
public String getMyName(){
|
||||
return "Idea";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
package com.atguigu.maven;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class HelloFriendTest {
|
||||
|
||||
@Test
|
||||
public void testHelloFriend(){
|
||||
HelloFriend helloFriend = new HelloFriend();
|
||||
String results = helloFriend.sayHelloToFriend("Maven");
|
||||
System.out.println(results);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<!--Inherit from the parent POM-->
|
||||
<parent>
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>parent</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<!--Relative path to the parent POM-->
|
||||
<relativePath>../parent/pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
<groupId>com.atguigu.maven</groupId>
|
||||
<artifactId>Hello_new</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<!--Add dependencies-->
|
||||
<dependencies>
|
||||
|
||||
<!--JUnit dependency coordinates-->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>RELEASE</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
import java.util.Scanner;
|
||||
|
||||
public class Hello {
|
||||
public String sayHello(String name){
|
||||
|
||||
|
||||
return "Hello "+ name +"!";
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
import org.junit.Test;
|
||||
|
||||
import java.util.Scanner;
|
||||
|
||||
public class TestHello {
|
||||
@Test
|
||||
public void testHello(){
|
||||
Hello hello=new Hello();
|
||||
String maven =hello.sayHello("maven");
|
||||
System.out.println(maven);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>com.atguigu.hive</groupId>
|
||||
<artifactId>Hive</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hive</groupId>
|
||||
<artifactId>hive-exec</artifactId>
|
||||
<version>3.1.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package com.atguigu.hive.udf;
|
||||
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
|
||||
|
||||
/**
|
||||
* Plugin-style development:
|
||||
* 1. Implement the interface or extend the base class
|
||||
* 2. Override the corresponding methods
|
||||
* 3. Package the result into a JAR
|
||||
*
|
||||
*
|
||||
* Custom UDF class
|
||||
* Extends the GenericUDF class provided by Hive
|
||||
*/
|
||||
public class CalStringLengthUDF extends GenericUDF {
|
||||
|
||||
/**
|
||||
* Initialization method
|
||||
* @param objectInspectors object inspectors describing the types of the arguments passed to the function
|
||||
* @return object inspector for the function's return type
|
||||
* @throws UDFArgumentException
|
||||
*/
|
||||
@Override
|
||||
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
|
||||
//1. Validate the number of arguments
|
||||
if(objectInspectors==null||objectInspectors.length!=1){
|
||||
throw new UDFArgumentLengthException("Incorrect number of arguments");
|
||||
}
|
||||
//2. Validate the argument type; getCategory() returns the category of the argument, and PRIMITIVE means a primitive type
|
||||
if(!objectInspectors[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){
|
||||
throw new UDFArgumentTypeException(0,"Incorrect argument type");
|
||||
}
|
||||
//3. Return the object inspector corresponding to the function's return type
|
||||
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
|
||||
}
|
||||
|
||||
/**
|
||||
* Core processing method of the function
|
||||
* @param deferredObjects arguments passed to the function
|
||||
* @return the function's return value
|
||||
* @throws HiveException
|
||||
*/
|
||||
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
|
||||
|
||||
//1. Get the argument
|
||||
Object argument = deferredObjects[0].get();
|
||||
if(argument==null){
|
||||
return 0;
|
||||
}
|
||||
return argument.toString().length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Controls what is displayed for this function later, e.g. in EXPLAIN output
|
||||
* @param strings
|
||||
* @return
|
||||
*/
|
||||
public String getDisplayString(String[] strings) {
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
||||
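The GenericUDF lifecycle described above (initialize is called once to check argument types, evaluate is then called per row) can be exercised locally before packaging the JAR. The driver below is a hypothetical sketch, not part of this commit; it assumes it sits in the same package as CalStringLengthUDF, and the input value "hello" is just an example. On the Hive side the packaged JAR would typically be registered with ADD JAR and CREATE TEMPORARY FUNCTION instead.

package com.atguigu.hive.udf;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class CalStringLengthUDFDriver {
    public static void main(String[] args) throws Exception {
        CalStringLengthUDF udf = new CalStringLengthUDF();

        // initialize: one string argument; the UDF answers with an int inspector
        udf.initialize(new ObjectInspector[]{
                PrimitiveObjectInspectorFactory.javaStringObjectInspector});

        // evaluate: the row value is wrapped in a DeferredJavaObject
        Object result = udf.evaluate(new GenericUDF.DeferredObject[]{
                new GenericUDF.DeferredJavaObject("hello")});

        System.out.println(result); // expected: 5
    }
}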
|
|
@ -0,0 +1,97 @@
|
|||
package com.atguigu.hive.udtf;
|
||||
|
||||
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 自定义UDTF
|
||||
* 继承Hive提供的GenericUDTF类
|
||||
*
|
||||
* select myudtf("hello-5,world-6,hadoop-7,hive-8",",","-");
|
||||
* Result:
|
||||
* hello 5
|
||||
* world 6
|
||||
* hadoop 7
|
||||
* hive 8
|
||||
*/
|
||||
public class SplitStringToColRowsUDTF extends GenericUDTF {
|
||||
|
||||
private List<String> outs = new ArrayList<String>();
|
||||
/**
|
||||
* Initialization method
|
||||
* @param argOIs
|
||||
* @return
|
||||
* @throws UDFArgumentException
|
||||
*/
|
||||
@Override
|
||||
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
|
||||
//1. Check the number of arguments
|
||||
List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
|
||||
if(allStructFieldRefs.size()!=3){
|
||||
throw new UDFArgumentLengthException("Incorrect number of arguments");
|
||||
}
|
||||
//2. Check the argument types
|
||||
for (int i = 0; i < allStructFieldRefs.size(); i++) {
|
||||
StructField structField = allStructFieldRefs.get(i);
|
||||
if (!structField.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
|
||||
throw new UDFArgumentTypeException(i,"Incorrect argument type");
|
||||
}
|
||||
}
|
||||
//3. Build the returned struct inspector
|
||||
//Column names
|
||||
List<String> structFieldNames = new ArrayList<String>();
|
||||
structFieldNames.add("word");
|
||||
structFieldNames.add("num");
|
||||
//Column types
|
||||
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
|
||||
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
|
||||
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
|
||||
//When unsure how to write this, look at how the Hive source does it
|
||||
//Needs the column names and the column types
|
||||
return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,structFieldObjectInspectors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Core processing method of the function
|
||||
* @param args arguments passed to the function
|
||||
* @throws HiveException
|
||||
*/
|
||||
public void process(Object[] args) throws HiveException {
|
||||
|
||||
//1. Get the first argument
|
||||
String words = args[0].toString(); //"hello-5,world-6,hadoop-7,hive-8"
|
||||
//2. Get the second argument
|
||||
String rowSplit = args[1].toString(); //","
|
||||
//3. Get the third argument
|
||||
String colSplit = args[2].toString(); //"-"
|
||||
|
||||
//4. Split the input
|
||||
String[] rows = words.split(rowSplit); //[hello-5,world-6,hadoop-7,hive-8]
|
||||
for (String row : rows) {
|
||||
String[] cols = row.split(colSplit); //hello 5
|
||||
outs.clear();
|
||||
for (String col : cols) {
|
||||
outs.add(col);
|
||||
}
|
||||
forward(outs);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Release resources and do any final cleanup
|
||||
public void close() throws HiveException {
|
||||
|
||||
}
|
||||
}
|
||||
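The same kind of local harness works for the UDTF, provided a Collector is wired in before process is called, since forward hands each output row to that collector. The driver below is a hypothetical sketch, not part of this commit; it assumes the same package as SplitStringToColRowsUDTF, and the field names "str", "rowSplit" and "colSplit" are arbitrary labels for the three arguments.

package com.atguigu.hive.udtf;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.Arrays;

public class SplitStringToColRowsUDTFDriver {
    public static void main(String[] args) throws Exception {
        SplitStringToColRowsUDTF udtf = new SplitStringToColRowsUDTF();

        // Describe the three string arguments as a struct, mirroring what Hive passes to initialize
        ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        udtf.initialize(ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("str", "rowSplit", "colSplit"),
                Arrays.asList(stringOI, stringOI, stringOI)));

        // Every forward() call ends up here; each row is the List<String> built in process
        udtf.setCollector(row -> System.out.println(row));

        // Prints [hello, 5], [world, 6], [hadoop, 7], [hive, 8]
        udtf.process(new Object[]{"hello-5,world-6,hadoop-7,hive-8", ",", "-"});
        udtf.close();
    }
}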
|
|
@ -0,0 +1,89 @@
|
|||
package com.atguigu.hive.udtf;
|
||||
|
||||
|
||||
import org.apache.avro.generic.GenericArray;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
|
||||
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
|
||||
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Custom UDTF
|
||||
* Extends the GenericUDTF class provided by Hive
|
||||
*
|
||||
* select myudtf("hello,world,hadoop,hive",",");
|
||||
*/
|
||||
public class SplitStringToRowsUDTF extends GenericUDTF {
|
||||
|
||||
private List<String> outs = new ArrayList<String>();
|
||||
/**
|
||||
* Initialization method
|
||||
* @param argOIs
|
||||
* @return
|
||||
* @throws UDFArgumentException
|
||||
*/
|
||||
@Override
|
||||
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
|
||||
//1. Check the number of arguments
|
||||
List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
|
||||
if(allStructFieldRefs.size()!=2){
|
||||
throw new UDFArgumentLengthException("Incorrect number of arguments");
|
||||
}
|
||||
//2. Check the argument types
|
||||
for (int i = 0; i < allStructFieldRefs.size(); i++) {
|
||||
StructField structField = allStructFieldRefs.get(i);
|
||||
if (!structField.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
|
||||
throw new UDFArgumentTypeException(i,"Incorrect argument type");
|
||||
}
|
||||
}
|
||||
//3. Build the returned struct inspector
|
||||
//Column names
|
||||
List<String> structFieldNames = new ArrayList<String>();
|
||||
structFieldNames.add("word");
|
||||
//Column types
|
||||
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
|
||||
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
|
||||
//When unsure how to write this, look at how the Hive source does it
|
||||
//Needs the column names and the column types
|
||||
return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,structFieldObjectInspectors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Core processing method of the function
|
||||
* @param args arguments passed to the function
|
||||
* @throws HiveException
|
||||
*/
|
||||
public void process(Object[] args) throws HiveException {
|
||||
|
||||
//1. Get the first argument
|
||||
String words = args[0].toString(); //"hello,world,hadoop,hive"
|
||||
//2. Get the second argument
|
||||
String split = args[1].toString(); //","
|
||||
//3. Split the input
|
||||
String[] splitWord = words.split(split); //[hello,world,hadoop,hive]
|
||||
//4. Emit each word as one output row
|
||||
for (String word : splitWord) {
|
||||
//When unsure, look at how process is implemented in the Hive source; it calls forward repeatedly
|
||||
//How the rows eventually get printed to the console is handled downstream and is not a concern here
|
||||
//forward takes the collection holding the output columns
|
||||
outs.clear();
|
||||
outs.add(word);
|
||||
forward(outs);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//Release resources and do any final cleanup
|
||||
public void close() throws HiveException {
|
||||
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,7 @@
|
|||
url=jdbc:mysql://localhost:3306/db2
|
||||
username=root
|
||||
password=root
|
||||
driverClassName=com.mysql.jdbc.Driver
|
||||
initialSize=10
|
||||
maxActive=20
|
||||
maxWait=1000
|
||||
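The keys above (url, username, password, driverClassName, initialSize, maxActive, maxWait) are the ones Druid's DruidDataSourceFactory reads, so the connection pool can be built straight from this file. The snippet below is an illustrative sketch, not part of this commit; the classpath file name druid.properties is an assumption.

import com.alibaba.druid.pool.DruidDataSourceFactory;

import javax.sql.DataSource;
import java.io.InputStream;
import java.sql.Connection;
import java.util.Properties;

public class DruidSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Assumes the properties shown above are on the classpath as druid.properties
        try (InputStream in = DruidSketch.class.getClassLoader()
                .getResourceAsStream("druid.properties")) {
            props.load(in);
        }

        DataSource dataSource = DruidDataSourceFactory.createDataSource(props);
        try (Connection connection = dataSource.getConnection()) {
            System.out.println(connection); // pooled connection to jdbc:mysql://localhost:3306/db2
        }
    }
}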
Binary file not shown.
Some files were not shown because too many files have changed in this diff.