big_data_example

This commit is contained in:
markilue 2022-05-15 19:40:58 +08:00
parent 03d2c0b6cc
commit c91f81e123
819 changed files with 525006 additions and 0 deletions

View File

@ -0,0 +1,58 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.gmail</groupId>
<artifactId>Collect</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.9.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,77 @@
package com.atguigu.gmail.interceptor;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;
/**
* First collection layer: ETL filtering.
* Drops events whose data (body) is incomplete JSON.
*/
public class EtlLogInterceptor implements Interceptor {
public void initialize() {
}
public Event intercept(Event event) {
//1. Take out the body
String body = new String(event.getBody(), StandardCharsets.UTF_8);
//2. Use Alibaba's fastjson to check whether the data is complete
try {
//If the body parses cleanly, pass the event on; if a JSONException is caught the record is incomplete, so return null
JSON.parseObject(body);
} catch (JSONException e) {
return null;
}
return event;
}
public List<Event> intercept(List<Event> events) {
//For each event, check whether intercept(event) returned null and, if so, remove that event
//The logic below would be fine, but elements cannot be removed from the collection while iterating it with a for-each loop
// for (Event event : events) {
// Event intercept = intercept(event);
// if(intercept==null){
// events.remove(event);
// }
// }
// return events;
//So obtain an Iterator instead and remove events through iterator.remove()
Iterator<Event> iterator = events.iterator();
while(iterator.hasNext()){
Event event = iterator.next();
Event result = intercept(event);
if(result==null){
iterator.remove();
}
}
return events;
}
public void close() {
}
public static class MyBuilder implements Builder{
@Override
public Interceptor build() {
return new EtlLogInterceptor();
}
@Override
public void configure(Context context) {
}
}
}

View File

@ -0,0 +1,63 @@
package com.atguigu.gmail.interceptor;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
* Adds a header to every event.
* The header carries the timestamp at which the record was generated;
* that timestamp is already present in the JSON body of the event.
*/
public class TimeStampInterceptor implements Interceptor {
@Override
public void initialize() {
}
@Override
public Event intercept(Event event) {
//1. Take out the body: a plain string in JSON format
String body = new String(event.getBody(), StandardCharsets.UTF_8);
//2. Parse the JSON string into an object
JSONObject jsonObject = JSON.parseObject(body);
//3. Read the timestamp field ts from the object
String ts = jsonObject.getString("ts");
//4. Put the value of ts into the event header
event.getHeaders().put("timestamp",ts);
return event;
}
@Override
public List<Event> intercept(List<Event> events) {
//Call intercept on every event in the batch
for (Event event : events) {
intercept(event);
}
return events;
}
@Override
public void close() {
}
public static class MyBuilder implements Builder{
public TimeStampInterceptor build() {
return new TimeStampInterceptor();
}
@Override
public void configure(Context context) {
}
}
}
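Neither interceptor is exercised outside a Flume agent in this commit, so here is a small, hypothetical driver (the class name InterceptorChainDemo and the sample bodies are additions, not part of the commit) that chains the two interceptors in the order a typical agent would: the ETL interceptor drops malformed JSON first, then the timestamp interceptor copies the ts field into the "timestamp" header.
package com.atguigu.gmail.interceptor;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
public class InterceptorChainDemo {
    public static void main(String[] args) {
        // Build the interceptors the same way Flume would: through their Builder classes
        Interceptor etl = new EtlLogInterceptor.MyBuilder().build();
        Interceptor stamp = new TimeStampInterceptor.MyBuilder().build();
        List<Event> events = new ArrayList<>();
        events.add(EventBuilder.withBody("{\"mid\":\"m1\",\"ts\":1652600000000}", StandardCharsets.UTF_8)); // complete record
        events.add(EventBuilder.withBody("{\"mid\":\"m2\",\"ts\":", StandardCharsets.UTF_8));               // truncated record
        List<Event> cleaned = etl.intercept(events);    // the malformed event is removed here
        List<Event> stamped = stamp.intercept(cleaned); // the "timestamp" header is filled from the ts field
        for (Event e : stamped) {
            System.out.println(e.getHeaders().get("timestamp")); // prints 1652600000000
        }
    }
}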

View File

@ -0,0 +1,172 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.flink</groupId>
<artifactId>Flink</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<flink.version>1.13.0</flink.version>
<java.version>1.8</java.version>
<scala.binary.version>2.12</scala.binary.version>
<slf4j.version>1.7.30</slf4j.version>
</properties>
<dependencies>
<!-- Flink Java API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Streaming API (artifact carries the Scala binary version suffix) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-cep_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.21</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
<version>2.14.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,120 @@
package day01.java;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* Reads lines from a socket and processes them:
* word count
*/
public class Example1 {
//Remember to declare the thrown exception
public static void main(String[] args) throws Exception {
//TODO set up the environment
//Obtain the stream processing execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//Set the number of parallel tasks to 1
env.setParallelism(1);
//TODO run the program
//Read the data source
//first start `nc -lp 9999` in a terminal
DataStreamSource<String> stream = env.socketTextStream("localhost",9999);
//1. map step
//flatMap is used here
//map: emits exactly one element for every element in the stream
//flatMap: emits 0, 1 or more elements for every element in the stream
SingleOutputStreamOperator<WordWithCount> mappedStream = stream
//input type: String; output type: WordWithCount
.flatMap(new FlatMapFunction<String, WordWithCount>() {
@Override
public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
String[] arr = value.split(" ");
//use collect to emit data downstream
for (String e : arr) {
out.collect(new WordWithCount(e, 1L));
}
}
});
//2. Group by key (shuffles the data)
KeyedStream<WordWithCount, String> keyedStream = mappedStream
//first type parameter: the type of the stream elements
//second type parameter: the type of the key
.keyBy(new KeySelector<WordWithCount, String>() {
@Override
public String getKey(WordWithCount value) throws Exception {
return value.word;
}
});
//3. reduce step
//reduce maintains an accumulator
//the first record for a key becomes the accumulator and is emitted
//each following record is combined with the accumulator, which is then emitted again
//the accumulator has the same type as the stream elements
SingleOutputStreamOperator<WordWithCount> result = keyedStream
.reduce(new ReduceFunction<WordWithCount>() {
//
@Override
public WordWithCount reduce(WordWithCount value1, WordWithCount value2) throws Exception {
return new WordWithCount(value1.word, value1.count + value2.count);
}
});
//print the result
result.print();
/*
WordWithCount{word='hello', count=1}
WordWithCount{word='world', count=1}
WordWithCount{word='hello', count=2}
WordWithCount{word='world', count=2}
*/
//run the job
env.execute();
}
//POJO class
//1. must be a public class
//2. all fields must be public
//3. must have a no-argument constructor
//mimics a Scala case class
public static class WordWithCount{
public String word;
public Long count;
public WordWithCount(){
}
public WordWithCount(String word ,Long count){
this.word =word;
this.count = count;
}
@Override
public String toString() {
return "WordWithCount{" +
"word='" + word + '\'' +
", count=" + count +
'}';
}
}
}

View File

@ -0,0 +1,122 @@
package day01.java;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* Reads from offline (bounded) data and processes it:
* word count
*/
public class Example2 {
//Remember to declare the thrown exception
public static void main(String[] args) throws Exception {
//TODO set up the environment
//Obtain the stream processing execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//Set the number of parallel tasks to 1
//(one task slot is needed)
env.setParallelism(1);
//TODO run the program
//Read the data source
//the socket source would need `nc -lp 9999` running in a terminal; a bounded collection is used here instead
//DataStreamSource<String> stream = env.socketTextStream("localhost",9999);
DataStreamSource<String> stream = env.fromElements("hello world", "hello world");
//1. map step
//flatMap is used here
//map: emits exactly one element for every element in the stream
//flatMap: emits 0, 1 or more elements for every element in the stream
SingleOutputStreamOperator<WordWithCount> mappedStream = stream
//input type: String; output type: WordWithCount
.flatMap(new FlatMapFunction<String, WordWithCount>() {
@Override
public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
String[] arr = value.split(" ");
//use collect to emit data downstream
for (String e : arr) {
out.collect(new WordWithCount(e, 1L));
}
}
});
//2. Group by key (shuffles the data)
KeyedStream<WordWithCount, String> keyedStream = mappedStream
//first type parameter: the type of the stream elements
//second type parameter: the type of the key
.keyBy(new KeySelector<WordWithCount, String>() {
@Override
public String getKey(WordWithCount value) throws Exception {
return value.word;
}
});
//3. reduce step
//reduce maintains an accumulator
//the first record for a key becomes the accumulator and is emitted
//each following record is combined with the accumulator, which is then emitted again
//the accumulator has the same type as the stream elements
SingleOutputStreamOperator<WordWithCount> result = keyedStream
.reduce(new ReduceFunction<WordWithCount>() {
//
@Override
public WordWithCount reduce(WordWithCount value1, WordWithCount value2) throws Exception {
return new WordWithCount(value1.word, value1.count + value2.count);
}
});
//print the result
result.print();
/*
WordWithCount{word='hello', count=1}
WordWithCount{word='world', count=1}
WordWithCount{word='hello', count=2}
WordWithCount{word='world', count=2}
*/
//run the job
env.execute();
}
//POJO class
//1. must be a public class
//2. all fields must be public
//3. must have a no-argument constructor
//mimics a Scala case class
public static class WordWithCount{
public String word;
public Long count;
public WordWithCount(){
}
public WordWithCount(String word ,Long count){
this.word =word;
this.count = count;
}
@Override
public String toString() {
return "WordWithCount{" +
"word='" + word + '\'' +
", count=" + count +
'}';
}
}
}

View File

@ -0,0 +1,97 @@
package day01.java;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* Setting parallelism
* Parallelism set on an individual operator takes precedence over the global parallelism
* This program needs two task slots
*
* Places where parallelism can be set:
* 1. global parallelism (on the environment)
* 2. per-operator parallelism
* 3. default parallelism in the configuration file
* 4. on the command line when submitting (flink run -p 2)
* Priority: 2 > 1 > 4 > 3
*/
public class Example3 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//global parallelism set to 1
env.setParallelism(1);
//source parallelism set to 1
DataStreamSource<String> stream = env.fromElements("hello world", "hello world").setParallelism(1);
//1. map step
//flatMap is used here
//map: emits exactly one element for every element in the stream
//flatMap: emits 0, 1 or more elements for every element in the stream
//operator parallelism set to 2
SingleOutputStreamOperator<Example2.WordWithCount> mappedStream = stream
//input type: String; output type: WordWithCount
.flatMap(new FlatMapFunction<String, Example2.WordWithCount>() {
@Override
public void flatMap(String value, Collector<Example2.WordWithCount> out) throws Exception {
String[] arr = value.split(" ");
//use collect to emit data downstream
for (String e : arr) {
out.collect(new Example2.WordWithCount(e, 1L));
}
}
}).setParallelism(2);
//2. Group by key (shuffles the data)
KeyedStream<Example2.WordWithCount, String> keyedStream = mappedStream
//first type parameter: the type of the stream elements
//second type parameter: the type of the key
.keyBy(new KeySelector<Example2.WordWithCount, String>() {
@Override
public String getKey(Example2.WordWithCount value) throws Exception {
return value.word;
}
});
//3. reduce step
//reduce maintains an accumulator
//the first record for a key becomes the accumulator and is emitted
//each following record is combined with the accumulator, which is then emitted again
//the accumulator has the same type as the stream elements
//operator parallelism set to 2
SingleOutputStreamOperator<Example2.WordWithCount> result = keyedStream
.reduce(new ReduceFunction<Example2.WordWithCount>() {
//
@Override
public Example2.WordWithCount reduce(Example2.WordWithCount value1, Example2.WordWithCount value2) throws Exception {
return new Example2.WordWithCount(value1.word, value1.count + value2.count);
}
}).setParallelism(2);
//print the result
//sink parallelism set to 1
result.print().setParallelism(1);
/*
WordWithCount{word='hello', count=1}
WordWithCount{word='world', count=1}
WordWithCount{word='hello', count=2}
WordWithCount{word='world', count=2}
*/
//run the job
env.execute();
}
}

View File

@ -0,0 +1,38 @@
package day01.scala
//the Flink Scala API import brings in the implicit TypeInformation needed by the lambdas below
import org.apache.flink.streaming.api.scala._
object Example_scala1 {
def main(args: Array[String]): Unit = {
//TODO set up the environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
//TODO read the data source
val stream = env.socketTextStream("localhost", 9999)
//word count: split each line into words, wrap each word, key by word and reduce the counts
stream
.flatMap(line => line.split(" "))
.map(word => WordWithCount(word, 1L))
.keyBy(_.word)
.reduce((a, b) => WordWithCount(a.word, a.count + b.count))
.print()
env.execute()
}
case class WordWithCount(var word:String,var count:Long)
}

View File

@ -0,0 +1,99 @@
package day02;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* A custom data source
*/
public class Example1 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream.print();
env.execute();
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event>{
private boolean running = true;
private String[] userArr ={"Mary","Bob","Alice","liz"};
private String[] urlArr = {"./home","./cart","./fav","./prod?id=1","prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//emit data downstream
while(running){
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event{
public String user;
public String url;
public Long timestamp;
public Event(){
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,165 @@
package day02;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* MAP
*/
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Using a lambda expression
env
.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random = new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running) {
ctx.collect(random.nextInt(1000));
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
})
//With a lambda expression Java 8 cannot infer the return type,
.map(r -> Tuple2.of(r,r))
//because type erasure turns it into Tuple2<Object,Object>,
//so returns(...) is needed to declare the output type of the map function
.returns(Types.TUPLE(Types.INT,Types.INT))
.print();
//Using an anonymous inner class (kept commented out below)
// env
// .addSource(new SourceFunction<Integer>() {
//
// private boolean running = true;
// private Random random = new Random();
//
//
// @Override
// public void run(SourceContext<Integer> ctx) throws Exception {
//
// while (running) {
// ctx.collect(random.nextInt(1000));
// Thread.sleep(1000L);
// }
//
// }
//
// @Override
// public void cancel() {
// running = false;
//
// }
// })
// .map(new MapFunction<Integer, Tuple2<Integer, Integer>>() {
// @Override
// public Tuple2<Integer, Integer> map(Integer value) throws Exception {
// return Tuple2.of(value, value);
// }
// }).print();
//Using a named outer class
env
.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random = new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running) {
ctx.collect(random.nextInt(1000));
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
})
.map(new MyMap()).print();
//The same mapping done with flatMap
env
.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random = new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running) {
ctx.collect(random.nextInt(1000));
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
})
.flatMap(new FlatMapFunction<Integer, Tuple2<Integer, Integer>>() {
@Override
public void flatMap(Integer value, Collector<Tuple2<Integer, Integer>> collector) throws Exception {
collector.collect((Tuple2.of(value,value)));
}
})
.print();
env.execute();
}
public static class MyMap implements MapFunction<Integer,Tuple2<Integer,Integer>>{
@Override
public Tuple2<Integer, Integer> map(Integer value) throws Exception {
return Tuple2.of(value,value);
}
}
}

View File

@ -0,0 +1,132 @@
package day02;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* FILTER
*/
public class Example3 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Example1.Event> stream = env.addSource(new ClickSource());
stream.filter(r -> r.user.equals("Mary")).print();
stream
.filter(new FilterFunction<Example1.Event>() {
@Override
public boolean filter(Example1.Event value) throws Exception {
return value.user.equals("Mary");
}
}).print();
stream
.filter(new MyFilter())
.print();
stream
.flatMap(new FlatMapFunction<Example1.Event, Example1.Event>() {
@Override
public void flatMap(Example1.Event value, Collector<Example1.Event> collector) throws Exception {
if(value.user.equals("Mary")) collector.collect(value);
}
}).print();
env.execute();
}
public static class MyFilter implements FilterFunction<Example1.Event> {
@Override
public boolean filter(Example1.Event value) throws Exception {
return value.user.equals("Mary");
}
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<day02.Example1.Event>{
private boolean running = true;
private String[] userArr ={"Mary","Bob","Alice","liz"};
private String[] urlArr = {"./home","./cart","./fav","./prod?id=1","prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<day02.Example1.Event> ctx) throws Exception {
//emit data downstream
while(running){
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new day02.Example1.Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event{
public String user;
public String url;
public Long timestamp;
public Event(){
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,53 @@
package day02;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* FlatMap
*/
public class Example4 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<String> stream = env.fromElements("white", "black", "gray");
stream
.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String value, Collector<String> collector) throws Exception {
if(value.equals("white")){
collector.collect(value);
}else if(value.equals("black")){
collector.collect(value);
collector.collect(value);
}
}
})
.print();
stream.flatMap(
(String value,Collector<String> collector) ->{
if(value.equals("white")){
collector.collect(value);
}else if(value.equals("black")){
collector.collect(value);
collector.collect(value);
}
}
)
.returns(Types.STRING);
env.execute();
}
}

View File

@ -0,0 +1,45 @@
package day02;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* sum: a rolling aggregation
*/
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Tuple2<Integer, Integer>> stream = env
.fromElements(
Tuple2.of(1, 2),
Tuple2.of(1, 3)
);
//keyed stream
KeyedStream<Tuple2<Integer, Integer>, Integer> keyedStream = stream.keyBy(r -> r.f0);
keyedStream.sum(1).print();
//reduce is the generalization of rolling aggregations such as sum
keyedStream.reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
return Tuple2.of(value1.f0, value1.f1 + value2.f1);
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,66 @@
package day02;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* Computes a running average of the random integers
*/
public class Example6 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new SourceFunction<Integer>() {
private boolean running =true;
private Random random =new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running){
ctx.collect(random.nextInt(10));
Thread.sleep(100L);
}
}
@Override
public void cancel() {
running=false;
}
})
.map(r -> Tuple2.of(r,1))
.returns(Types.TUPLE(Types.INT,Types.INT))
.keyBy(r ->true)
.reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
return Tuple2.of(value1.f0+value2.f0,value1.f1+value2.f1);
}
})
.map(new MapFunction<Tuple2<Integer, Integer>, Double>() {
@Override
public Double map(Tuple2<Integer, Integer> value) throws Exception {
return (double)value.f0/value.f1;
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,42 @@
package day02;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* shuffle
*/
public class Example7 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//shuffle: distributes records to downstream tasks at random
env
.fromElements(1,2,3,4).setParallelism(1)
.shuffle()
.print("shuffle:").setParallelism(2);
//rebalance: even distribution, implemented as round-robin
env
.fromElements(1,2,3,4).setParallelism(1)
.rebalance()
.print("rebalance:").setParallelism(2);
//broadcast: every record is copied to both downstream partitions
env
.fromElements(1,2,3,4).setParallelism(1)
.broadcast()
.print("broadcast:").setParallelism(2);
env.execute();
}
}

View File

@ -0,0 +1,51 @@
package day03;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* Rich functions
*/
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.fromElements(1, 2, 3)
.map(new RichMapFunction<Integer, Integer>() {
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
System.out.println("生命周期开始了");
System.out.println("当前子任务的索引是:" + getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public Integer map(Integer value) throws Exception {
return value * value;
}
@Override
public void close() throws Exception {
super.close();
System.out.println("生命周期结束");
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,59 @@
package day03;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env
.addSource(new RichParallelSourceFunction<Integer>() {
@Override
public void open(Configuration parameters) throws Exception {
System.out.println("生命周期开始,子任务索引是:"+getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
for (int i = 0; i < 10; i++) {
if(i%2 ==getRuntimeContext().getIndexOfThisSubtask()){
ctx.collect(i);
}
}
}
@Override
public void cancel() {
}
})
.setParallelism(2)
.print()
.setParallelism(2);
/*
Lifecycle started, subtask index: 0
Lifecycle started, subtask index: 1
2> 1
2> 3
2> 5
2> 7
2> 9
1> 0
1> 2
1> 4
1> 6
1> 8
*/
env.execute();
}
}

View File

@ -0,0 +1,31 @@
package day03;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
/**
* A custom sink
*/
public class Example3 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.fromElements(1,2,3,4)
.addSink(new SinkFunction<Integer>() {
//invoke is called once for every record that arrives
@Override
public void invoke(Integer value, Context context) throws Exception {
SinkFunction.super.invoke(value,context);
System.out.println(value);
}
});
env.execute();
}
}
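The SinkFunction above has no lifecycle hooks. Real sinks usually extend RichSinkFunction so that open() and close() can manage external connections; the following is a minimal sketch (the class name Example3Rich and the buffering behaviour are additions, not part of the commit) that collects values in invoke() and flushes them in close().
package day03;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
public class Example3Rich {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.fromElements(1, 2, 3, 4)
            .addSink(new RichSinkFunction<Integer>() {
                private transient StringBuilder buffer;
                @Override
                public void open(Configuration parameters) throws Exception {
                    // runs once per parallel sink instance; open JDBC/Kafka connections here
                    buffer = new StringBuilder();
                }
                @Override
                public void invoke(Integer value, Context context) throws Exception {
                    // called once per record
                    buffer.append(value).append(' ');
                }
                @Override
                public void close() throws Exception {
                    // runs when the task shuts down; flush and release resources here
                    System.out.println("flushed on close: " + buffer);
                }
            });
        env.execute();
    }
}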

View File

@ -0,0 +1,64 @@
package day03;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
/**
* A simple KeyedProcessFunction example
*/
public class Example4 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.socketTextStream("localhost",9999)
.keyBy(r -> 1)
.process( new MyKeyed())
.print();
env.execute();
}
/**
* Extends KeyedProcessFunction<KEY, IN, OUT>
* KEY: 1, an Integer
* IN: the data arriving from the socket, a String
* OUT: the output data, a String
*/
public static class MyKeyed extends KeyedProcessFunction<Integer,String,String>{
//this method is called every time an element arrives
@Override
public void processElement(String value, Context ctx, Collector<String> out) throws Exception {
//current machine (processing) time
long ts = ctx.timerService().currentProcessingTime();
out.collect("Element " + value + " arrived at " + new Timestamp(ts));
//register a timer for 10 seconds from now
long tenSecLater = ts +10*1000L;
out.collect("Registered a timer for " + new Timestamp(tenSecLater));
//note that this registers a processing-time (machine-time) timer
ctx.timerService().registerProcessingTimeTimer(tenSecLater);
}
//onTimer runs when the timer fires and may also emit data downstream
//timers are part of keyed state: every key has its own timers
//each key can register its own timers
//for a given key, only one timer can exist per timestamp
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
super.onTimer(timestamp, ctx, out);
out.collect("定时器触发了!触发事件是:" +new Timestamp(timestamp));
}
}
}
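Because the stream above is keyed by the constant 1, it cannot show that timers are scoped per key. The hypothetical variant below (the class name Example4PerKey is an addition, not part of the commit) keys each socket line by its own value, so every distinct word maintains its own timers.
package day03;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
public class Example4PerKey {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env
            .socketTextStream("localhost", 9999)
            .keyBy(r -> r) // key by the line itself, so every distinct word gets its own timers
            .process(new KeyedProcessFunction<String, String, String>() {
                @Override
                public void processElement(String value, Context ctx, Collector<String> out) throws Exception {
                    long ts = ctx.timerService().currentProcessingTime();
                    // registering the same timestamp twice for the same key is a no-op:
                    // at most one timer per key and timestamp
                    ctx.timerService().registerProcessingTimeTimer(ts + 10 * 1000L);
                    out.collect("key " + ctx.getCurrentKey() + " registered a timer for " + new Timestamp(ts + 10 * 1000L));
                }
                @Override
                public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                    out.collect("timer for key " + ctx.getCurrentKey() + " fired at " + new Timestamp(timestamp));
                }
            })
            .print();
        env.execute();
    }
}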

View File

@ -0,0 +1,113 @@
package day03;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.Random;
/**
* State variables
*/
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random =new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while(running){
ctx.collect(random.nextInt(10));
Thread.sleep(1000);
}
}
@Override
public void cancel() {
running =false;
}
})
.keyBy( r -> true)
.process(new KeyedProcessFunction<Boolean, Integer, Double>() {
//declare a state variable that serves as the accumulator
//the scope (visibility) of a state variable is the current key
//a state variable is a singleton: it is only instantiated once
private ValueState<Tuple2<Integer,Integer>> valueState;
//holds the timestamp of the registered timer
private ValueState<Long> timerTs;
//initialize the state variables
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//instantiate the state variable
valueState = getRuntimeContext().getState(
//the state descriptor is what identifies this state in a checkpoint
//ValueStateDescriptor: the state descriptor
new ValueStateDescriptor<Tuple2<Integer, Integer>>("sum-count", Types.TUPLE(Types.INT,Types.INT))
);
timerTs=getRuntimeContext().getState(
new ValueStateDescriptor<Long>("timer",Types.LONG)
);
}
@Override
public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {
//when the first record arrives the state variable is still null
//read the state with .value() and write it with .update()
if(valueState.value() == null){
valueState.update(Tuple2.of(value,1));
}else {
Tuple2<Integer, Integer> tmp = valueState.value();
valueState.update(Tuple2.of(tmp.f0+value,tmp.f1+1));
}
if(timerTs.value() == null){
long tenSecLater =ctx.timerService().currentProcessingTime()+10*1000L;
ctx.timerService().registerProcessingTimeTimer(tenSecLater);
timerTs.update(tenSecLater);
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<Double> out) throws Exception {
if(valueState.value() != null){
out.collect((double)valueState.value().f0/valueState.value().f1);
timerTs.clear();
}
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,105 @@
package day03;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.Random;
/**
* Detects when the integers keep rising for one continuous second
*/
public class Example6 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Integer> stream = env.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random = new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running) {
ctx.collect(random.nextInt());
Thread.sleep(300L);
}
}
@Override
public void cancel() {
running = false;
}
});
stream
.keyBy(r -> 1)
.process(new KeyedProcessFunction<Integer, Integer, String>() {
//two state variables (initialized in open())
private ValueState<Integer> lastInt; //the last integer seen
private ValueState<Long> timerTs; //timestamp of the registered timer
@Override
public void open(Configuration parameters) throws Exception {
lastInt = getRuntimeContext().getState(
new ValueStateDescriptor<Integer>("last-integer", Types.INT)
);
timerTs = getRuntimeContext().getState(
new ValueStateDescriptor<Long>("timer", Types.LONG)
);
}
@Override
public void processElement(Integer value, Context ctx, Collector<String> out) throws Exception {
Integer prevInt = null;
if (lastInt.value() != null) {
prevInt = lastInt.value();
}
lastInt.update(value);
Long ts = null;
if (timerTs.value() != null) {
ts = timerTs.value();
}
if (prevInt == null || value < prevInt) {
if (ts != null) {
ctx.timerService().deleteProcessingTimeTimer(ts);
timerTs.clear();
}
} else if (value > prevInt && ts == null) {
long oneSecLater = ctx.timerService().currentProcessingTime() + 1000L;
ctx.timerService().registerProcessingTimeTimer(oneSecLater);
timerTs.update(oneSecLater);
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
out.collect("整数连续1S上升了");
timerTs.clear();
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,79 @@
package day03;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.Random;
/**
* Computes a running average with a list state variable
*/
public class Example7 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Integer> stream = env.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random = new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while (running) {
ctx.collect(random.nextInt(10));
Thread.sleep(300L);
}
}
@Override
public void cancel() {
running = false;
}
});
stream
.keyBy(r -> 1)
.process(new KeyedProcessFunction<Integer, Integer, Double>() {
//a list state keeps every value ever seen, so it can use a lot of memory
private ListState<Integer> listState;
@Override
public void open(Configuration parameters) throws Exception {
listState =getRuntimeContext().getListState(
new ListStateDescriptor<Integer>("list-state", Types.INT)
);
}
@Override
public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {
listState.add(value);
Integer sum = 0;
Integer count = 0;
for (Integer i : listState.get()) {
sum += i;
count +=1;
}
out.collect((double) sum /count);
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,131 @@
package day03;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* Map (dictionary) state variables
*/
public class Example8 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream
.keyBy(r -> 1)
.process(new KeyedProcessFunction<Integer, Event, String>() {
private MapState<String, Long> mapState;
@Override
public void open(Configuration parameters) throws Exception {
mapState = getRuntimeContext().getMapState(
new MapStateDescriptor<String, Long>("map", Types.STRING, Types.LONG)
);
}
@Override
public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
if (mapState.contains(value.user)) {
mapState.put(value.user, mapState.get(value.user) + 1L);
} else {
mapState.put(value.user, 1L);
}
//compute the average pv per user
long userNum = 0L;
long pvSum = 0L;
for (String user : mapState.keys()) {
userNum += 1L;
pvSum += mapState.get(user);
}
out.collect("当前pv的平均值是" + (double) pvSum / userNum);
}
})
.print();
env.execute();
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//emit data downstream
while (running) {
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,127 @@
package day03;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* Computes each user's pv within every 5-second window
*/
public class Example9 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream
.keyBy(r -> r.user)
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
.process(new WindowResult())
.print();
env.execute();
}
/**
* Extends ProcessWindowFunction<IN, OUT, KEY, Window>
*/
public static class WindowResult extends ProcessWindowFunction<Event, String, String, TimeWindow> {
//called when the window closes
//note that this approach keeps all of a window's elements, which slows things down; the accumulator approach (see the aggregate examples in day04) is the optimization
@Override
public void process(String key, Context context, Iterable<Event> iterable, Collector<String> collector) throws Exception {
//the iterable parameter contains all elements of the window
long windowStart =context.window().getStart();
long windowEnd = context.window().getEnd();
long count = iterable.spliterator().getExactSizeIfKnown(); //number of elements in the iterable
collector.collect("user: " + key + " pv count in window "
+ new Timestamp(windowStart) + " ~ " + new Timestamp(windowEnd)
+ " is " + count);
}
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//emit data downstream
while (running) {
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,115 @@
package day03.selftry;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.Random;
/**
* State variables (practice copy)
*/
public class Example5_try {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new SourceFunction<Integer>() {
private boolean running = true;
private Random random =new Random();
@Override
public void run(SourceContext<Integer> ctx) throws Exception {
while(running){
ctx.collect(random.nextInt(10));
Thread.sleep(1000);
}
}
@Override
public void cancel() {
running =false;
}
})
.keyBy( r -> true)
.process(new KeyedProcessFunction<Boolean, Integer, Double>() {
//declare a state variable that serves as the accumulator
//the scope (visibility) of a state variable is the current key
//a state variable is a singleton: it is only instantiated once
private ValueState<Tuple2<Integer,Integer>> valueState;
//holds the timestamp of the registered timer
private ValueState<Long> timerTs;
//initialize the state variables
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//instantiate the state variable
valueState = getRuntimeContext().getState(
//the state descriptor is what identifies this state in a checkpoint
//ValueStateDescriptor: the state descriptor
new ValueStateDescriptor<Tuple2<Integer, Integer>>("sum-count", Types.TUPLE(Types.INT,Types.INT))
);
timerTs=getRuntimeContext().getState(
new ValueStateDescriptor<Long>("timer",Types.LONG)
);
}
@Override
public void processElement(Integer value, Context ctx, Collector<Double> out) throws Exception {
//when the first record arrives the state variable is still null
//read the state with .value() and write it with .update()
if(valueState.value() == null){
valueState.update(Tuple2.of(value,1));
}else {
Tuple2<Integer, Integer> tmp = valueState.value();
valueState.update(Tuple2.of(tmp.f0+value,tmp.f1+1));
}
if(timerTs.value() == null){
long tenSecLater =ctx.timerService().currentProcessingTime()+10*1000L;
ctx.timerService().registerProcessingTimeTimer(tenSecLater);
timerTs.update(tenSecLater);
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<Double> out) throws Exception {
if(valueState.value() != null){
out.collect((double)valueState.value().f0/valueState.value().f1);
timerTs.clear();
}
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,154 @@
package day03.selftry;
import day03.Example9;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Time;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* Computes each user's pv within every 5-second window, using a KeyedProcessFunction
* Still has issues: the 5-second interval starts at a key's first event rather than at aligned window boundaries
*/
public class Example9_try {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream
.keyBy(r -> r.user)
.process(new KeyedProcessFunction<String, Event, String>() {
private ValueState<Tuple2<String, Integer>> count;
private ValueState<Long> timeTs;
@Override
public void open(Configuration parameters) throws Exception {
count = getRuntimeContext().getState(
new ValueStateDescriptor<Tuple2<String, Integer>>("count", Types.TUPLE(Types.STRING, Types.INT)));
timeTs = getRuntimeContext().getState(
new ValueStateDescriptor<Long>("timer", Types.LONG)
);
}
@Override
public void processElement(Event result, Context ctx, Collector<String> collector) throws Exception {
if (count.value() == null) {
count.update(Tuple2.of(result.user, 1));
} else {
Tuple2<String, Integer> value = count.value();
count.update(Tuple2.of(result.user, value.f1 + 1));
}
if (timeTs.value() == null) {
long tenSecLater = ctx.timerService().currentProcessingTime() + 5 * 1000L-1L;
ctx.timerService().registerProcessingTimeTimer(tenSecLater);
timeTs.update(tenSecLater);
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
if (count.value() != null) {
out.collect("pv count of user " + count.value().f0 + " from " + new Timestamp(timeTs.value()) + " to " + new Timestamp(timeTs.value()+ 5 * 1000L) + " is " + count.value().f1);
timeTs.clear();
count.clear();
}
}
})
.print();
env.execute();
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//emit data downstream
while (running) {
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,146 @@
package day04;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.List;
import java.util.Random;
/**
* Incremental aggregation function
* Computes each user's pv per 5-second window,
* but has no access to the window metadata
*/
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream
.keyBy(r -> r.user)
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
.aggregate(new CountAgg())
.print();
env.execute();
}
/**
* Implements the AggregateFunction<IN, accumulator, OUT> interface
*/
public static class CountAgg implements AggregateFunction<Event,Integer,Integer> {
//create the accumulator
@Override
public Integer createAccumulator() {
return 0;
}
//define how one element is added to the accumulator
@Override
public Integer add(Event event, Integer accumulator) {
return accumulator+1;
}
//return the result when the window closes
@Override
public Integer getResult(Integer accumulator) {
return accumulator;
}
//merge is only needed when windows merge (e.g. session windows); unused here
@Override
public Integer merge(Integer integer, Integer acc1) {
return null;
}
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//emit data downstream
while (running) {
//ctx: the source context
//collect emits a record downstream
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* A custom POJO class
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,156 @@
package day04;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
* Combines an incremental aggregation function with a full-window function
* pv per user per 5-second window
*/
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> stream = env.addSource(new ClickSource());
stream
.keyBy(r -> r.user)
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
//wrap the incremental aggregate so its result is enriched with window metadata
.aggregate(new CountAgg(),new WindowResult())
.print();
env.execute();
}
//the input type here is the output type of the incremental aggregation function
public static class WindowResult extends ProcessWindowFunction<Integer,String,String, TimeWindow>{
@Override
public void process(String key, Context context, Iterable<Integer> iterable, Collector<String> collector) throws Exception {
//called when the window closes
//the iterable contains exactly one element: the result sent by the incremental aggregation function
long windowStart =context.window().getStart();
long windowEnd = context.window().getEnd();
long count = iterable.iterator().next(); //take that single element
collector.collect("user: " + key + " pv count in window "
+ new Timestamp(windowStart) + " ~ " + new Timestamp(windowEnd)
+ " is " + count);
}
}
/**
* Implements the AggregateFunction<IN, accumulator, OUT> interface
*/
public static class CountAgg implements AggregateFunction<Event,Integer,Integer> {
//create the accumulator
@Override
public Integer createAccumulator() {
return 0;
}
//define how one element is added to the accumulator
@Override
public Integer add(Event event, Integer accumulator) {
return accumulator+1;
}
//return the result when the window closes
@Override
public Integer getResult(Integer accumulator) {
return accumulator;
}
//merge is only needed when windows merge (e.g. session windows); unused here
@Override
public Integer merge(Integer integer, Integer acc1) {
return null;
}
}
//A SourceFunction always runs with parallelism 1
//For a parallel custom source, implement ParallelSourceFunction instead
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//向下游发送数据
while (running) {
//ctx上下文对象
//collect方法,向下游发送数据
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* 自定义POJO类
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,168 @@
package day04;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
 * Uses a KeyedProcessFunction to emulate a 5-second tumbling window, mimicking the
 * combination of an incremental aggregate function with a full-window function
 */
public class Example3 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new ClickSource())
.keyBy(r ->r.user)
.process( new FakeWindow())
.print();
env.execute();
}
public static class FakeWindow extends KeyedProcessFunction<String,Event,String>{
//用map模拟窗口
//key是窗口的开始时间value是窗口中的pv数值(累加器)
private MapState<Long,Integer> mapState;
//窗口大小
private Long windowSize = 5000L;
@Override
public void open(Configuration parameters) throws Exception {
mapState = getRuntimeContext().getMapState(
new MapStateDescriptor<Long, Integer>("windowStart-pvCount",Types.LONG, Types.INT)
);
}
@Override
public void processElement(Event event, Context ctx, Collector<String> collector) throws Exception {
//计算当前元素所属的窗口的开始时间
long currTime =ctx.timerService().currentProcessingTime();
//计算窗口开始时间的公式
long windowStart = currTime - currTime % windowSize;
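            //e.g. with windowSize = 5000 ms: currTime 7300 -> windowStart 5000, windowEnd 10000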
long windowEnd = windowStart +windowSize;
if(mapState.contains(windowStart)){
//之前已经来过数据了
mapState.put(windowStart,mapState.get(windowStart)+1);
}else {
//之前没有来过元素
mapState.put(windowStart,1);
}
//注册一个定时器
ctx.timerService().registerProcessingTimeTimer(windowEnd-1L);
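            //the timer fires 1 ms before the window end, i.e. exactly when the window [start, end) closes;
            //re-registering the same timestamp for the same key is deduplicated by Flink, so doing this per element is cheap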
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
//timestamp就是触发这个方法的时间即windowEnd-1L
long windowEnd = timestamp +1L;
long windowStart = windowEnd - windowSize;
int count = mapState.get(windowStart);
out.collect("用户:"+ctx.getCurrentKey()+"在窗口"
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
+""+"中的pv次数是"+count);
mapState.remove(windowStart);
}
}
//sourceFunction并行度只能为1
//自定义并行化版本的数据源需要使用ParallelSourceFunction
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//向下游发送数据
while (running) {
//ctx上下文对象
//collect方法,向下游发送数据
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* 自定义POJO类
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,76 @@
package day04;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
/**
 * Watermark test: bounded-out-of-orderness watermarks with a 5-second event-time tumbling window
 */
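//Assumed sample session (not part of the original source): run `nc -lk 9999` and type
//  a 1
//  a 12
//the second line raises the watermark to 12000 - 5000 - 1 = 6999 ms, which closes the
//window [0, 5000) and prints a pv count of 1 for key "a".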
public class Example4 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
// 'a 1'
.socketTextStream("localhost",9999)
// (a , 1000L)
.map(new MapFunction<String, Tuple2<String,Long>>() {
@Override
                    public Tuple2<String, Long> map(String value) throws Exception {
String[] arr = value.split(" ");
return Tuple2.of(arr[0],Long.parseLong(arr[1]) * 1000L);
}
})
//抽取时间戳,分配水位线
//默认每隔200ms的机器时间插入一次水位线
.assignTimestampsAndWatermarks(
                        //maximum out-of-orderness set to 5 seconds (the watermark lags the largest timestamp by 5 s)
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
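                        //watermark = max event timestamp seen so far - 5 s - 1 ms, so the
                        //[0, 5000) window only fires once an element with timestamp >= 10 s arrives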
//时间戳字段
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
return element.f1; //告诉flink事件时间是哪一个字段
}
})
)
.keyBy(r ->r.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(5))) //5秒的时间滚动窗口
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
@Override
public void process(String key, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
long windowStart =context.window().getStart();
long windowEnd = context.window().getEnd();
long count = iterable.spliterator().getExactSizeIfKnown(); //迭代器里面共多少条元素
collector.collect("用户:"+key+"在窗口"
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
+""+"中的pv次数是"+count);
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,66 @@
package day04;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
/**
 * Watermark test: registering event-time timers against the current watermark
 */
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.socketTextStream("localhost",9999)
.map(r -> Tuple2.of(r.split(" ")[0],Long.parseLong(r.split(" ")[1])*1000L))
.returns(Types.TUPLE(Types.STRING,Types.LONG))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
return element.f1;
}
})
)
.keyBy(r -> r.f0)
.process(new KeyedProcessFunction<String, Tuple2<String, Long>, String>() {
@Override
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
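                        //note: this prints the watermark as it was *before* this element was processed;
                        //the very first element therefore typically shows Long.MIN_VALUE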
collector.collect("当前的水位线是:"+ctx.timerService().currentWatermark());
ctx.timerService().registerEventTimeTimer(value.f1+5000L);
collector.collect("注册了一个时间戳是:"+new Timestamp(value.f1+5000L)+"的定时器");
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
out.collect("定时器触发了!");
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,79 @@
package day04;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
/**
 * Watermark test with a custom (1-minute) auto-watermark interval
 */
public class Example6 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
        //emit a watermark only once per minute (instead of the default 200 ms)
env.getConfig().setAutoWatermarkInterval(60*1000L);
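        //with a 1-minute interval the watermark (and therefore every event-time window result)
        //only advances once per minute, so output can lag the input by up to a minute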
env
// 'a 1'
.socketTextStream("localhost",9999)
// (a , 1000L)
.map(new MapFunction<String, Tuple2<String,Long>>() {
@Override
                    public Tuple2<String, Long> map(String value) throws Exception {
String[] arr = value.split(" ");
return Tuple2.of(arr[0],Long.parseLong(arr[1]) * 1000L);
}
})
//抽取时间戳,分配水位线
//默认每隔200ms的机器时间插入一次水位线
.assignTimestampsAndWatermarks(
                        //maximum out-of-orderness set to 0 seconds here (the watermark tracks the largest timestamp directly)
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0))
//时间戳字段
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
return element.f1; //告诉flink事件时间是哪一个字段
}
})
)
.keyBy(r ->r.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(5))) //5秒的时间滚动窗口
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
@Override
public void process(String key, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
long windowStart =context.window().getStart();
long windowEnd = context.window().getEnd();
long count = iterable.spliterator().getExactSizeIfKnown(); //迭代器里面共多少条元素
collector.collect("用户:"+key+"在窗口"
+""+new Timestamp(windowStart)+"~"+new Timestamp(windowEnd)
+""+"中的pv次数是"+count);
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,240 @@
package day04;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
/**
 * Which items are the most popular in each window (TopN over sliding windows)
 */
public class Example7 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String value) throws Exception {
String[] arr = value.split(",");
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
}
})
.filter(r -> r.behavior.equals("pv"))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
@Override
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
return element.timeStamp;
}
})
)
.keyBy( r -> r.itemId)
.window(SlidingEventTimeWindows.of(Time.hours(1),Time.minutes(5)))
.aggregate(new CountAgg(),new WindowResult())
.keyBy(r ->r.windowEnd)
.process(new TopN(3))
.print();
env.execute();
}
    /**
     * Ranks the TopN items within each window (keyed by window end time)
     */
public static class TopN extends KeyedProcessFunction<Long,ItemViewCount,String>{
private ListState<ItemViewCount> listState;
private Integer n;
public TopN(Integer n) {
this.n = n;
}
@Override
public void open(Configuration parameters) throws Exception {
listState = getRuntimeContext().getListState(
new ListStateDescriptor<ItemViewCount>("list-state", Types.POJO(ItemViewCount.class)));
}
@Override
public void processElement(ItemViewCount value, Context ctx, Collector<String> collector) throws Exception {
listState.add(value);
ctx.timerService().registerEventTimeTimer(value.windowEnd+100L);
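            //all ItemViewCount records for this windowEnd are emitted by the upstream window operator
            //before any watermark past the window end reaches this operator, so a timer shortly after
            //windowEnd guarantees the list is complete; duplicate registrations collapse into one timer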
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
ArrayList<ItemViewCount> itemViewCountArrayList =new ArrayList<>();
for (ItemViewCount ivc : listState.get()) {
itemViewCountArrayList.add(ivc);
}
listState.clear();
itemViewCountArrayList.sort(new Comparator<ItemViewCount>() {
@Override
public int compare(ItemViewCount t1, ItemViewCount t2) {
return t2.count.intValue() -t1.count.intValue();
}
});
StringBuilder result = new StringBuilder();
result
.append("=====================================\n")
.append("窗口结束时间:"+new Timestamp(timestamp-1L))
.append("\n");
            for (int i = 0; i < Math.min(n, itemViewCountArrayList.size()); i++) { //guard against windows with fewer than n items
ItemViewCount curr = itemViewCountArrayList.get(i);
result
.append(""+(i+1)+"名的商品id是:"+curr.itemId)
.append(",浏览次数是:"+curr.count)
.append("\n");
}
result
.append("=====================================\n\n");
out.collect(result.toString());
}
}
    /**
     * Full-window function: wraps the pre-aggregated count with window metadata
     */
public static class WindowResult extends ProcessWindowFunction<Long,ItemViewCount,String, TimeWindow>{
@Override
public void process(String key, Context context, Iterable<Long> elements, Collector<ItemViewCount> collector) throws Exception {
collector.collect(new ItemViewCount(key,elements.iterator().next(),context.window().getStart(),context.window().getEnd()));
}
}
    /**
     * Incremental aggregate function: counts events per item
     */
public static class CountAgg implements AggregateFunction<UserBehavior,Long,Long>{
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(UserBehavior value, Long accumulator) {
return accumulator+1L;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long aLong, Long acc1) {
            return aLong + acc1; //only called by merging (session) window assigners; return the sum to stay correct there
}
}
/**
* 商品视图POJO类
* 每个商品在每个窗口中的浏览次数
*/
public static class ItemViewCount{
public String itemId;
public Long count;
public Long windowStart;
public Long windowEnd;
public ItemViewCount() {
}
public ItemViewCount(String itemId, Long count, Long windowStart, Long windowEnd) {
this.itemId = itemId;
this.count = count;
this.windowStart = windowStart;
this.windowEnd = windowEnd;
}
@Override
public String toString() {
return "ItemViewCount{" +
"itemId='" + itemId + '\'' +
", count=" + count +
", windowStart=" + new Timestamp(windowStart) +
", windowEnd=" + new Timestamp(windowEnd) +
'}';
}
}
/**
* 用户行为POJO类
*/
public static class UserBehavior{
public String userId;
public String itemId;
public String categoryId;
public String behavior;
public Long timeStamp;
public UserBehavior(){
}
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timeStamp = timeStamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId='" + userId + '\'' +
", itemId='" + itemId + '\'' +
", categoryId='" + categoryId + '\'' +
", behavior='" + behavior + '\'' +
", timeStamp=" + new Timestamp(timeStamp) +
'}';
}
}
}

View File

@ -0,0 +1,147 @@
package day04.selftry;
import day04.Example3;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
 * Re-implements the incremental + full-window combination without MapState (ValueState only)
 */
public class Example3_try {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new ClickSource())
.keyBy(r -> r.user)
.process(new KeyedProcessFunction<String, Event, String>() {
private ValueState<Integer> valueState;
private Long windowSize = 5000L;
@Override
public void open(Configuration parameters) throws Exception {
valueState = getRuntimeContext().getState(
new ValueStateDescriptor<Integer>("count", Types.INT)
);
}
@Override
public void processElement(Event event, Context context, Collector<String> collector) throws Exception {
//获取现在的时间戳及其窗口时间
long currTime = context.timerService().currentProcessingTime();
long startWindow = currTime - currTime % windowSize;
long endWindow = startWindow+windowSize;
if(valueState.value() == null){
valueState.update(1);
}else {
valueState.update(valueState.value()+1);
}
//注册定时器
context.timerService().registerProcessingTimeTimer(endWindow-1L);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
long startWindow = timestamp - windowSize + 1L;
long endWindow = timestamp + 1L;
Integer count = valueState.value();
out.collect("用户:"+ctx.getCurrentKey()+"窗口"+new Timestamp(startWindow)+"~"+new Timestamp(endWindow)
+"中的pv次数是"+count);
valueState.clear();
}
})
.print();
env.execute();
}
//sourceFunction并行度只能为1
//自定义并行化版本的数据源需要使用ParallelSourceFunction
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr = {"Mary", "Bob", "Alice", "liz"};
private String[] urlArr = {"./home", "./cart", "./fav", "./prod?id=1", "prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//向下游发送数据
while (running) {
//ctx上下文对象
//collect方法,向下游发送数据
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* 自定义POJO类
*/
public static class Event {
public String user;
public String url;
public Long timestamp;
public Event() {
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,74 @@
package day05;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkGenerator;
import org.apache.flink.api.common.eventtime.WatermarkGeneratorSupplier;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * What counts as a late element
 */
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
stream
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps() //最大延迟时间是0
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long l) {
return element.f1;
}
})
)
.process(
new ProcessFunction<Tuple2<String, Long>, String>() {
@Override
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
if (value.f1 < ctx.timerService().currentWatermark()) {
collector.collect("迟到元素迟到了:" + value);
} else {
collector.collect(value + "元素没有迟到");
}
}
}
)
.print();
env.execute();
}
}

View File

@ -0,0 +1,71 @@
package day05;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
/**
 * Send late data to a side output stream
 * (redirecting it instead of dropping it)
 */
public class Example2 {
//定义侧输出流的名字:侧输出标签
private static OutputTag<String> lateElement = new OutputTag<String>("late-element"){};
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<String> result = env
//自定义一个数据源
.addSource(new SourceFunction<Tuple2<String, Long>>() {
@Override
public void run(SourceContext<Tuple2<String, Long>> ctx) throws Exception {
//指定时间戳发送数据
ctx.collectWithTimestamp(Tuple2.of("hello world", 1000L), 1000L);
//发送水位线
ctx.emitWatermark(new Watermark(999L));
ctx.collectWithTimestamp(Tuple2.of("hello flink", 2000L), 2000L);
ctx.emitWatermark(new Watermark(1999L));
ctx.collectWithTimestamp(Tuple2.of("hello late", 1000L), 1000L);
}
@Override
public void cancel() {
}
})
.process(new ProcessFunction<Tuple2<String, Long>, String>() {
@Override
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
if (value.f1 < ctx.timerService().currentWatermark()) {
//发送到侧输出流
ctx.output(lateElement, "迟到元素发送到侧输出流" + value);
} else {
collector.collect("正常到达的元素:" + value);
}
}
});
result.print("主流:");
//打印侧输出流
result.getSideOutput(lateElement).print("侧输出流:");
env.execute();
}
}

View File

@ -0,0 +1,77 @@
package day05;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
/**
 * How to route late elements to a side output once a window has been applied
 */
public class Example3 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<String> result = env
.addSource(new SourceFunction<String>() {
@Override
public void run(SourceContext<String> ctx) throws Exception {
ctx.collectWithTimestamp("a", 1000L);
ctx.emitWatermark(new Watermark(999L));
ctx.collectWithTimestamp("a", 2000L);
ctx.emitWatermark(new Watermark(1999L));
ctx.collectWithTimestamp("a", 4000L);
ctx.emitWatermark(new Watermark(4999L));
//0-5秒数据窗口已关闭
ctx.collectWithTimestamp("a", 3000L); //迟到元素
}
@Override
public void cancel() {
}
})
.keyBy(r -> 1)
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
//发送迟到数据,并且迟到数据的窗口已被销毁的数据到侧输出流
.sideOutputLateData(new OutputTag<String>("late") {
})
.process(new ProcessWindowFunction<String, String, Integer, TimeWindow>() {
@Override
public void process(Integer integer, Context context, Iterable<String> element, Collector<String> collector) throws Exception {
collector.collect("窗口中共有:" + element.spliterator().getExactSizeIfKnown() + "条数据");
}
});
result.print("主输出流:");
//侧输出标签通过 id 保证是单例模式
result.getSideOutput(new OutputTag<String>("late"){}).print("侧输出流:");
/*
主输出流:> 窗口中共有:3条数据
侧输出流:> a
*/
env.execute();
}
}

View File

@ -0,0 +1,90 @@
package day05;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.time.Duration;
/**
 * Use late data to update an already-emitted window result (allowedLateness + side output)
 */
public class Example4 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
SingleOutputStreamOperator<String> result = stream
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
return element.f1;
}
})
)
.keyBy(r -> r.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
.allowedLateness(Time.seconds(5)) //允许等待迟到事件5秒
.sideOutputLateData(new OutputTag<Tuple2<String, Long>>("late") {
}) //5秒以后被销毁的数据被发送到的位置
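                //the window fires once the watermark passes the window end; each late element arriving
                //within the extra 5 s triggers the process function again, and elements arriving after
                //windowEnd + allowedLateness go to the "late" side output instead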
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
@Override
public void process(String s, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
//初始化一个窗口状态变量注意:窗口状态变量的可见范围是当前窗口
ValueState<Boolean> firstCalculate = context.windowState().getState(new ValueStateDescriptor<Boolean>("first", Types.BOOLEAN));
if (firstCalculate.value() == null) {
collector.collect("窗口第一次触发计算了!水位线是:" + context.currentWatermark() + "窗口中共有" + iterable.spliterator().getExactSizeIfKnown() + "条数据");
firstCalculate.update(true); //第一次触发process执行以后,更新为true
} else {
collector.collect("迟到数据到了,更新以后的计算结果是:" + iterable.spliterator().getExactSizeIfKnown());
}
}
});
result.print("主输出流:");
result.getSideOutput(new OutputTag<Tuple2<String,Long>>("late"){}).print("侧输出流:");
env.execute();
}
}

View File

@ -0,0 +1,81 @@
package day05;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Custom watermark generation (hand-written WatermarkStrategy)
 */
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<String> stream = env.socketTextStream("localhost", 9999);
stream
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
}
})
.assignTimestampsAndWatermarks(new CustomWatermarkGenerator())
.print();
env.execute();
}
public static class CustomWatermarkGenerator implements WatermarkStrategy<Tuple2<String, Long>> {
//用来告诉时间戳是哪一个字段
@Override
public TimestampAssigner<Tuple2<String, Long>> createTimestampAssigner(TimestampAssignerSupplier.Context context) {
return new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long l) {
return element.f1;
}
};
}
@Override
public WatermarkGenerator<Tuple2<String, Long>> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new WatermarkGenerator<Tuple2<String, Long>>() {
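                //this mirrors Flink's built-in BoundedOutOfOrdernessWatermarks: onPeriodicEmit is invoked
                //every autoWatermarkInterval (200 ms by default), and the extra -1 ms below encodes that a
                //watermark t promises no further elements with timestamp <= t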
                //maximum out-of-orderness (delay) in milliseconds
                private Long bound = 500L;
                //largest event timestamp seen so far; initialized this way to avoid underflow
                private Long maxTs = -Long.MAX_VALUE + bound + 1L;
                //called for every element: track the maximum event time observed
                @Override
                public void onEvent(Tuple2<String, Long> event, long l, WatermarkOutput watermarkOutput) {
                    maxTs = Math.max(maxTs, event.f1);
                }
                //called periodically: emit the watermark
                @Override
                public void onPeriodicEmit(WatermarkOutput watermarkOutput) {
                    //watermark = max event time seen - bound - 1 ms
                    watermarkOutput.emitWatermark(new Watermark(maxTs - bound - 1L));
}
};
}
}
}

View File

@ -0,0 +1,35 @@
package day05;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * The multi-stream merge operator: union
 * Two notes:
 * 1. it can merge two or more streams at once
 * 2. all input streams must have the same element type
 */
public class Example6 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Integer> stream1 = env.fromElements(1, 2);
DataStreamSource<Integer> stream2 = env.fromElements(3, 4);
DataStreamSource<Integer> stream3 = env.fromElements(5, 6);
DataStream<Integer> result = stream1.union(stream2, stream3);
result.print(); //3 4 5 6 1 2
env.execute();
}
}

View File

@ -0,0 +1,64 @@
package day05;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
 * Watermark propagation test across a keyed (partitioned) stream
 */
public class Example7 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.socketTextStream("localhost",9999)
.map(new MapFunction<String, Tuple2<String,Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0],Long.parseLong(arr[1])*1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long l) {
return element.f1;
}
})
)
.keyBy(r -> r.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
@Override
public void process(String s, Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
collector.collect("key:"+s+"的窗口触发了"+"当前的水位线是:"+context.currentWatermark());
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,82 @@
package day05;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
/**
 * Watermark propagation rule when streams are merged:
 * the smaller (minimum) of the input watermarks is forwarded
 */
public class Example8 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//第一条流
SingleOutputStreamOperator<Tuple2<String, Long>> stream1 = env
.socketTextStream("localhost", 9999)
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long l) {
return element.f1;
}
})
);
//第二条流
SingleOutputStreamOperator<Tuple2<String, Long>> stream2 = env
.socketTextStream("localhost", 9998)
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(" ");
return Tuple2.of(arr[0], Long.parseLong(arr[1]) * 1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> element, long l) {
return element.f1;
}
})
);
stream1.union(stream2)
.process(new ProcessFunction<Tuple2<String, Long>, String>() {
@Override
public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> collector) throws Exception {
collector.collect("当前水位线是:"+ctx.timerService().currentWatermark());
}
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,134 @@
package day05;
import day02.Example1;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.Random;
/**
 * Another two-stream join operator: connect
 * connect links exactly two streams
 * Differences from union:
 * 1. only two streams can be connected
 * 2. the two streams may have different element types
 *
 * Typical usage:
 * 1. both streams keyed (keyBy)
 * 2. one stream keyed, the other broadcast
 */
public class Example9 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Event> clickStream = env.addSource(new ClickSource());
DataStreamSource<String> queryStream = env.socketTextStream("localhost", 9999).setParallelism(1);
clickStream
.keyBy(r ->r.user)
.connect(queryStream.broadcast())
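                //broadcast() replicates every query string to all parallel instances of the co-flat-map,
                //so each keyed instance always sees the latest query (trivially so here with parallelism 1)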
//new CoFlatMapFunction<第一条流的数据类型, 第二条流的数据类型, 输出类型>
.flatMap(new CoFlatMapFunction<Event, String, Event>() {
//TODO 当第一条流元素来时进入flatMap1,第二条流元素来时进入flatMap2
private String query = "";
@Override
public void flatMap1(Event value, Collector<Event> collector) throws Exception {
if(value.url.equals(query)){
collector.collect(value);
}
}
@Override
public void flatMap2(String value, Collector<Event> collector) throws Exception {
query = value;
}
})
.print();
//Event{user='Mary', url='./cart', timestamp=2022-01-06 14:27:05.647}
//Event{user='liz', url='./cart', timestamp=2022-01-06 14:27:22.655}
env.execute();
}
//sourceFunction并行度只能为1
//自定义并行化版本的数据源需要使用ParallelSourceFunction
public static class ClickSource implements SourceFunction<Event> {
private boolean running = true;
private String[] userArr ={"Mary","Bob","Alice","liz"};
private String[] urlArr = {"./home","./cart","./fav","./prod?id=1","prod?id=2"};
private Random random = new Random();
@Override
public void run(SourceContext<Event> ctx) throws Exception {
//向下游发送数据
while(running){
//ctx上下文对象
//collect方法,向下游发送数据
ctx.collect(
new Event(
userArr[random.nextInt(userArr.length)],
urlArr[random.nextInt(urlArr.length)],
Calendar.getInstance().getTimeInMillis()
)
);
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
/**
* 自定义POJO类
*/
public static class Event{
public String user;
public String url;
public Long timestamp;
public Event(){
}
public Event(String user, String url, Long timestamp) {
this.user = user;
this.url = url;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", url='" + url + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,88 @@
package day06;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;
//Flink SQL supports SELECT * FROM A INNER JOIN B WHERE A.id = B.id;
//this involves a Cartesian product -- how is it implemented in Flink's streaming API?
/**
 * Using CoProcessFunction
 * Implements an equi inner join with the DataStream API
 */
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<Tuple2<String, Integer>> stream1 = env
.fromElements(
Tuple2.of("a", 1),
Tuple2.of("b", 2),
Tuple2.of("a", 2)
);
DataStreamSource<Tuple2<String, String>> stream2 = env
.fromElements(
Tuple2.of("a", "a"),
Tuple2.of("b", "b"),
Tuple2.of("a", "aaaa")
);
stream1
.keyBy(r -> r.f0)
.connect(stream2.keyBy(r -> r.f0))
//CoProcessFunction<第一条流的泛型, 第二条流的泛型, 输出>
.process(new CoProcessFunction<Tuple2<String, Integer>, Tuple2<String, String>, String>() {
//分别保存两条流的数据
private ListState<Tuple2<String, Integer>> listState1;
private ListState<Tuple2<String, String>> listState2;
@Override
public void open(Configuration parameters) throws Exception {
listState1 = getRuntimeContext().getListState(
new ListStateDescriptor<Tuple2<String, Integer>>("list1", Types.TUPLE(Types.STRING, Types.INT))
);
listState2 = getRuntimeContext().getListState(
new ListStateDescriptor<Tuple2<String, String>>("list2", Types.TUPLE(Types.STRING, Types.STRING))
);
}
//用来处理第一条流的数据
@Override
public void processElement1(Tuple2<String, Integer> value, Context ctx, Collector<String> collector) throws Exception {
listState1.add(value);
for (Tuple2<String, String> e : listState2.get()) {
collector.collect(value + "=>" + e);
}
}
//用来处理第二条流的数据
@Override
public void processElement2(Tuple2<String, String> value, Context ctx, Collector<String> collector) throws Exception {
listState2.add(value);
for (Tuple2<String, Integer> e : listState1.get()) {
collector.collect(e + "=>" + value);
}
}
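                    //both list states are scoped to the current key, so the two loops above produce the
                    //per-key Cartesian product, i.e. the equi inner join; note that the lists grow without
                    //bound unless they are eventually cleared (e.g. by a timer or state TTL)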
})
.print();
env.execute();
}
}

View File

@ -0,0 +1,161 @@
package day06;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;
/**
 * Real-time reconciliation of order events against WeChat payment events
 */
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> orderStream = env.fromElements(
Event.of("order-1", "order", 1000L),
Event.of("order-2", "order", 2000L)
).assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long l) {
return element.timestamp;
}
})
);
SingleOutputStreamOperator<Event> weixinStream = env.fromElements(
Event.of("order-1", "weixin", 30000L),
Event.of("order-3", "weixin", 4000L)
).assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long l) {
return element.timestamp;
}
})
);
orderStream.keyBy(r -> r.orderId)
.connect(weixinStream.keyBy(r -> r.orderId))
.process(new MatchFunction())
.print();
env.execute();
}
public static class MatchFunction extends CoProcessFunction<Event,Event,String>{
        //two state variables: one holds the order event, the other the WeChat payment event
private ValueState<Event> orderState;
private ValueState<Event> weixinState;
@Override
public void open(Configuration parameters) throws Exception {
orderState =getRuntimeContext().getState(
new ValueStateDescriptor<Event>("orderState", Types.POJO(Event.class))
);
weixinState =getRuntimeContext().getState(
new ValueStateDescriptor<Event>("weixinState", Types.POJO(Event.class))
);
}
@Override
public void processElement1(Event value, Context ctx, Collector<String> collector) throws Exception {
if(weixinState.value() == null){
//下订单order事件先到达,因为如果weixin事件先到达那么就不为空了
orderState.update(value);
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
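                //if the matching weixin payment does not arrive within 5 s of event time,
                //the onTimer callback reports the reconciliation as failed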
}else {
//如果不为空,且到达了这里证明对账成功,直接输出
collector.collect("订单ID是"+value.orderId+"对账成功,微信事件先到达");
weixinState.clear();
}
}
@Override
public void processElement2(Event value, Context ctx, Collector<String> collector) throws Exception {
if(orderState.value() == null){
weixinState.update(value);
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
}else {
collector.collect("订单ID是:"+value.orderId+"对账成功,order事件先到达");
orderState.clear();
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
if(orderState.value() !=null){
out.collect("订单ID"+orderState.value().orderId+"对账失败,微信事件5s内未到达");
orderState.clear();
}
if(weixinState.value() !=null){
out.collect("订单ID"+weixinState.value().orderId+"对账失败,订单事件5s内未到达");
weixinState.clear();
}
}
}
public static class Event{
public String orderId;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String orderId, String eventType, Long timestamp) {
this.orderId = orderId;
this.eventType = eventType;
this.timestamp = timestamp;
}
public static Event of(String orderId, String eventType, Long timestamp){
return new Event(orderId,eventType,timestamp);
}
@Override
public String toString() {
return "Event{" +
"orderId='" + orderId + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
}

View File

@ -0,0 +1,173 @@
package day06;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Real-time reconciliation: simulates the case where reconciliation fails
 */
public class Example3 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> orderStream = env
.addSource(new SourceFunction<Event>() {
@Override
public void run(SourceContext<Event> ctx) throws Exception {
ctx.collectWithTimestamp(Event.of("order-1", "order", 1000L), 1000L);
ctx.emitWatermark(new Watermark(999L));
ctx.collectWithTimestamp(Event.of("order-2", "order", 3000L), 3000L);
ctx.emitWatermark(new Watermark(8001L));
}
@Override
public void cancel() {
}
});
SingleOutputStreamOperator<Event> weixinStream = env
.addSource(new SourceFunction<Event>() {
@Override
public void run(SourceContext<Event> ctx) throws Exception {
ctx.collectWithTimestamp(Event.of("order-1", "weixin", 4000L), 4000L);
ctx.emitWatermark(new Watermark(3999L));
ctx.emitWatermark(new Watermark(8001L));
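                        //pushing the watermark to 8001 before emitting order-2's payment lets the 5 s timer
                        //(registered at event time 8000) fire first, so order-2 is reported as a failed reconciliation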
ctx.collectWithTimestamp(Event.of("order-2", "weixin", 9000L), 9000L);
//ctx.emitWatermark(new Watermark(4000L));
}
@Override
public void cancel() {
}
});
orderStream.keyBy(r -> r.orderId)
.connect(weixinStream.keyBy(r -> r.orderId))
.process(new MatchFunction())
.print();
env.execute();
}
public static class MatchFunction extends CoProcessFunction<Event,Event,String>{
        //two state variables: one holds the order event, the other the WeChat payment event
private ValueState<Event> orderState;
private ValueState<Event> weixinState;
@Override
public void open(Configuration parameters) throws Exception {
orderState =getRuntimeContext().getState(
new ValueStateDescriptor<Event>("orderState", Types.POJO(Event.class))
);
weixinState =getRuntimeContext().getState(
new ValueStateDescriptor<Event>("weixinState", Types.POJO(Event.class))
);
}
@Override
public void processElement1(Event value, Context ctx, Collector<String> collector) throws Exception {
if(weixinState.value() == null){
//下订单order事件先到达,因为如果weixin事件先到达那么就不为空了
orderState.update(value);
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
}else {
//如果不为空,且到达了这里证明对账成功,直接输出
collector.collect("订单ID是"+value.orderId+"对账成功,微信事件先到达");
weixinState.clear();
}
}
@Override
public void processElement2(Event value, Context ctx, Collector<String> collector) throws Exception {
if(orderState.value() == null){
weixinState.update(value);
ctx.timerService().registerEventTimeTimer(value.timestamp+5000L);
}else {
collector.collect("订单ID是:"+value.orderId+"对账成功,order事件先到达");
orderState.clear();
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
if(orderState.value() !=null){
out.collect("订单ID"+orderState.value().orderId+"对账失败,微信事件5s内未到达");
orderState.clear();
}
if(weixinState.value() !=null){
out.collect("订单ID"+weixinState.value().orderId+"对账失败,订单事件5s内未到达");
weixinState.clear();
}
}
}
public static class Event{
public String orderId;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String orderId, String eventType, Long timestamp) {
this.orderId = orderId;
this.eventType = eventType;
this.timestamp = timestamp;
}
public static Event of(String orderId, String eventType, Long timestamp){
return new Event(orderId,eventType,timestamp);
}
@Override
public String toString() {
return "Event{" +
"orderId='" + orderId + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
}

View File

@ -0,0 +1,117 @@
package day06;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
/**
 * Interval-based join (intervalJoin)
 */
public class Example4 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> orderStream = env
.fromElements(
Event.of("user-1", "order", 20 * 60 * 1000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
})
);
SingleOutputStreamOperator<Event> pvStream = env.fromElements(
Event.of("user-1", "pv", 5 * 60 * 1000L),
Event.of("user-1", "pv", 10 * 60 * 1000L),
Event.of("user-1", "pv", 12 * 60 * 1000L),
Event.of("user-1", "pv", 22 * 60 * 1000L)
).assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
})
);
orderStream.keyBy(r ->r.userId)
.intervalJoin(pvStream.keyBy(r -> r.userId))
                //which slice of the second stream each element of the first stream joins against:
                //pv events from 10 minutes before to 5 minutes after the order event
.between(Time.minutes(-10),Time.minutes(5))
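                //e.g. the order at minute 20 joins pv events with timestamps in [minute 10, minute 25]:
                //the pv events at minutes 10, 12 and 22 match, the one at minute 5 does not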
.process(new ProcessJoinFunction<Event, Event, String>() {
@Override
public void processElement(Event left, Event right, Context ctx, Collector<String> collector) throws Exception {
collector.collect(left + "=>"+right);
}
})
.print("orderStream join pvStream");
pvStream.keyBy(r ->r.userId)
.intervalJoin(orderStream.keyBy(r -> r.userId))
                //the symmetric direction: order events from 5 minutes before to 10 minutes after each pv event
.between(Time.minutes(-5),Time.minutes(10))
.process(new ProcessJoinFunction<Event, Event, String>() {
@Override
public void processElement(Event left, Event right, Context ctx, Collector<String> collector) throws Exception {
collector.collect(right + "=>"+left);
}
})
.print("pvStream join orderStream");
env.execute();
}
public static class Event{
public String userId;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String userId, String eventType, Long timestamp) {
this.userId = userId;
this.eventType = eventType;
this.timestamp = timestamp;
}
public static Event of(String userId, String eventType, Long timestamp){
return new Event(userId,eventType,timestamp);
}
@Override
public String toString() {
return "Event{" +
"userId='" + userId + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + new Timestamp(timestamp) +
'}';
}
}
}

View File

@ -0,0 +1,72 @@
package day06;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
/**
 * Window-based join
 */
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Tuple2<String, Integer>> stream1 = env.fromElements(
Tuple2.of("a", 1), Tuple2.of("b", 1)
).assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Integer>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Integer>>() {
@Override
public long extractTimestamp(Tuple2<String, Integer> stringIntegerTuple2, long l) {
return stringIntegerTuple2.f1;
}
})
);
SingleOutputStreamOperator<Tuple2<String, Integer>> stream2 = env.fromElements(
Tuple2.of("a", 2), Tuple2.of("b", 2),Tuple2.of("b", 3)
).assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Integer>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Integer>>() {
@Override
public long extractTimestamp(Tuple2<String, Integer> stringIntegerTuple2, long l) {
return stringIntegerTuple2.f1;
}
})
);
stream1
.join(stream2)
.where(r -> r.f0)
.equalTo(r -> r.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
.apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
@Override
public String join(Tuple2<String, Integer> first, Tuple2<String, Integer> second) throws Exception {
return first +"=>"+second;
}
})
.print();
//基于相同窗口的笛卡尔积:
//(a,1)=>(a,2)
//(b,1)=>(b,2)
//(b,1)=>(b,3)
env.execute();
}
}

View File

@ -0,0 +1,29 @@
package day06;
import day05.Example9;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Configure where checkpoints are stored
 */
public class Example6 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//保存检查点到指定文件
env.setStateBackend(new FsStateBackend("file:///E:\\Big_data_example\\Flink\\src\\main\\resources\\ckpt",false));
//隔多久保存一次
env.enableCheckpointing(10*1000L);
env
.addSource(new Example9.ClickSource())
.print();
env.execute();
}
}

View File

@ -0,0 +1,29 @@
package day06;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Consistent checkpoints
 */
public class Example7 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.fromElements(1,2,3,4,5)
.keyBy(r -> r%2)
.sum(0)
.print();
env.execute();
}
}

View File

@ -0,0 +1,38 @@
package day07;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import java.util.Properties;
/**
 * Writing from Flink to Kafka
 */
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
Properties properties =new Properties();
properties.put("bootstrap.servers","Ding202:9092");
env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.addSink(new FlinkKafkaProducer<String>(
//Topic
"dingjiawen1",
//写入的数据类型
new SimpleStringSchema(),
//kafka producer的相关位置等配置信息
properties
));
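        //assumes the broker at Ding202:9092 is reachable and that the topic "dingjiawen1" already
        //exists (or that automatic topic creation is enabled on the cluster)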
env.execute();
}
}

View File

@ -0,0 +1,162 @@
package day07;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Properties;
/**
* 读取kafka中的数据
*/
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
Properties properties = new Properties();
//kafka消费者的位置
properties.setProperty("bootstrap.servers", "Ding202:9092");
//消费者组
properties.setProperty("group.id", "consumer-group");
//key和value的反序列化机制
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
//没有已提交偏移量时从何处开始消费
properties.setProperty("auto.offset.reset", "latest");
env
.addSource(
new FlinkKafkaConsumer<String>(
"dingjiawen1",
new SimpleStringSchema(),
properties
)
)
.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String s) throws Exception {
String[] arr = s.split(",");
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
}
})
.filter(r -> r.behavior.equals("pv"))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
@Override
public long extractTimestamp(UserBehavior userBehavior, long l) {
return userBehavior.timeStamp;
}
})
)
.keyBy(r -> true)
.window(SlidingEventTimeWindows.of(Time.hours(1),Time.minutes(5)))
//这种实现把所有数据都keyBy到同一个key上,没有分布式特性
//以前的实现方式先按itemId做keyBy增量聚合,再按窗口结束时间keyBy排序,不同分组会尽量分到不同的插槽并行计算
//而这种把整个窗口的数据收集到一个HashMap的方式,数据量大时容易内存爆炸(文件末尾附有按itemId分组的分布式写法示意)
.process(new ProcessWindowFunction<UserBehavior, String, Boolean, TimeWindow>() {
@Override
public void process(Boolean aBoolean, Context context, Iterable<UserBehavior> iterable, Collector<String> collector) throws Exception {
//使用一个map来存储不同商品及其点击次数
HashMap<String, Long> hashMap = new HashMap<>();
for (UserBehavior e : iterable) {
if(hashMap.containsKey(e.itemId)){
hashMap.put(e.itemId,hashMap.get(e.itemId)+1L);
}else {
hashMap.put(e.itemId,1L);
}
}
//构造一个list来排序
ArrayList<Tuple2<String,Long>> arrayList =new ArrayList<Tuple2<String, Long>>();
for (String key : hashMap.keySet()) {
arrayList.add(Tuple2.of(key,hashMap.get(key)));
}
arrayList.sort(new Comparator<Tuple2<String, Long>>() {
@Override
public int compare(Tuple2<String, Long> t1, Tuple2<String, Long> t2) {
return t2.f1.intValue()-t1.f1.intValue();
}
});
StringBuilder result = new StringBuilder();
result
.append("========================\n")
.append("窗口"+new Timestamp(context.window().getStart())+"~"+new Timestamp(context.window().getEnd()))
.append("\n");
for (int i = 0; i < 3; i++) {
Tuple2<String, Long> currElement = arrayList.get(i);
result.append(""+(i+1)+"名的商品ID是"+currElement.f0+";浏览次数是:"+currElement.f1)
.append("\n");
}
collector.collect(result.toString());
}
})
.print();
env.execute();
}
/**
* 用户行为POJO类
*/
public static class UserBehavior{
public String userId;
public String itemId;
public String categoryId;
public String behavior;
public Long timeStamp;
public UserBehavior(){
}
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timeStamp = timeStamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId='" + userId + '\'' +
", itemId='" + itemId + '\'' +
", categoryId='" + categoryId + '\'' +
", behavior='" + behavior + '\'' +
", timeStamp=" + new Timestamp(timeStamp) +
'}';
}
}
}
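
As mentioned in the comments above, keying everything to a single key loses parallelism. A minimal sketch of the distributed alternative (the class name is mine, and the final per-window TopN ranking step is deliberately omitted): key by itemId so different items can be counted in different slots, and pre-aggregate incrementally instead of buffering every event in memory.

package day07;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
public class ItemCountSketch {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.filter(s -> s.split(",")[3].equals("pv"))
.map(new MapFunction<String, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(String s) throws Exception {
String[] arr = s.split(",");
//keep only (itemId, eventTime in milliseconds)
return Tuple2.of(arr[1], Long.parseLong(arr[4]) * 1000L);
}
})
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple2<String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
@Override
public long extractTimestamp(Tuple2<String, Long> e, long l) {
return e.f1;
}
})
)
//key by itemId instead of keyBy(true), so counting is distributed across slots
.keyBy(r -> r.f0)
.window(SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(5)))
//incremental aggregation: only one counter per (item, window) is kept in state
.aggregate(new AggregateFunction<Tuple2<String, Long>, Long, Long>() {
@Override
public Long createAccumulator() { return 0L; }
@Override
public Long add(Tuple2<String, Long> value, Long acc) { return acc + 1L; }
@Override
public Long getResult(Long acc) { return acc; }
@Override
public Long merge(Long a, Long b) { return a + b; }
}, new ProcessWindowFunction<Long, String, String, TimeWindow>() {
@Override
public void process(String itemId, Context ctx, Iterable<Long> counts, Collector<String> out) {
out.collect("商品" + itemId + "在窗口" + new Timestamp(ctx.window().getEnd())
+ "结束时的浏览次数:" + counts.iterator().next());
}
})
.print();
env.execute();
}
}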

View File

@ -0,0 +1,141 @@
package day08;
import day04.Example7;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
import java.util.HashSet;
/**
* 独立访客数量UV
* 按用户ID去重之后的访客数量
*/
public class Example1 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String value) throws Exception {
String[] arr = value.split(",");
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
}
})
.filter(r -> r.behavior.equals("pv"))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
@Override
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
return element.timeStamp;
}
})
)
.keyBy( r ->true)
.window(TumblingEventTimeWindows.of(Time.hours(1)))
.aggregate(new CountAgg(),new WindowResult())
.print();
env.execute();
}
public static class WindowResult extends ProcessWindowFunction<Long,String,Boolean, TimeWindow>{
@Override
public void process(Boolean aBoolean, Context context, Iterable<Long> iterable, Collector<String> collector) throws Exception {
String windowStart = new Timestamp(context.window().getStart()).toString();
String windowStop = new Timestamp(context.window().getEnd()).toString();
Long count = iterable.iterator().next();
collector.collect("窗口"+windowStart+"~"+windowStop+"的独立访客的数量为:"+count);
}
}
//实现去重
public static class CountAgg implements AggregateFunction<UserBehavior, HashSet<String>,Long>{
//HashSet的实现方式:每来一个访客都要把userId存入内存,访客数量多时内存占用过大,可以考虑优化
//使用布隆过滤器
@Override
public HashSet<String> createAccumulator() {
return new HashSet<String>();
}
@Override
public HashSet<String> add(UserBehavior userBehavior, HashSet<String> accumulator) {
accumulator.add(userBehavior.userId);
return accumulator;
}
@Override
public Long getResult(HashSet<String> strings) {
return (long)strings.size();
}
@Override
public HashSet<String> merge(HashSet<String> strings, HashSet<String> acc1) {
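//merge is only invoked for merging windows (e.g. session windows); it is left unimplemented for this tumbling-window job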
return null;
}
}
/**
* 用户行为POJO类
*/
public static class UserBehavior{
public String userId;
public String itemId;
public String categoryId;
public String behavior;
public Long timeStamp;
public UserBehavior(){
}
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timeStamp = timeStamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId='" + userId + '\'' +
", itemId='" + itemId + '\'' +
", categoryId='" + categoryId + '\'' +
", behavior='" + behavior + '\'' +
", timeStamp=" + new Timestamp(timeStamp) +
'}';
}
}
}

View File

@ -0,0 +1,129 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.sql.Timestamp;
import static org.apache.flink.table.api.Expressions.$;
/**
* 使用Flink SQL实现实时TOPN
*/
public class Example10 {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<UserBehavior> stream = env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.map(
new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String s) throws Exception {
String[] arr = s.split(",");
return new UserBehavior(arr[0], arr[1], arr[2], arr[3], Long.parseLong(arr[4]) * 1000L); //时间戳为秒,转成毫秒,与其他示例保持一致
}
}
)
.filter(r -> r.behavior.equals("pv"))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
@Override
public long extractTimestamp(UserBehavior userBehavior, long l) {
return userBehavior.timeStamp;
}
})
);
//注册表环境
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
//将数据流转化为动态表
Table table = tableEnvironment
.fromDataStream(
stream,
$("userId"),
$("itemId"),
$("categoryId"),
$("behavior"),
$("timeStamp").rowtime().as("ts")
);
tableEnvironment.createTemporaryView("userBehavior",table);
//按照itemId和滑动窗口进行分组
String innerSQL = "select itemId,COUNT(itemId) as cnt ,HOP_END(ts , INTERVAL '5' MINUTE ,INTERVAL '1' HOUR) as windowEnd " +
"from userBehavior group by itemId,HOP(ts,INTERVAL '5' MINUTE ,INTERVAL '1' HOUR)";
//对聚合结果再用over窗口分组排序
//按照窗口结束时间分区,按cnt降序排列,生成row_num
String midSQL = "select * ,ROW_NUMBER() OVER(PARTITION BY windowEnd ORDER BY cnt DESC) as row_num "+
"FROM ("+innerSQL+")";
//取出前三名
String outerSQL = "select * from ("+midSQL+") WHERE row_num <= 3";
Table itemViewCount = tableEnvironment
.sqlQuery(outerSQL);
tableEnvironment.toChangelogStream(itemViewCount).print();
env.execute();
}
/**
* 用户行为POJO类
*/
public static class UserBehavior{
public String userId;
public String itemId;
public String categoryId;
public String behavior;
public Long timeStamp;
public UserBehavior(){
}
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timeStamp = timeStamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId='" + userId + '\'' +
", itemId='" + itemId + '\'' +
", categoryId='" + categoryId + '\'' +
", behavior='" + behavior + '\'' +
", timeStamp=" + new Timestamp(timeStamp) +
'}';
}
}
}

View File

@ -0,0 +1,223 @@
package day08;
import java.util.ArrayList;
/**
* 实现一个链表
*/
public class Example11 {
public static void main(String[] args) {
//TODO 链表
ListNode node1 = new ListNode(12, null);
ListNode node2 = new ListNode(97, null);
ListNode node3 = new ListNode(34, null);
node1.next = node2;
node2.next = node3;
ListNode head =node1;
//链表的遍历
while (head != null){
System.out.println(head.val);
head = head.next;
}
System.out.println("===================");
//TODO
TreeNode root = new TreeNode(5);
root.left = new TreeNode(3);
root.right = new TreeNode(6);
root.left.left = new TreeNode(1);
root.left.right = new TreeNode(4);
//TODO 树形结构
// 5
// / \
// 3 6
// / \
// 1 4
//先序遍历
preOrderTraversal(root); // 5 3 1 4 6
System.out.println("======================");
//中序遍历
midOrderTraversal(root); // 1 3 4 5 6
System.out.println("======================");
//后序遍历
postOrderTraversal(root); //1 4 3 6 5
System.out.println("======================");
//查找树
Boolean result = treeSearch(root, 2);
System.out.println(result);
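//prints false: the tree contains 1, 3, 4, 5, 6 but not 2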
//有向有环图
GraphNode nodeA = new GraphNode(1);
GraphNode nodeB = new GraphNode(2);
GraphNode nodeC = new GraphNode(3);
nodeA.neighbors.add(nodeB);
nodeB.neighbors.add(nodeC);
nodeC.neighbors.add(nodeA);
}
/**
* 对于二叉查找树:任意节点的左子树的值一定小于该节点,右子树的值一定大于该节点
* 利用这一性质可以递归地判断树中是否含有某个值
* @param root
* @param val
* @return
*/
public static Boolean treeSearch(TreeNode root , int val){
if(root == null){
return false;
}else {
if (root.val == val){
return true;
}else if(root.val < val){
return treeSearch(root.right,val);
}else {
return treeSearch(root.left, val);
}
}
}
/**
* 先序遍历
* 遍历方式:
* 1.遍历根节点
* 2.对左子树进行先序遍历
* 3.对右子树进行先序遍历
*/
public static void preOrderTraversal(TreeNode root){
if(root != null){
System.out.println(root.val);
preOrderTraversal(root.left);
preOrderTraversal(root.right);
}
}
/**
* 中序遍历
* 遍历方式:
* 1.对左子树进行中序遍历
* 2.遍历根节点
* 3.对右子树进行中序遍历
*/
public static void midOrderTraversal(TreeNode root){
if(root != null){
midOrderTraversal(root.left);
System.out.println(root.val);
midOrderTraversal(root.right);
}
}
/**
* 后序遍历
* 遍历方式:
* 1.对左子树进行后序遍历
* 2.对右子树进行后序遍历
* 3.遍历根节点
*/
public static void postOrderTraversal(TreeNode root){
if(root != null){
postOrderTraversal(root.left);
postOrderTraversal(root.right);
System.out.println(root.val);
}
}
//扩展成一个树
public static class TreeNode{
public int val;
public TreeNode left;
public TreeNode right;
public TreeNode() {
}
public TreeNode(int val) {
this.val = val;
}
public TreeNode(int val, TreeNode left, TreeNode right) {
this.val = val;
this.left = left;
this.right = right;
}
}
//链表
public static class ListNode{
public int val;
public ListNode next;
public ListNode(){
}
public ListNode(int val) {
this.val = val;
}
public ListNode(int val, ListNode next) {
this.val = val;
this.next = next;
}
}
/**
* 图节点(邻接表表示;上面main中构造的是有向有环图)
*/
public static class GraphNode{
public int val;
public ArrayList<GraphNode> neighbors = new ArrayList<>();
public GraphNode() {
}
public GraphNode(int val) {
this.val = val;
}
public GraphNode(int val, ArrayList<GraphNode> neighbors) {
this.val = val;
this.neighbors = neighbors;
}
}
}
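
A small companion sketch (the class name is mine): the three traversals above are recursive depth-first traversals; a level-order (breadth-first) traversal of the same tree uses an explicit queue instead of the call stack.

package day08;
import java.util.ArrayDeque;
import java.util.Queue;
public class Example11Bfs {
public static void main(String[] args) {
Example11.TreeNode root = new Example11.TreeNode(5,
new Example11.TreeNode(3, new Example11.TreeNode(1), new Example11.TreeNode(4)),
new Example11.TreeNode(6, null, null));
levelOrderTraversal(root); // 5 3 6 1 4
}
//level-order (breadth-first) traversal with an explicit queue
public static void levelOrderTraversal(Example11.TreeNode root) {
if (root == null) {
return;
}
Queue<Example11.TreeNode> queue = new ArrayDeque<>();
queue.offer(root);
while (!queue.isEmpty()) {
Example11.TreeNode node = queue.poll();
System.out.println(node.val);
if (node.left != null) {
queue.offer(node.left);
}
if (node.right != null) {
queue.offer(node.right);
}
}
}
}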

View File

@ -0,0 +1,159 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import java.nio.charset.StandardCharsets;
import org.apache.flink.shaded.guava18.com.google.common.hash.BloomFilter;
import org.apache.flink.shaded.guava18.com.google.common.hash.Funnels;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
import java.util.HashSet;
/**
* 用布隆过滤器实现去重
*/
public class Example2 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.readTextFile("E:\\Big_data_example\\Flink\\src\\main\\resources\\UserBehavior.csv")
.map(new MapFunction<String, UserBehavior>() {
@Override
public UserBehavior map(String value) throws Exception {
String[] arr = value.split(",");
return new UserBehavior(arr[0],arr[1],arr[2],arr[3],Long.parseLong(arr[4])*1000L);
}
})
.filter(r -> r.behavior.equals("pv"))
.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(0))
.withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
@Override
public long extractTimestamp(UserBehavior element, long recordTimestamp) {
return element.timeStamp;
}
})
)
.keyBy( r ->true)
.window(TumblingEventTimeWindows.of(Time.hours(1)))
.aggregate(new CountAgg(),new WindowResult())
.print();
env.execute();
}
public static class WindowResult extends ProcessWindowFunction<Long,String,Boolean, TimeWindow> {
@Override
public void process(Boolean aBoolean, Context context, Iterable<Long> iterable, Collector<String> collector) throws Exception {
String windowStart = new Timestamp(context.window().getStart()).toString();
String windowStop = new Timestamp(context.window().getEnd()).toString();
Long count = iterable.iterator().next();
collector.collect("窗口"+windowStart+"~"+windowStop+"的独立访客的数量为:"+count);
}
}
//布隆过滤器实现去重
public static class CountAgg implements AggregateFunction<UserBehavior, Tuple2<Long,BloomFilter<String>>,Long>{
@Override
public Tuple2<Long, BloomFilter<String>> createAccumulator() {
//BloomFilter.create(输入数据类型,期望插入数据数量,误判率)
return Tuple2.of(0L,BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8),100000,0.01));
}
@Override
public Tuple2<Long, BloomFilter<String>> add(UserBehavior userBehavior, Tuple2<Long, BloomFilter<String>> accumulator) {
//如果布隆过滤器绝对不包含这个ID
if(!accumulator.f1.mightContain(userBehavior.userId)){
accumulator.f1.put(userBehavior.userId); //将对应位置置为一
accumulator.f0 +=1L;
}
return accumulator;
}
@Override
public Long getResult(Tuple2<Long, BloomFilter<String>> accumulator) {
return accumulator.f0;
}
@Override
public Tuple2<Long, BloomFilter<String>> merge(Tuple2<Long, BloomFilter<String>> longBloomFilterTuple2, Tuple2<Long, BloomFilter<String>> acc1) {
return null;
}
}
/**
* 用户行为POJO类
*/
public static class UserBehavior{
public String userId;
public String itemId;
public String categoryId;
public String behavior;
public Long timeStamp;
public UserBehavior(){
}
public UserBehavior(String userId, String itemId, String categoryId, String behavior, Long timeStamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timeStamp = timeStamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId='" + userId + '\'' +
", itemId='" + itemId + '\'' +
", categoryId='" + categoryId + '\'' +
", behavior='" + behavior + '\'' +
", timeStamp=" + new Timestamp(timeStamp) +
'}';
}
}
}

View File

@ -0,0 +1,131 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.List;
import java.util.Map;
/**
* 使用Flink-CEP检测连续三次登录失败
*/
public class Example3 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> stream = env
.fromElements(
new Event("user-1", "fail", 1000L),
new Event("user-1", "fail", 2000L),
new Event("user-1", "fail", 3000L),
new Event("user-2", "success", 3000L),
new Event("user-1", "fail", 4000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
})
);
//定义模板
Pattern<Event, Event> pattern = Pattern
.<Event>begin("first") //为第一个匹配事件起名字
.where(new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return event.eventType.equals("fail");
}
})
.next("second") //next表示严格紧邻
.where(new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return event.eventType.equals("fail");
}
})
.next("third")
.where(new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return event.eventType.equals("fail");
}
});
//在流上匹配模板-获取到匹配到的流
PatternStream<Event> patternStream = CEP.pattern(stream.keyBy(r -> r.user), pattern);
//使用select方法将匹配到的事件取出
patternStream
.select(new PatternSelectFunction<Event, String>() {
@Override
public String select(Map<String, List<Event>> map) throws Exception {
//Map的key是给事件起的名字
//列表是名字对应的事件所构成的列表
Event first = map.get("first").get(0);
Event second = map.get("second").get(0);
Event third = map.get("third").get(0);
String result = "用户:"+first.user+"在事件:"+first.timestamp+";"
+second.timestamp+";"+third.timestamp+"登录失败了!";
return result;
}
}).print();
//用户:user-1在时间:1000;2000;3000登录失败了!
//用户:user-1在时间:2000;3000;4000登录失败了!
env.execute();
}
/**
* 登陆事件POJO类
*/
public static class Event{
public String user;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String user, String eventType, Long timestamp) {
this.user = user;
this.eventType = eventType;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
}

View File

@ -0,0 +1,118 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.List;
import java.util.Map;
/**
* 使用Flink-CEP检测连续三次登录失败
*/
public class Example4 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> stream = env
.fromElements(
new Event("user-1", "fail", 1000L),
new Event("user-1", "fail", 2000L),
new Event("user-1", "fail", 3000L),
new Event("user-2", "success", 3000L),
new Event("user-1", "fail", 4000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
})
);
//定义模板
//模板的简单写法
Pattern<Event, Event> pattern = Pattern
.<Event>begin("fail") //为第一个匹配事件起名字
.where(new SimpleCondition<Event>() {
@Override
public boolean filter(Event event) throws Exception {
return event.eventType.equals("fail");
}
})
.times(3);
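//by default times(3) uses relaxed contiguity (other events may occur in between the matched failures);
//appending .consecutive() after times(3) would require the three failures to be strictly consecutive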
//在流上匹配模板-获取到匹配到的流
PatternStream<Event> patternStream = CEP.pattern(stream.keyBy(r -> r.user), pattern);
//使用select方法将匹配到的事件取出
patternStream
.select(new PatternSelectFunction<Event, String>() {
@Override
public String select(Map<String, List<Event>> map) throws Exception {
//Map的key是给事件起的名字
//列表是名字对应的事件所构成的列表
Event first = map.get("fail").get(0);
Event second = map.get("fail").get(1);
Event third = map.get("fail").get(2);
String result = "用户:"+first.user+"在事件:"+first.timestamp+";"
+second.timestamp+";"+third.timestamp+"登录失败了!";
return result;
}
}).print();
//用户:user-1在时间:1000;2000;3000登录失败了!
//用户:user-1在时间:2000;3000;4000登录失败了!
env.execute();
}
/**
* 登陆事件POJO类
*/
public static class Event{
public String user;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String user, String eventType, Long timestamp) {
this.user = user;
this.eventType = eventType;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
}

View File

@ -0,0 +1,134 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.HashMap;
/**
* 使用状态机来实现检测连续三次登录失败
*/
public class Example5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> stream = env
.fromElements(
new Event("user-1", "fail", 1000L),
new Event("user-1", "fail", 2000L),
new Event("user-1", "fail", 3000L),
new Event("user-2", "success", 3000L),
new Event("user-1", "fail", 4000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
}));
stream
.keyBy(r -> r.user)
.process(new KeyedProcessFunction<String, Event, String>() {
//有限状态机
private HashMap<Tuple2<String,String>,String> stateMachine =new HashMap<>();
//初始化一个状态函数,来保存当前状态
private ValueState<String> currentState;
@Override
public void open(Configuration parameters) throws Exception {
//状态转移矩阵
//key Tuple2<当前状态,接收到的信息>
//value: 下一个状态
stateMachine.put(Tuple2.of("INITIAL","success"),"SUCCESS");
stateMachine.put(Tuple2.of("INITIAL","fail"),"S1");
stateMachine.put(Tuple2.of("S1","fail"),"S2");
stateMachine.put(Tuple2.of("S2","fail"),"FAIL");
stateMachine.put(Tuple2.of("S1","success"),"SUCCESS");
stateMachine.put(Tuple2.of("S2","success"),"SUCCESS");
currentState=getRuntimeContext().getState(
new ValueStateDescriptor<String>("current-State", Types.STRING)
);
}
@Override
public void processElement(Event value, Context context, Collector<String> collector) throws Exception {
if(currentState.value() == null){
currentState.update("INITIAL");
}
//计算将要跳转到的状态
String nextState = stateMachine.get(Tuple2.of(currentState.value(),value.eventType));
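//assumes eventType is always "fail" or "success"; any other value would make nextState null here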
if(nextState.equals("FAIL")){
collector.collect("用户"+value.user+"连续三次登陆失败了");
currentState.update("S2");
}else if(nextState.equals("SUCCESS")){
currentState.clear();
}else {
currentState.update(nextState);
}
}
})
.print();
env.execute();
}
/**
* 登陆事件POJO类
*/
public static class Event{
public String user;
public String eventType;
public Long timestamp;
public Event() {
}
public Event(String user, String eventType, Long timestamp) {
this.user = user;
this.eventType = eventType;
this.timestamp = timestamp;
}
@Override
public String toString() {
return "Event{" +
"user='" + user + '\'' +
", eventType='" + eventType + '\'' +
", timestamp=" + timestamp +
'}';
}
}
}

View File

@ -0,0 +1,132 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternFlatSelectFunction;
import org.apache.flink.cep.PatternFlatTimeoutFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.Timestamp;
import java.util.List;
import java.util.Map;
/**
* 订单超时检测
*/
public class Example6 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<OrderEvent> stream = env
.fromElements(
new OrderEvent("order-1", "create", 1000L),
new OrderEvent("order-2", "create", 2000L),
new OrderEvent("order-1", "pay", 3000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<OrderEvent>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<OrderEvent>() {
@Override
public long extractTimestamp(OrderEvent orderEvent, long l) {
return orderEvent.eventTime;
}
})
);
Pattern<OrderEvent, OrderEvent> pattern = Pattern
.<OrderEvent>begin("create")
.where(new SimpleCondition<OrderEvent>() {
@Override
public boolean filter(OrderEvent orderEvent) throws Exception {
return orderEvent.eventType.equals("create");
}
})
.next("pay")
.where(new SimpleCondition<OrderEvent>() {
@Override
public boolean filter(OrderEvent orderEvent) throws Exception {
return orderEvent.eventType.equals("pay");
}
})
.within(Time.seconds(5));//要求两个事件在5秒之内先后发生
PatternStream<OrderEvent> patternStream = CEP.pattern(stream.keyBy(r -> r.orderId), pattern);
//匹配到的正常的事件给他输出,未支付的事件给他输出
SingleOutputStreamOperator<String> result = patternStream
.flatSelect(
new OutputTag<String>("timeout") {
}, //超时事件将发送到侧输出流
new PatternFlatTimeoutFunction<OrderEvent, String>() {
@Override
public void timeout(Map<String, List<OrderEvent>> map, long l, Collector<String> collector) throws Exception {
//用来处理超时的没有匹配的数据
OrderEvent create = map.get("create").get(0);
//发送到侧输出流中去
collector.collect("订单:" + create.orderId + "超时了");
}
},
new PatternFlatSelectFunction<OrderEvent, String>() {
@Override
public void flatSelect(Map<String, List<OrderEvent>> map, Collector<String> collector) throws Exception {
OrderEvent pay = map.get("pay").get(0);
collector.collect("订单:" + pay.orderId + "已支付");
}
}
);
result.print("主输出流:");
result.getSideOutput(new OutputTag<String>("timeout"){}).print("侧输出流:");
//主输出流:> 订单:order-1已支付
//侧输出流:> 订单:order-2超时了
env.execute();
}
public static class OrderEvent {
public String orderId;
public String eventType;
public Long eventTime;
public OrderEvent() {
}
public OrderEvent(String orderId, String eventType, Long eventTime) {
this.orderId = orderId;
this.eventType = eventType;
this.eventTime = eventTime;
}
@Override
public String toString() {
return "OrderEvent{" +
"orderId='" + orderId + '\'' +
", EventType='" + eventType + '\'' +
", eventTime=" + new Timestamp(eventTime) +
'}';
}
}
}

View File

@ -0,0 +1,78 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
/**
* 将流转化为动态表
*/
public class Example7 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//TODO 设置数据源和水位线
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = env
.fromElements(
Tuple3.of("Mary", "./home", 12 * 60 * 60 * 1000L),
Tuple3.of("Bob", "./cart", 12 * 60 * 60 * 1000L),
Tuple3.of("Mary", "./prod?id=1", 12 * 60 * 60 * 1000L + 5 * 1000L),
Tuple3.of("liz", "./home", 12 * 60 * 60 * 1000L + 60 * 100L),
Tuple3.of("Bob", "./prod?id=3", 12 * 60 * 60 * 1000L + 90 * 1000L),
Tuple3.of("Mary", "./prod?id=7", 12 * 60 * 60 * 1000L + 105 * 1000l)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
@Override
public long extractTimestamp(Tuple3<String, String, Long> stringStringLongTuple3, long l) {
return stringStringLongTuple3.f2;
}
})
);
//TODO 创建表环境
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
//TODO 数据流转化为动态表 -会随着流的到来而不断增大
Table table = tableEnvironment
.fromDataStream(
stream,
$("f0").as("user"),
$("f1").as("url"),
//使用rowTime方法,指定f2是事件时间
$("f2").rowtime().as("cTime")
);
//如果想把结果打印出来,需要将动态表转换回数据流
//TODO 动态表 -> 数据流
tableEnvironment.toDataStream(table).print();
/*
+I[Mary, ./home, 1970-01-01 12:00:00.0]
+I[Bob, ./cart, 1970-01-01 12:00:00.0]
+I[Mary, ./prod?id=1, 1970-01-01 12:00:05.0]
+I[liz, ./home, 1970-01-01 12:00:06.0]
+I[Bob, ./prod?id=3, 1970-01-01 12:01:30.0]
+I[Mary, ./prod?id=7, 1970-01-01 12:01:45.0]
*/
env.execute();
}
}

View File

@ -0,0 +1,77 @@
package day08;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
/**
* 使用FlinkSQL对数据进行连续查询
*/
public class Example8 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//创建数据流
DataStreamSource<Tuple2<String, String>> stream = env
.fromElements(
Tuple2.of("Mary", "./home"),
Tuple2.of("Bob", "./cart"),
Tuple2.of("Mary", "./prod?id=1"),
Tuple2.of("liz", "./home")
);
//创建表环境
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
//创建动态表
Table table = tableEnvironment
.fromDataStream(
stream,
$("f0").as("user"),
$("f1").as("url")
);
//注册临时视图
tableEnvironment.createTemporaryView("clicks",table);
//sql查询
Table result = tableEnvironment
.sqlQuery(
"select user,COUNT(url) as cnt FROM clicks GROUP BY user"
);
//查询结果转换成数据流
//更新日志流(用于查询中有聚合操作的情况)
tableEnvironment.toChangelogStream(result).print();
/*
+I[Mary, 1]
+I[Bob, 1]
-U[Mary, 1] //之前的[Mary, 1]已经发往下游,-U表示撤回这条旧结果,相当于逻辑删除
+U[Mary, 2] //+U表示更新后的新结果,相当于重新插入一条[Mary, 2]
+I[liz, 1]
*/
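//toDataStream(result) would throw here, because this aggregation produces update (-U/+U) messages; only insert-only results can be converted with toDataStream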
env.execute();
}
}

View File

@ -0,0 +1,97 @@
package day08;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
/**
* FlinkSQL中的开窗操作
*/
public class Example9 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = env
.fromElements(
Tuple3.of("Mary", "./home", 12 * 60 * 60 * 1000L),
Tuple3.of("Bob", "./cart", 12 * 60 * 60 * 1000L),
Tuple3.of("Mary", "./prod?id=1", 12 * 60 * 60 * 1000L + 2 * 60 * 1000L),
Tuple3.of("Mary", "./prod?id=4", 12 * 60 * 60 * 1000L + 55 * 60 * 1000L),
Tuple3.of("Bob", "./prod?id=5", 13 * 60 * 60 * 1000L + 60 * 1000L),
Tuple3.of("liz", "./home", 13 * 60 * 60 * 1000L + 30 * 60 * 100L),
Tuple3.of("liz", "./prod?id=7", 13 * 60 * 60 * 1000L + 59 * 60 * 100L),
Tuple3.of("Mary", "./cart", 14 * 60 * 60 * 1000L),
Tuple3.of("liz", "./home", 14 * 60 * 60 * 1000L + 2 * 60 * 1000L),
Tuple3.of("Bob", "./prod?id=3", 14 * 60 * 60 * 1000L + 30 * 60 * 1000L),
Tuple3.of("Bob", "./home", 14 * 60 * 60 * 1000L + 40 * 60 * 1000l)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
@Override
public long extractTimestamp(Tuple3<String, String, Long> stringStringLongTuple3, long l) {
return stringStringLongTuple3.f2;
}
})
);
//TODO 创建表环境
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
StreamTableEnvironment tableEnvironment = StreamTableEnvironment.create(env, settings);
//TODO 数据流转化为动态表 -会随着流的到来而不断增大
Table table = tableEnvironment
.fromDataStream(
stream,
$("f0").as("user"),
$("f1").as("url"),
//使用rowTime方法,指定f2是事件时间
$("f2").rowtime().as("cTime")
);
tableEnvironment.createTemporaryView("clicks",table);
//以cTime开窗窗口的大小是一小时,滚动窗口
Table result = tableEnvironment
.sqlQuery(
"select user ,COUNT(url) as cnt ,TUMBLE_END(cTime,INTERVAL '1' HOUR) AS endT " +
"from clicks group by user,TUMBLE(cTime,INTERVAL '1' HOUR)"
);
tableEnvironment.toChangelogStream(result).print();
/*
+I[Mary, 3, 1970-01-01T13:00]
+I[Bob, 1, 1970-01-01T13:00]
+I[liz, 2, 1970-01-01T14:00]
+I[Bob, 1, 1970-01-01T14:00]
+I[Bob, 2, 1970-01-01T15:00]
+I[Mary, 1, 1970-01-01T15:00]
+I[liz, 1, 1970-01-01T15:00]
*/
env.execute();
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hive</groupId>
<artifactId>Gulivideo</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,62 @@
package com.atguigu.gulivideo.etl;
/**
* 封装一个工具类用于所有的数据清洗方法
*/
public class ETLUtils {
/**
* 清洗视频数据
*
* 规则
* 1.数据长度必须大于等于9
* 2.将数据的类别中的空格去掉
* 3.将关联视频通过&拼接
*
* @param line
* @return 如果数据合法返回清洗完的数据
* 如果数据不合法返回null
*
* 测试数据
* RX24KLBhwMI lemonette 697 People & Blogs 512 24149 4.22 315 474 t60tW0WevkE WZgoejVDZlo Xa_op4MhSkg MwynZ8qTwXA sfG2rtAkAcg j72VLPwzd_c 24Qfs69Al3U EGWutOjVx4M KVkseZR5coU R6OaRcsfnY4 dGM3k_4cNhE ai-cSq6APLQ 73M0y-iD9WE 3uKOSjE79YA 9BBu5N0iFBg 7f9zwx52xgA ncEV0tSC7xM H-J8Kbx9o68 s8xf4QX1UvA 2cKd9ERh5-8
*/
public static String etlGulivideoData(String line){
StringBuffer sbs =new StringBuffer();
//1.切割数据
String[] splits = line.split("\t");
//2.规则一
if(splits.length<9){
return null;
}
//3.规则二
splits[3]= splits[3].replaceAll(" ", "");
//4.规则三
for (int i = 0; i < splits.length; i++) {
//前9个字段之间用\t拼接,第9个字段之后的相关视频之间用&拼接
if(i<=8){
if(i==splits.length-1){
sbs.append(splits[i]);
}else {
sbs.append(splits[i]).append("\t");
}
}else {
if(i==splits.length-1){
sbs.append(splits[i]);
}else {
sbs.append(splits[i]).append("&");
}
}
}
return sbs.toString();
}
public static void main(String[] args) {
String line = "RX24KLBhwMI\tlemonette\t697\tPeople & Blogs\t512\t24149\t4.22\t315";
String result=etlGulivideoData(line);
System.out.println(result);
}
}

View File

@ -0,0 +1,33 @@
package com.atguigu.gulivideo.etl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class GulivideoETLDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf =new Configuration();
Job job =Job.getInstance(conf);
job.setJarByClass(GulivideoETLDriver.class);
job.setMapperClass(GulivideoETLMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//如果不指定则使用的是系统自带的reduce
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.waitForCompletion(true);
}
}

View File

@ -0,0 +1,24 @@
package com.atguigu.gulivideo.etl;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class GulivideoETLMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
Text outk= new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String result = ETLUtils.etlGulivideoData(line);
if(result==null){
return ;
}
outk.set(result);
context.write(outk,NullWritable.get());
}
}

View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hbase</groupId>
<artifactId>HBase</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>2.0.5</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.0.5</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,278 @@
package com.atguigu.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
/**
* ctrl+P查看需要传入什么对象,ctrl+O查看成员方法,ctrl+H查看类和实现类
*/
/**
* Connection : 通过ConnectionFactory获取,是重量级实现,因此不需要每次都打开,只需要打开一次
* Table : 主要负责DML操作 ,轻量级实现 每次打开,用完以后关闭
* Admin 主要负责DDL操作 轻量级实现每次打开,用完以后关闭
*/
public class HBaseDemo {
private static Connection connection;
static {
//创建hadoop的conf,用HBaseConfiguration去create
Configuration conf = HBaseConfiguration.create();
//在conf中指定hbase地址,去连接;在hbase-site.xml中有指定过zookeeper中hbase位置
conf.set("hbase.zookeeper.quorum","Ding202,Ding203,Ding204");
try {
connection= ConnectionFactory.createConnection(conf);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws IOException {
//createTable("","t1","info1","info2");
// dropTable("","t1");
//putData("","stu","1003","info","name","wangwu");
//deleteData("","stu","1003","info","name");
//getData("","stu","1001","info","name");
//scanData("","stu","1001","1003");
createTableWithRegions("","staff4","info");
}
/**
* 判断表是否存在
*/
public static boolean existTable(String nameSpaceName,String tableName) throws IOException {
Admin admin=connection.getAdmin();
return admin.tableExists(TableName.valueOf(nameSpaceName,tableName));
}
/**
* 创建table,带预分区
*/
public static void createTableWithRegions(String nameSpaceName,String tableName,String ... cfs) throws IOException {
if(existTable(nameSpaceName,tableName)){
System.err.println((nameSpaceName == null ||nameSpaceName.equals("")? "default" : nameSpaceName)+":"+tableName+"已经存在");
return;
}
Admin admin = connection.getAdmin();
//不知道怎么写就一点点的去找
TableDescriptorBuilder tableDescriptorBuilder =
TableDescriptorBuilder.newBuilder(TableName.valueOf(nameSpaceName,tableName));
if(cfs == null || cfs.length < 1){
System.err.println("至少指定一个列组");
return;
}
for (String cf : cfs) {
ColumnFamilyDescriptorBuilder columnFamilyDescriptorBuilder = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(cf));
ColumnFamilyDescriptor columnFamilyDescriptor = columnFamilyDescriptorBuilder.build();
//setColumnFamily需要传入一个ColumnFamilyDescriptor对象
tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
}
TableDescriptor tableDescriptor = tableDescriptorBuilder.build();
//创建表,需要传入一个TableDescriptor对象
//传入一个二维的分区字节数组
//本质上是一维数组的数组:每个分区键都要先转成字节数组,再放进这个二维字节数组
byte[][] splitkeys=new byte[4][];
splitkeys[0]=Bytes.toBytes("1000");
splitkeys[1]=Bytes.toBytes("2000");
splitkeys[2]=Bytes.toBytes("3000");
splitkeys[3]=Bytes.toBytes("4000");
admin.createTable(tableDescriptor,splitkeys);
admin.close();
}
/**
* scan
*/
public static void scanData(String nameSpaceName,String tableName,String startRow,String stopRow) throws IOException {
Table table = connection.getTable(TableName.valueOf(nameSpaceName, tableName));
Scan scan = new Scan();
//先设置扫描的起止rowkey,再获取scanner,否则范围不会生效
//scan.withStartRow(Bytes.toBytes(startRow));
//scan.withStopRow(Bytes.toBytes(stopRow));
//上述代码可以简写为
scan.withStartRow(Bytes.toBytes(startRow)).withStopRow(Bytes.toBytes(stopRow));
//getScanner需要传入一个scan对象
ResultScanner scanner = table.getScanner(scan);
for (Result result : scanner) {
//一个result就是一条数据
//通过每一个数据在获取他的cells
//cells中相当于数据中每一个字段的值
Cell[] cells = result.rawCells();
for (Cell cell : cells) {
String cellString = Bytes.toString(CellUtil.cloneRow(cell))+":"+
Bytes.toString(CellUtil.cloneFamily(cell))+":"+
Bytes.toString(CellUtil.cloneQualifier(cell))+":"+
Bytes.toString(CellUtil.cloneValue(cell));
System.out.println(cellString);
}
System.out.println("--------------------------------------------");
}
table.close();
}
/**
* get
*/
public static void getData(String nameSpaceName,String tableName,String rowkey,String cf,String cl) throws IOException {
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
Get get = new Get(Bytes.toBytes(rowkey));
//get.addFamily(Bytes.toBytes(cf)); //获取某一个列族的数据
get.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl)); //获取某一列的数据(都不指定则获取整行数据)
//需要传入一个get对象
Result result = table.get(get);
Cell[] cells = result.rawCells();
for (Cell cell : cells) {
String cellString = Bytes.toString(CellUtil.cloneRow(cell))+":"+
Bytes.toString(CellUtil.cloneFamily(cell))+":"+
Bytes.toString(CellUtil.cloneQualifier(cell))+":"+
Bytes.toString(CellUtil.cloneValue(cell));
System.out.println(cellString);
}
table.close();
}
/**
* delete
*/
public static void deleteData(String nameSpaceName,String tableName,String rowkey,String cf,String cl ) throws IOException {
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
Delete delete = new Delete(Bytes.toBytes(rowkey)); //如果只指定rowkey,就是删除整条数据
//delete.addFamily(Bytes.toBytes(cf)); 指定删除某个列组的数据type:DeleteFamily
//delete.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl)); //type:Delete
delete.addColumns(Bytes.toBytes(cf),Bytes.toBytes(cl)); //type:DeleteColumn
//需要传入一个delete对象
table.delete(delete);
table.close();
}
/**
* 添加或修改数据
* put
*/
public static void putData(String nameSpaceName,String tableName,String rowkey,String cf,String cl ,String value) throws IOException {
Table table=connection.getTable(TableName.valueOf(nameSpaceName,tableName));
Put put = new Put(Bytes.toBytes(rowkey));
put.addColumn(Bytes.toBytes(cf),Bytes.toBytes(cl),Bytes.toBytes(value));
//需要准备一个put对象
table.put(put);
table.close();
}
/**
* 删除表
*/
public static void dropTable(String nameSpaceName,String tableName) throws IOException {
if(!existTable(nameSpaceName,tableName)){
System.err.println("表不存在");
return;
}
Admin admin = connection.getAdmin();
TableName tn = TableName.valueOf(nameSpaceName, tableName);
admin.disableTable(tn);
admin.deleteTable(tn);
admin.close();
}
/**
* 创建table
*/
public static void createTable(String nameSpaceName,String tableName,String ... cfs) throws IOException {
if(existTable(nameSpaceName,tableName)){
System.err.println((nameSpaceName == null ||nameSpaceName.equals("")? "default" : nameSpaceName)+":"+tableName+"已经存在");
return;
}
Admin admin = connection.getAdmin();
//不知道怎么写就一点点的去找
TableDescriptorBuilder tableDescriptorBuilder =
TableDescriptorBuilder.newBuilder(TableName.valueOf(nameSpaceName,tableName));
if(cfs == null || cfs.length < 1){
System.err.println("至少指定一个列组");
return;
}
for (String cf : cfs) {
ColumnFamilyDescriptorBuilder columnFamilyDescriptorBuilder = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(cf));
ColumnFamilyDescriptor columnFamilyDescriptor = columnFamilyDescriptorBuilder.build();
//setColumnFamily需要传入一个ColumnFamilyDescriptor对象
tableDescriptorBuilder.setColumnFamily(columnFamilyDescriptor);
}
TableDescriptor tableDescriptor = tableDescriptorBuilder.build();
//创建表,需要传入一个TableDescriptor对象
admin.createTable(tableDescriptor);
admin.close();
}
/**
* 创建NameSpace
*/
public static void createNameSpace(String nameSpace) throws IOException {
//1.基本的判空操作
if(nameSpace==null||nameSpace.equals("")){
System.err.println("nameSpace名字不能为空");
}
//2.获取Admin对象
Admin admin = connection.getAdmin();
//查看源码发现NamespaceDescriptor需要使用以下方式获取
NamespaceDescriptor.Builder builder = NamespaceDescriptor.create(nameSpace);
NamespaceDescriptor namespaceDescriptor = builder.build();
try{
//调用方法,需要传入一个namespaceDescriptor对象
admin.createNamespace(namespaceDescriptor);
System.out.println(nameSpace+"创建成功");
}catch (NamespaceExistException e){
System.err.println(nameSpace+"已存在");
}finally {
admin.close();
}
}
}

View File

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hdfs</groupId>
<artifactId>HdfsClient</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>2.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,173 @@
package com.atguigu.hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
/**
* 1.和HDFS建立连接
* 2.调用API完成具体功能
* 3.关闭连接
*/
public class HdfsClientTest {
private FileSystem fs;
/**
* 上传文件
* 测试配置的优先级 configuration > hdfs-site.xml > hdfs-default.xml
* 第一个参数delSrc 上传之后是否将文件删除
* 第二个参数overwrite 当目的路径存在与要上传的文件的名字相同的时候是否覆盖
* 第三个参数pathString 要上传的本地文件路径
* 第四个参数pathString 上传到hdfs上的nn的路径
*/
@Test
public void testCopyFromLocal() throws IOException {
fs.copyFromLocalFile(false,true,
new Path("E:\\尚硅谷 大数据\\2021年大数据\\07.Hadoop\\01.笔记\\hello.txt"),
new Path("/client_test"));
}
/**
* 下载文件
*第一个参数delSrc 下载之后是否将文件删除
*第二个参数pathString 要下载的在hdfs上的nn的文件路径
*第三个参数pathString 要下载的文件的目的地路径
*第四个参数useRawLocalFileSystem 是否使用RawLocalFileSystem,为true时不会生成.crc校验文件来校验文件是否损坏
*/
@Test
public void testCopyToLocal() throws IOException {
fs.copyToLocalFile(false,
new Path("/client_test/hello.txt"),
new Path("E:\\尚硅谷 大数据\\2021年大数据\\07.Hadoop\\02.资料"),
true);
}
/**
* 删除文件和目录
* 第一个参数 pathString 要删除的在hdfs上的nn的文件的路径
*第二个参数 recursive 是否要使用递归删除即当文件夹中非空时是否全部删除
*/
@Test
public void testDelete() throws IOException {
fs.delete(new Path("/client_test/hello.txt"),
true);
}
/**
* 文件的更名或者移动
* 第一个参数 pathString 要移动的在hdfs上的nn的文件的路径
*第二个参数 pathString 目的地在hdfs上的nn的文件的路径
*/
@Test
public void testRename() throws IOException {
//移动文件
// fs.rename(new Path("/sanguo/zhangfei.txt"),new Path("/client_test/"));
//更名文件
fs.rename(new Path("/client_test/zhangfei.txt"),
new Path("/client_test/sunshangxaing.txt"));
}
/**
* 查看文件详情
* 第一个参数 pathString 要移动的在hdfs上的nn的文件的路径
*第二个参数 recursive 是否要使用递归查询即当文件夹中非空时是否全部查询
*/
@Test
public void testListFiles() throws IOException {
RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
while(listFiles.hasNext()){
LocatedFileStatus fileStatus = listFiles.next();
System.out.println("文件名称:"+fileStatus.getPath().getName());
System.out.println("块大小:"+fileStatus.getBlockSize());
System.out.println("副本数:"+fileStatus.getReplication());
System.out.println("权限信息:"+fileStatus.getPermission());
}
}
/**
* 判断一个路径是文件还是目录
* 第一个参数 pathString 要判断的路径名
*/
@Test
public void testListStatus() throws IOException {
FileStatus[] listStatus = fs.listStatus(new Path("/"));
for (FileStatus status : listStatus) {
if(status.isDirectory()){
System.out.println("DIR:"+status.getPath().getName());
}else{
System.out.println("FILE:"+status.getPath().getName());
}
}
}
/**
* 获取FileSystem对象
* @throws IOException
* @throws InterruptedException
*/
@Before
public void init() throws IOException, InterruptedException {
// HDFS的访问路径 hdfs://Ding202:9820
URI uri = URI.create("hdfs://Ding202:9820");
// conf 配置对象
Configuration conf = new Configuration();
conf.set("dfs.replication","6");
// 操作的用户用哪个用户操作HDFS
String user="dingjiawen";
//获取HDFS的客户端连接对象文件系统对象
fs = FileSystem.get(uri, conf, user);
}
/**
* 关闭资源
* @throws IOException
*/
@After
public void close() throws IOException {
fs.close();
}
/**
* 获取HDFS的客户端连接对象
* @param uri HDFS的访问路径 hdfs://Ding202:9820
* @param conf 配置对象
* @param user 操作的用户(用哪个用户操作HDFS)
*/
@Test
public void testCreateHdfsClient() throws IOException, InterruptedException {
// HDFS的访问路径 hdfs://Ding202:9820
URI uri = URI.create("hdfs://Ding202:9820");
// conf 配置对象
Configuration conf = new Configuration();
// 操作的用户用哪个用户操作HDFS
String user="dingjiawen";
//获取HDFS的客户端连接对象文件系统对象
FileSystem fileSystem = FileSystem.get(uri, conf, user);
System.out.println(fileSystem.getClass().getName());
//关闭资源
fileSystem.close();
}
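/**
* A small additional sketch (the method name and the path are illustrative assumptions):
* create a directory and check that it exists
*/
@Test
public void testMkdirs() throws IOException {
fs.mkdirs(new Path("/client_test/newdir"));
System.out.println(fs.exists(new Path("/client_test/newdir")));
}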
}

File diff suppressed because it is too large

View File

@ -0,0 +1,644 @@
<?xml version="1.0"?>
<fsimage>
<version>
<layoutVersion>-64</layoutVersion>
<onDiskVersion>1</onDiskVersion>
<oivRevision>ba631c436b806728f8ec2f54ab1e289526c90579</oivRevision>
</version>
<NameSection>
<namespaceId>1750466601</namespaceId>
<genstampV1>1000</genstampV1>
<genstampV2>1030</genstampV2>
<genstampV1Limit>0</genstampV1Limit>
<lastAllocatedBlockId>1073741853</lastAllocatedBlockId>
<txid>291</txid>
</NameSection>
<ErasureCodingSection>
<erasureCodingPolicy>
<policyId>1</policyId>
<policyName>RS-6-3-1024k</policyName>
<cellSize>1048576</cellSize>
<policyState>DISABLED</policyState>
<ecSchema>
<codecName>rs</codecName>
<dataUnits>6</dataUnits>
<parityUnits>3</parityUnits>
</ecSchema>
</erasureCodingPolicy>
<erasureCodingPolicy>
<policyId>2</policyId>
<policyName>RS-3-2-1024k</policyName>
<cellSize>1048576</cellSize>
<policyState>DISABLED</policyState>
<ecSchema>
<codecName>rs</codecName>
<dataUnits>3</dataUnits>
<parityUnits>2</parityUnits>
</ecSchema>
</erasureCodingPolicy>
<erasureCodingPolicy>
<policyId>3</policyId>
<policyName>RS-LEGACY-6-3-1024k</policyName>
<cellSize>1048576</cellSize>
<policyState>DISABLED</policyState>
<ecSchema>
<codecName>rs-legacy</codecName>
<dataUnits>6</dataUnits>
<parityUnits>3</parityUnits>
</ecSchema>
</erasureCodingPolicy>
<erasureCodingPolicy>
<policyId>4</policyId>
<policyName>XOR-2-1-1024k</policyName>
<cellSize>1048576</cellSize>
<policyState>DISABLED</policyState>
<ecSchema>
<codecName>xor</codecName>
<dataUnits>2</dataUnits>
<parityUnits>1</parityUnits>
</ecSchema>
</erasureCodingPolicy>
<erasureCodingPolicy>
<policyId>5</policyId>
<policyName>RS-10-4-1024k</policyName>
<cellSize>1048576</cellSize>
<policyState>DISABLED</policyState>
<ecSchema>
<codecName>rs</codecName>
<dataUnits>10</dataUnits>
<parityUnits>4</parityUnits>
</ecSchema>
</erasureCodingPolicy>
</ErasureCodingSection>
<INodeSection>
<lastInodeId>16455</lastInodeId>
<numInodes>37</numInodes>
<inode>
<id>16385</id>
<type>DIRECTORY</type>
<name></name>
<mtime>1634992639432</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>9223372036854775807</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16386</id>
<type>DIRECTORY</type>
<name>wcinput</name>
<mtime>1634898562368</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16387</id>
<type>FILE</type>
<name>hello.txt</name>
<replication>3</replication>
<mtime>1634898562869</mtime>
<atime>1634902318394</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741825</id>
<genstamp>1001</genstamp>
<numBytes>92</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16388</id>
<type>DIRECTORY</type>
<name>tmp</name>
<mtime>1634902310022</mtime>
<permission>dingjiawen:supergroup:0700</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16389</id>
<type>DIRECTORY</type>
<name>hadoop-yarn</name>
<mtime>1634898718216</mtime>
<permission>dingjiawen:supergroup:0700</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16390</id>
<type>DIRECTORY</type>
<name>staging</name>
<mtime>1634898723502</mtime>
<permission>dingjiawen:supergroup:0700</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16391</id>
<type>DIRECTORY</type>
<name>dingjiawen</name>
<mtime>1634898718216</mtime>
<permission>dingjiawen:supergroup:0700</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16392</id>
<type>DIRECTORY</type>
<name>.staging</name>
<mtime>1634902324657</mtime>
<permission>dingjiawen:supergroup:0700</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16398</id>
<type>DIRECTORY</type>
<name>history</name>
<mtime>1634899864728</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16399</id>
<type>DIRECTORY</type>
<name>done_intermediate</name>
<mtime>1634898723520</mtime>
<permission>dingjiawen:supergroup:1777</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16400</id>
<type>DIRECTORY</type>
<name>dingjiawen</name>
<mtime>1634902422435</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16401</id>
<type>DIRECTORY</type>
<name>wcoutput</name>
<mtime>1634898742662</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16408</id>
<type>FILE</type>
<name>part-r-00000</name>
<replication>3</replication>
<mtime>1634898742564</mtime>
<atime>1634898742434</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741832</id>
<genstamp>1008</genstamp>
<numBytes>78</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16410</id>
<type>FILE</type>
<name>_SUCCESS</name>
<replication>3</replication>
<mtime>1634898742664</mtime>
<atime>1634898742662</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16413</id>
<type>FILE</type>
<name>
job_1634897835344_0001-1634898720164-dingjiawen-word+count-1634898743224-1-1-SUCCEEDED-default-1634898727228.jhist
</name>
<replication>3</replication>
<mtime>1634898742783</mtime>
<atime>1634898742760</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0770</permission>
<blocks>
<block>
<id>1073741834</id>
<genstamp>1010</genstamp>
<numBytes>22368</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16414</id>
<type>FILE</type>
<name>job_1634897835344_0001_conf.xml</name>
<replication>3</replication>
<mtime>1634898742822</mtime>
<atime>1634898742799</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0770</permission>
<blocks>
<block>
<id>1073741835</id>
<genstamp>1011</genstamp>
<numBytes>214785</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16415</id>
<type>DIRECTORY</type>
<name>done</name>
<mtime>1634899899078</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16416</id>
<type>DIRECTORY</type>
<name>2021</name>
<mtime>1634899899078</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16417</id>
<type>DIRECTORY</type>
<name>10</name>
<mtime>1634899899078</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16418</id>
<type>DIRECTORY</type>
<name>22</name>
<mtime>1634899899078</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16419</id>
<type>DIRECTORY</type>
<name>000000</name>
<mtime>1634902422435</mtime>
<permission>dingjiawen:supergroup:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16425</id>
<type>DIRECTORY</type>
<name>logs</name>
<mtime>1634902310063</mtime>
<permission>dingjiawen:dingjiawen:1777</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16426</id>
<type>DIRECTORY</type>
<name>dingjiawen</name>
<mtime>1634902310069</mtime>
<permission>dingjiawen:dingjiawen:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16427</id>
<type>DIRECTORY</type>
<name>logs-tfile</name>
<mtime>1634902310074</mtime>
<permission>dingjiawen:dingjiawen:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16428</id>
<type>DIRECTORY</type>
<name>application_1634902054411_0001</name>
<mtime>1634902331189</mtime>
<permission>dingjiawen:dingjiawen:0770</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16430</id>
<type>DIRECTORY</type>
<name>wcoutput2</name>
<mtime>1634902323450</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16436</id>
<type>FILE</type>
<name>part-r-00000</name>
<replication>3</replication>
<mtime>1634902323357</mtime>
<atime>1634902323238</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741842</id>
<genstamp>1018</genstamp>
<numBytes>78</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16438</id>
<type>FILE</type>
<name>_SUCCESS</name>
<replication>3</replication>
<mtime>1634902323452</mtime>
<atime>1634902323450</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16441</id>
<type>FILE</type>
<name>
job_1634902054411_0001-1634902309668-dingjiawen-word+count-1634902324404-1-1-SUCCEEDED-default-1634902314351.jhist
</name>
<replication>3</replication>
<mtime>1634902323558</mtime>
<atime>1634902323524</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0770</permission>
<blocks>
<block>
<id>1073741844</id>
<genstamp>1020</genstamp>
<numBytes>22336</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16442</id>
<type>FILE</type>
<name>job_1634902054411_0001_conf.xml</name>
<replication>3</replication>
<mtime>1634902323591</mtime>
<atime>1634902323566</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0770</permission>
<blocks>
<block>
<id>1073741845</id>
<genstamp>1021</genstamp>
<numBytes>214958</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16443</id>
<type>FILE</type>
<name>Ding203_43285</name>
<replication>3</replication>
<mtime>1634902331185</mtime>
<atime>1634906546779</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:dingjiawen:0640</permission>
<blocks>
<block>
<id>1073741846</id>
<genstamp>1022</genstamp>
<numBytes>133945</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16444</id>
<type>DIRECTORY</type>
<name>sanguo</name>
<mtime>1634994579596</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16445</id>
<type>FILE</type>
<name>shuguo.txt</name>
<replication>3</replication>
<mtime>1634910351043</mtime>
<atime>1634988132639</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741847</id>
<genstamp>1023</genstamp>
<numBytes>19</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16447</id>
<type>FILE</type>
<name>sunshangxaing.txt</name>
<replication>3</replication>
<mtime>1634910725360</mtime>
<atime>1634988132668</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741849</id>
<genstamp>1026</genstamp>
<numBytes>16</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16448</id>
<type>DIRECTORY</type>
<name>xiyou</name>
<mtime>1634988717255</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
<inode>
<id>16451</id>
<type>FILE</type>
<name>zhangfei.txt</name>
<replication>3</replication>
<mtime>1634988432270</mtime>
<atime>1634988431098</atime>
<preferredBlockSize>134217728</preferredBlockSize>
<permission>dingjiawen:supergroup:0644</permission>
<blocks>
<block>
<id>1073741850</id>
<genstamp>1027</genstamp>
<numBytes>16</numBytes>
</block>
</blocks>
<storagePolicyId>0</storagePolicyId>
</inode>
<inode>
<id>16452</id>
<type>DIRECTORY</type>
<name>client_test</name>
<mtime>1634994678194</mtime>
<permission>dingjiawen:supergroup:0755</permission>
<nsquota>-1</nsquota>
<dsquota>-1</dsquota>
</inode>
</INodeSection>
<INodeReferenceSection></INodeReferenceSection>
<SnapshotSection>
<snapshotCounter>0</snapshotCounter>
<numSnapshots>0</numSnapshots>
</SnapshotSection>
<INodeDirectorySection>
<directory>
<parent>16385</parent>
<child>16452</child>
<child>16444</child>
<child>16388</child>
<child>16386</child>
<child>16401</child>
<child>16430</child>
<child>16448</child>
</directory>
<directory>
<parent>16386</parent>
<child>16387</child>
</directory>
<directory>
<parent>16388</parent>
<child>16389</child>
<child>16425</child>
</directory>
<directory>
<parent>16389</parent>
<child>16390</child>
</directory>
<directory>
<parent>16390</parent>
<child>16391</child>
<child>16398</child>
</directory>
<directory>
<parent>16391</parent>
<child>16392</child>
</directory>
<directory>
<parent>16398</parent>
<child>16415</child>
<child>16399</child>
</directory>
<directory>
<parent>16399</parent>
<child>16400</child>
</directory>
<directory>
<parent>16401</parent>
<child>16410</child>
<child>16408</child>
</directory>
<directory>
<parent>16415</parent>
<child>16416</child>
</directory>
<directory>
<parent>16416</parent>
<child>16417</child>
</directory>
<directory>
<parent>16417</parent>
<child>16418</child>
</directory>
<directory>
<parent>16418</parent>
<child>16419</child>
</directory>
<directory>
<parent>16419</parent>
<child>16413</child>
<child>16414</child>
<child>16441</child>
<child>16442</child>
</directory>
<directory>
<parent>16425</parent>
<child>16426</child>
</directory>
<directory>
<parent>16426</parent>
<child>16427</child>
</directory>
<directory>
<parent>16427</parent>
<child>16428</child>
</directory>
<directory>
<parent>16428</parent>
<child>16443</child>
</directory>
<directory>
<parent>16430</parent>
<child>16438</child>
<child>16436</child>
</directory>
<directory>
<parent>16444</parent>
<child>16445</child>
</directory>
<directory>
<parent>16448</parent>
<child>16451</child>
</directory>
<directory>
<parent>16452</parent>
<child>16447</child>
</directory>
</INodeDirectorySection>
<FileUnderConstructionSection></FileUnderConstructionSection>
<SecretManagerSection>
<currentId>0</currentId>
<tokenSequenceNumber>0</tokenSequenceNumber>
<numDelegationKeys>0</numDelegationKeys>
<numTokens>0</numTokens>
</SecretManagerSection>
<CacheManagerSection>
<nextDirectiveId>1</nextDirectiveId>
<numDirectives>0</numDirectives>
<numPools>0</numPools>
</CacheManagerSection>
</fsimage>
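<!-- Presumably this dump was produced with the HDFS offline image viewer, e.g.
     (the exact fsimage file name is an assumption for illustration):
     hdfs oiv -p XML -i fsimage_0000000000000000291 -o fsimage.xml -->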

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="error" strict="true" name="XMLConfig">
<Appenders>
<!-- Appender type is Console; the name attribute is required -->
<Appender type="Console" name="STDOUT">
<!-- Layout uses PatternLayout;
output looks like: [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
<Layout type="PatternLayout"
pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
</Appender>
</Appenders>
<Loggers>
<!-- additivity set to false -->
<Logger name="test" level="info" additivity="false">
<AppenderRef ref="STDOUT" />
</Logger>
<!-- root LoggerConfig settings -->
<Root level="info">
<AppenderRef ref="STDOUT" />
</Root>
</Loggers>
</Configuration>
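<!-- A hedged usage sketch (the class name LogDemo is an assumption; the Log4j2 API calls are standard):
     import org.apache.logging.log4j.LogManager;
     import org.apache.logging.log4j.Logger;

     public class LogDemo {
         private static final Logger LOG = LogManager.getLogger("test"); // matches the named Logger above
         public static void main(String[] args) {
             LOG.info("I'm here"); // printed through the STDOUT console appender
         }
     }
-->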

View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- coordinates of the current Maven module -->
<groupId>com.atguigu.maven</groupId>
<artifactId>Hello</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- dependencies -->
<dependencies>
<!-- JUnit dependency coordinates -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,14 @@
package com.atguigu.maven;
public class Hello {
public static void main(String[] args) {
String STU="SHI";
System.out.println(STU);
}
public String sayHello(String name){
return "Hello "+ name +"!";
}
}

View File

@ -0,0 +1,10 @@
package com.atguigu.maven;
import org.junit.Test;
public class Hello {
}

View File

@ -0,0 +1,12 @@
package com.atguigu.maven;
import org.junit.Test;
public class HelloTest {
@Test
public void testHello(){
Hello hello=new Hello();
String maven =hello.sayHello("maven");
System.out.println(maven);
}
}

View File

@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- inherit from the parent POM -->
<parent>
<groupId>com.atguigu.maven</groupId>
<artifactId>parent</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- relative path to the parent POM -->
<relativePath>../parent/pom.xml</relativePath>
</parent>
<groupId>com.atguigu.maven</groupId>
<artifactId>HelloFriend</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- custom property for managing version numbers in one place; use ${spring_version} wherever the same version is needed -->
<properties>
<spring_version>4.12</spring_version>
</properties>
<!-- dependencies of this project -->
<dependencies>
<!-- dependency on the Hello module -->
<dependency>
<groupId>com.atguigu.maven</groupId>
<artifactId>Hello</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${spring_version}</version>
<!-- the scope tag controls where the dependency applies (main vs. test); compile is the default when omitted -->
<scope>compile</scope>
</dependency>
</dependencies>
</project>
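<!-- Build-order note (a hedged observation, not from the original file): because this module
     depends on Hello 1.0-SNAPSHOT, Hello presumably has to be installed into the local
     repository first, e.g. run "mvn install" in the Hello module before building HelloFriend. -->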

View File

@ -0,0 +1,15 @@
package com.atguigu.maven;
public class HelloFriend {
public String sayHelloToFriend(String name){
Hello hello = new Hello();
String str = hello.sayHello(name)+" I am "+this.getMyName();
return str;
}
public String getMyName(){
return "Idea";
}
}

View File

@ -0,0 +1,14 @@
package com.atguigu.maven;
import org.junit.Test;
public class HelloFriendTest {
@Test
public void testHelloFriend(){
HelloFriend helloFriend = new HelloFriend();
String results = helloFriend.sayHelloToFriend("Maven");
System.out.println(results);
}
}

View File

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- inherit from the parent POM -->
<parent>
<groupId>com.atguigu.maven</groupId>
<artifactId>parent</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- relative path to the parent POM -->
<relativePath>../parent/pom.xml</relativePath>
</parent>
<groupId>com.atguigu.maven</groupId>
<artifactId>Hello_new</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- dependencies -->
<dependencies>
<!-- JUnit dependency coordinates -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>RELEASE</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,9 @@
import java.util.Scanner;
public class Hello {
public String sayHello(String name){
return "Hello "+ name +"!";
}
}

View File

@ -0,0 +1,14 @@
import org.junit.Test;
import java.util.Scanner;
public class TestHello {
@Test
public void testHello(){
Hello hello=new Hello();
String maven =hello.sayHello("maven");
System.out.println(maven);
}
}

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hive</groupId>
<artifactId>Hive</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,68 @@
package com.atguigu.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* Plugin-style development:
* 1. implement the interface or extend the base class
* 2. override the relevant methods
* 3. package into a jar
*
* Custom UDF class.
* Extends the GenericUDF class provided by Hive.
*/
public class CalStringLengthUDF extends GenericUDF {
/**
* Initialization method
* @param objectInspectors object inspectors for the types of the arguments passed to the function
* @return the object inspector for the function's return type
* @throws UDFArgumentException
*/
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
//1. validate the number of arguments
if(objectInspectors==null||objectInspectors.length!=1){
throw new UDFArgumentLengthException("incorrect number of arguments");
}
//2. validate the argument type; getCategory() returns the category of the argument, PRIMITIVE means a primitive type
if(!objectInspectors[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){
throw new UDFArgumentTypeException(0,"incorrect argument type");
}
//3. return the object inspector corresponding to the function's return type
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
}
/**
* Core processing method of the function
* @param deferredObjects the arguments passed to the function
* @return the function's return value
* @throws HiveException
*/
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
//1. get the argument
Object argument = deferredObjects[0].get();
if(argument==null){
return 0;
}
return argument.toString().length();
}
/**
* Controls what is displayed for this function in SQL output (e.g. in explain plans)
* @param strings
* @return
*/
public String getDisplayString(String[] strings) {
return "";
}
}
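// A hedged usage sketch (the jar path and the function name my_len are assumptions for
// illustration; add jar / create temporary function are standard Hive statements):
//
//   hive> add jar /opt/module/hive/datas/hive-udf.jar;
//   hive> create temporary function my_len as "com.atguigu.hive.udf.CalStringLengthUDF";
//   hive> select my_len("hello");   -- returns 5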

View File

@ -0,0 +1,97 @@
package com.atguigu.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* Custom UDTF.
* Extends the GenericUDTF class provided by Hive.
*
* select myudtf("hello-5,world-6,hadoop-7,hive-8",",","-");
* returns:
* hello 5
* world 6
* hadoop 7
* hive 8
*/
public class SplitStringToColRowsUDTF extends GenericUDTF {
private List<String> outs = new ArrayList<String>();
/**
* Initialization method
* @param argOIs
* @return
* @throws UDFArgumentException
*/
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
//1. check the number of arguments
List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
if(allStructFieldRefs.size()!=3){
throw new UDFArgumentLengthException("incorrect number of arguments");
}
//2. check the argument types
for (int i = 0; i < allStructFieldRefs.size(); i++) {
StructField structField = allStructFieldRefs.get(i);
if (!structField.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
throw new UDFArgumentTypeException(i,"incorrect argument type");
}
}
//3. build the return value
//column names of the output
List<String> structFieldNames = new ArrayList<String>();
structFieldNames.add("word");
structFieldNames.add("num");
//column types of the output
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
//if unsure, see how the Hive source does it
//requires the column names and the column types
return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,structFieldObjectInspectors);
}
/**
* Core processing method of the function
* @param args the arguments passed to the function
* @throws HiveException
*/
public void process(Object[] args) throws HiveException {
//1. get the first argument
String words = args[0].toString(); //"hello-5,world-6,hadoop-7,hive-8"
//2. get the second argument
String rowSplit = args[1].toString(); //","
//3. get the third argument
String colSplit = args[2].toString(); //"-"
//4. split
String[] rows = words.split(rowSplit); //[hello-5,world-6,hadoop-7,hive-8]
for (String row : rows) {
String[] cols = row.split(colSplit); //hello 5
outs.clear();
for (String col : cols) {
outs.add(col);
}
forward(outs);
}
}
//cleanup work such as releasing resources
public void close() throws HiveException {
}
}
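// A hedged usage sketch (the jar path and the function name myudtf are assumptions for
// illustration):
//
//   hive> add jar /opt/module/hive/datas/hive-udtf.jar;
//   hive> create temporary function myudtf as "com.atguigu.hive.udtf.SplitStringToColRowsUDTF";
//   hive> select myudtf("hello-5,world-6,hadoop-7,hive-8", ",", "-");  -- emits (word, num) rows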

View File

@ -0,0 +1,89 @@
package com.atguigu.hive.udtf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* Custom UDTF.
* Extends the GenericUDTF class provided by Hive.
*
* select myudtf("hello,world,hadoop,hive",",");
*/
public class SplitStringToRowsUDTF extends GenericUDTF {
private List<String> outs = new ArrayList<String>();
/**
* Initialization method
* @param argOIs
* @return
* @throws UDFArgumentException
*/
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
//1. check the number of arguments
List<? extends StructField> allStructFieldRefs = argOIs.getAllStructFieldRefs();
if(allStructFieldRefs.size()!=2){
throw new UDFArgumentLengthException("incorrect number of arguments");
}
//2. check the argument types
for (int i = 0; i < allStructFieldRefs.size(); i++) {
StructField structField = allStructFieldRefs.get(i);
if (!structField.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
throw new UDFArgumentTypeException(i,"incorrect argument type");
}
}
//3. build the return value
//column names of the output
List<String> structFieldNames = new ArrayList<String>();
structFieldNames.add("word");
//column types of the output
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
//if unsure, see how the Hive source does it
//requires the column names and the column types
return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,structFieldObjectInspectors);
}
/**
* Core processing method of the function
* @param args the arguments passed to the function
* @throws HiveException
*/
public void process(Object[] args) throws HiveException {
//1. get the first argument
String words = args[0].toString(); //"hello,world,hadoop,hive"
//2. get the second argument
String split = args[1].toString(); //","
//3. split
String[] splitWord = words.split(split); //[hello,world,hadoop,hive]
//4. write each word out as one row
for (String word : splitWord) {
//if unsure, look at how the source implements process; it repeatedly calls forward
//how the rows end up on the console is handled further downstream
//forward takes a collection of column values
outs.clear();
outs.add(word);
forward(outs);
}
}
//cleanup work such as releasing resources
public void close() throws HiveException {
}
}

Binary file not shown.

View File

@ -0,0 +1,7 @@
url=jdbc:mysql://localhost:3306/db2
username=root
password=root
driverClassName=com.mysql.jdbc.Driver
initialSize=10
maxActive=20
maxWait=1000
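# Presumably a Druid connection-pool configuration (the keys above match Druid's expected
# property names). A hedged loading sketch in Java (the class name JdbcUtils is an assumption):
#   Properties props = new Properties();
#   props.load(JdbcUtils.class.getClassLoader().getResourceAsStream("jdbc.properties"));
#   DataSource ds = DruidDataSourceFactory.createDataSource(props);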

Some files were not shown because too many files have changed in this diff