Some modified RNN variants
This commit is contained in: parent d3b671b006, commit 418b6e6ee1
@@ -1,5 +1,6 @@
 package com.atguigu.spark.core.rdd.operator.transform
 
+import org.apache.spark.rdd.RDD
 import org.apache.spark.{SparkConf, SparkContext}
 
 object Spark01_RDD_Operator_Transform {
@@ -11,7 +12,7 @@ object Spark01_RDD_Operator_Transform {
     val sc = new SparkContext(sparkConf)
 
     // TODO operator - map
-    val rdd = sc.makeRDD(
+    val rdd: RDD[Int] = sc.makeRDD(
       List(1,2,3,4)
     )
 
@@ -0,0 +1,18 @@
# Linear RNN Variants

Implementations of three fast, parallelizable RNN variants in bert4keras: LRU, SLRU, and RWKV.

## Introduction

- Blog post (in Chinese): https://kexue.fm/archives/9554
- LRU paper: https://arxiv.org/abs/2303.06349
- RWKV repository: https://github.com/BlinkDL/RWKV-LM

## Parallelization

Linear RNNs admit parallel evaluation: the O(L) sequential recurrence can be reduced to O(log L) parallel steps. This project parallelizes the RNN with the "Upper/Lower" algorithm for the prefix-sum problem; a sketch of the idea follows.

For details, see the "[Parallelization](https://kexue.fm/archives/9554#%E5%B9%B6%E8%A1%8C%E5%8C%96)" section of the blog post.
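
As a concrete illustration, here is a minimal NumPy sketch of this kind of prefix scan for the diagonal linear recurrence `x[t] = lamb * x[t-1] + u[t]` (the function name and shapes are illustrative, not this repo's API):

```python
import numpy as np

def parallel_scan(u, lamb):
    """Solve x[t] = lamb * x[t-1] + u[t] in log2(L) merge rounds.
    u: (L, D) with L a power of 2; lamb: (D,) per-channel decay."""
    x, L = u.copy(), len(u)
    l = 2
    while l <= L:  # each round doubles the solved block length
        x = x.reshape(-1, l, x.shape[-1])
        x1, x2 = x[:, :l // 2], x[:, l // 2:]
        pos = np.arange(1, l // 2 + 1)
        # carry the last state of each left half into its right half
        x2 = x2 + lamb ** pos[:, None] * x1[:, -1:]
        x = np.concatenate([x1, x2], axis=1).reshape(-1, x.shape[-1])
        l *= 2
    return x

# agrees with the serial recurrence
u, lamb = np.random.randn(8, 4), np.full(4, 0.9)
x = np.zeros_like(u)
for t in range(8):
    x[t] = lamb * (x[t - 1] if t else 0) + u[t]
assert np.allclose(parallel_scan(u, lamb), x)
```

Each round merges adjacent already-solved blocks in parallel, so the sequential depth is log2(L) rather than L.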
## Contact

QQ group: 808623966; for the WeChat group, add the bot account spaces_ac_cn.
@@ -0,0 +1,8 @@
#-*- encoding:utf-8 -*-

'''
@Author : dingjiawen
@Date   : 2023/6/13 19:13
@Usage  :
@Desc   :
'''
@@ -0,0 +1,117 @@
#! -*- coding: utf-8 -*-
# Linear Recurrent Unit (LRU)
# Tested with tensorflow 1.15 + bert4keras 0.11.4

from bert4keras.layers import *


class LRU(Layer):
    """Linear Recurrent Unit
    Reference 1: https://arxiv.org/abs/2303.06349
    Reference 2: https://kexue.fm/archives/9554
    """
    def __init__(
        self,
        units,
        activation='linear',
        use_bias=True,
        unroll=True,  # unrolling speeds up training but uses more GPU memory
        kernel_initializer='glorot_uniform',
        **kwargs
    ):
        super(LRU, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.unroll = unroll
        self.kernel_initializer = initializers.get(kernel_initializer)

    @integerize_shape
    def build(self, input_shape):
        super(LRU, self).build(input_shape)
        hidden_size = input_shape[-1]
        self.i_dense = Dense(
            units=self.units * 2,
            use_bias=self.use_bias,
            kernel_initializer=self.kernel_initializer
        )
        self.o_dense = Dense(
            units=hidden_size,
            use_bias=self.use_bias,
            activation=self.activation,
            kernel_initializer=self.kernel_initializer
        )

        def initializer(shape, dtype=None):
            # initialize eigenvalues lamb = exp(-exp(nu_log) + i*exp(theta_log))
            # with |lamb| uniform in [r_min, r_max], as in the LRU paper
            r_min, r_max = 0.9, 0.999
            u1 = np.random.random(size=shape[1])
            u2 = np.random.random(size=shape[1])
            nu_log = np.log(
                -0.5 * np.log(u1 * (r_max**2 - r_min**2) + r_min**2)
            )
            theta_log = np.log(u2 * np.pi * 2)
            gamma_log = np.log(np.sqrt(1 - np.exp(-np.exp(nu_log))**2))
            return np.array([nu_log, theta_log, gamma_log])

        self.params_log = self.add_weight(
            name='params_log', shape=(3, self.units), initializer=initializer
        )

    @recompute_grad
    def call(self, inputs, mask=None):
        u = self.i_dense(inputs)
        params = K.exp(self.params_log)
        nu, theta, gamma = params[0], params[1], params[2]

        if self.unroll:
            L_in = K.int_shape(u)[1]
            assert L_in is not None, 'input_length cannot be None when unroll=True'
            log2_L = int(np.ceil(np.log2(L_in)))
        else:
            L_in = K.shape(u)[1]
            log2_L = K.log(K.cast(L_in, K.floatx())) / K.log(2.)
            log2_L = K.cast(tf.ceil(log2_L), 'int32')

        # pack the 2*units real channels into units complex channels,
        # then pad the length up to the next power of 2
        u = tf.complex(u[..., ::2], u[..., 1::2])
        u = tf.pad(u, [[0, 0], [0, 2**log2_L - K.shape(u)[1]], [0, 0]])
        B, L, D = K.shape(u)[0], K.shape(u)[1], K.int_shape(u)[-1]

        def lru(i, x):
            # merge step of the Upper/Lower prefix scan: blocks of length 2**i
            # are solved by carrying the last state of each left half into
            # the corresponding right half
            l = 2**i
            x = K.reshape(x, [B * L // l, l, D])
            x1, x2 = x[:, :l // 2], x[:, l // 2:]

            pos = K.arange(1, l // 2 + 1, dtype=K.floatx())
            nus = tf.einsum('n,d->nd', pos, nu)
            thetas = tf.einsum('n,d->nd', pos, theta)
            lambs = K.exp(tf.complex(-nus, thetas))

            x2 = x2 + lambs * x1[:, -1:]
            x = K.concatenate([x1, x2], axis=1)
            if (not self.unroll) and K.int_shape(u)[1] is not None:
                x = K.reshape(x, [B, L, D])

            return i + 1, x

        if self.unroll:
            x = u
            for i in range(log2_L):
                _, x = lru(i + 1, x)
        else:
            _, x = tf.while_loop(lambda i, x: i <= log2_L, lru, [1, u])

        x = x[:, :L_in] * tf.complex(gamma, 0.)
        x = K.concatenate([tf.real(x), tf.imag(x)], axis=-1)
        return self.o_dense(x)

    def get_config(self):
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'unroll': self.unroll,
            'kernel_initializer':
                initializers.serialize(self.kernel_initializer),
        }
        base_config = super(LRU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
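
# A minimal usage sketch (assumptions: the toy shapes, and that bert4keras is
# backed by plain keras here; under TF_KERAS=1, import from tensorflow.keras):
from keras.layers import Input
from keras.models import Model

x_in = Input(shape=(128, 768))         # (length, hidden); length must be static for unroll=True
x = LRU(units=384, unroll=True)(x_in)  # o_dense projects back to 768, so output matches the input shape
model = Model(x_in, x)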
@@ -0,0 +1,80 @@
#! -*- coding: utf-8 -*-
# RNN-α model implementation
# Tested with tensorflow 1.15 + bert4keras 0.11.4

from bert4keras.models import *
from lru import LRU
from slru import SLRU
from rwkv import RWKV

RNN = LRU  # alternatives: SLRU, RWKV


class RNN_alpha(RoFormerV2):
    """RNN-α
    Change: the basic attention block is replaced with an RNN
    """
    def initializer(self, shape, dtype=None, order=2, gain=1.0):
        return super(RNN_alpha, self).initializer(shape, dtype, order, gain)

    def apply_main_layers(self, inputs, index):
        """The main body of RNN-α is an RNN-based block
        Order: RNN --> Add --> LN --> FFN --> Add --> LN
        """
        x = inputs
        rnn_name = 'Transformer-%d-RNN' % index
        ffn_name = 'Transformer-%d-FFN' % index

        xi = x
        x = self.apply(
            inputs=x,
            layer=RNN,
            units=(2 if RNN is SLRU else 1) * self.hidden_size,
            use_bias=False,
            kernel_initializer=self.initializer,
            name=rnn_name
        )
        x = self.apply(
            inputs=x,
            layer=Dropout,
            rate=self.dropout_rate,
            name='%s-Dropout' % rnn_name
        )
        x = self.apply(inputs=[xi, x], layer=Add, name='%s-Add' % rnn_name)
        x = self.apply(
            inputs=x,
            layer=LayerNormalization,
            zero_mean=False,
            scale=False,
            offset=False,
            epsilon=1e-12,
            name='%s-Norm' % rnn_name
        )

        xi = x
        x = self.apply(
            inputs=x,
            layer=FeedForward,
            units=self.intermediate_size,
            kernel_initializer=self.initializer,
            use_bias=False,
            name=ffn_name
        )
        x = self.apply(
            inputs=x,
            layer=Dropout,
            rate=self.dropout_rate,
            name='%s-Dropout' % ffn_name
        )
        x = self.apply(inputs=[xi, x], layer=Add, name='%s-Add' % ffn_name)
        x = self.apply(
            inputs=x,
            layer=LayerNormalization,
            zero_mean=False,
            scale=False,
            offset=False,
            epsilon=1e-12,
            name='%s-Norm' % ffn_name
        )

        return x
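
# A hypothetical build sketch: 'config.json' is a placeholder path, and this
# assumes a bert4keras version whose build_transformer_model accepts a model
# class (rather than a registered name string) for the `model` argument:
from bert4keras.models import build_transformer_model

model = build_transformer_model(
    config_path='config.json',  # RoFormerV2-style config (placeholder)
    model=RNN_alpha,
    return_keras_model=True,
)
model.summary()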
@@ -0,0 +1,111 @@
#! -*- coding: utf-8 -*-
# RWKV
# Tested with tensorflow 1.15 + bert4keras 0.11.4

from bert4keras.layers import *


class RWKV(Layer):
    """RWKV
    Reference 1: https://github.com/BlinkDL/RWKV-LM
    Reference 2: https://kexue.fm/archives/9554
    """
    def __init__(
        self,
        units,
        use_bias=True,
        unroll=True,  # unrolling speeds up training but uses more GPU memory
        kernel_initializer='glorot_uniform',
        **kwargs
    ):
        super(RWKV, self).__init__(**kwargs)
        self.units = units
        self.use_bias = use_bias
        self.unroll = unroll
        self.kernel_initializer = initializers.get(kernel_initializer)

    @integerize_shape
    def build(self, input_shape):
        super(RWKV, self).build(input_shape)
        hidden_size = input_shape[-1]
        self.rkv_dense = Dense(
            units=self.units * 3,
            use_bias=self.use_bias,
            kernel_initializer=self.kernel_initializer
        )
        self.o_dense = Dense(
            units=hidden_size,
            use_bias=self.use_bias,
            kernel_initializer=self.kernel_initializer
        )

        def initializer(shape, dtype=None):
            # decay rates exp(-exp(nu_log)) initialized uniformly in [r_min, r_max]
            r_min, r_max = 0.9, 0.999
            u = np.random.random(size=shape)
            return np.log(-0.5 * np.log(u * (r_max**2 - r_min**2) + r_min**2))

        self.nu_log = self.add_weight(
            name='nu_log', shape=(self.units,), initializer=initializer
        )
        self.gamma_log = self.add_weight(
            name='gamma_log', shape=(self.units,), initializer='zeros'
        )

    @recompute_grad
    def call(self, inputs, mask=None):
        rkv = self.rkv_dense(inputs)
        r, k, v = tf.split(rkv, 3, axis=-1)
        r, k = K.sigmoid(r), K.exp(k)
        kv = k * v
        # scan the k*v and k streams together, sharing one decay rate
        u = K.concatenate([kv, k], axis=-1)
        nu = K.exp(K.concatenate([self.nu_log, self.nu_log], axis=0))
        gamma = K.exp(self.nu_log + self.gamma_log) - 1  # extra weight for the current token

        if self.unroll:
            L_in = K.int_shape(u)[1]
            assert L_in is not None, 'input_length cannot be None when unroll=True'
            log2_L = int(np.ceil(np.log2(L_in)))
        else:
            L_in = K.shape(u)[1]
            log2_L = K.log(K.cast(L_in, K.floatx())) / K.log(2.)
            log2_L = K.cast(tf.ceil(log2_L), 'int32')

        u = tf.pad(u, [[0, 0], [0, 2**log2_L - K.shape(u)[1]], [0, 0]])
        B, L, D = K.shape(u)[0], K.shape(u)[1], K.int_shape(u)[-1]

        def rwkv(i, x):
            # merge step of the Upper/Lower prefix scan (see lru.py)
            l = 2**i
            x = K.reshape(x, [B * L // l, l, D])
            x1, x2 = x[:, :l // 2], x[:, l // 2:]

            pos = K.arange(1, l // 2 + 1, dtype=K.floatx())
            nus = tf.einsum('n,d->nd', pos, nu)
            lambs = K.exp(-nus)

            x2 = x2 + lambs * x1[:, -1:]
            x = K.concatenate([x1, x2], axis=1)
            if (not self.unroll) and K.int_shape(u)[1] is not None:
                x = K.reshape(x, [B, L, D])

            return i + 1, x

        if self.unroll:
            for i in range(log2_L):
                _, u = rwkv(i + 1, u)
        else:
            _, u = tf.while_loop(lambda i, x: i <= log2_L, rwkv, [1, u])

        u1, u2 = tf.split(u[:, :L_in], 2, axis=-1)
        u = tf.math.divide_no_nan(u1 + gamma * kv, u2 + gamma * k) * r
        return self.o_dense(u)

    def get_config(self):
        config = {
            'units': self.units,
            'use_bias': self.use_bias,
            'unroll': self.unroll,
            'kernel_initializer':
                initializers.serialize(self.kernel_initializer),
        }
        base_config = super(RWKV, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
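
# To make the scanned quantity concrete, a serial NumPy reference for what
# `call` computes per example (a sketch; r, k, v stand for the
# post-activation tensors of shape (L, D), all names illustrative):
import numpy as np

def rwkv_serial(r, k, v, nu, gamma):
    """out[t] = r[t] * (sum_{s<=t} exp(-nu*(t-s)) * k[s]*v[s] + gamma*k[t]*v[t])
                     / (sum_{s<=t} exp(-nu*(t-s)) * k[s]      + gamma*k[t])"""
    num, den = np.zeros(k.shape[-1]), np.zeros(k.shape[-1])
    out, lamb = np.zeros_like(v), np.exp(-nu)
    for t in range(len(k)):
        num = lamb * num + k[t] * v[t]  # decayed running sum of k*v
        den = lamb * den + k[t]         # decayed running sum of k
        # gamma gives the current token RWKV's extra "bonus" weight
        out[t] = r[t] * (num + gamma * k[t] * v[t]) / (den + gamma * k[t])
    return out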
@@ -0,0 +1,110 @@
#! -*- coding: utf-8 -*-
# Simplified Linear Recurrent Unit (SLRU)
# Tested with tensorflow 1.15 + bert4keras 0.11.4

from bert4keras.layers import *


class SLRU(Layer):
    """Real-valued Linear Recurrent Unit
    Reference 1: https://arxiv.org/abs/2303.06349
    Reference 2: https://kexue.fm/archives/9554
    """
    def __init__(
        self,
        units,
        activation='linear',
        use_bias=True,
        unroll=True,  # unrolling speeds up training but uses more GPU memory
        kernel_initializer='glorot_uniform',
        **kwargs
    ):
        super(SLRU, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.unroll = unroll
        self.kernel_initializer = initializers.get(kernel_initializer)

    @integerize_shape
    def build(self, input_shape):
        super(SLRU, self).build(input_shape)
        hidden_size = input_shape[-1]
        self.i_dense = Dense(
            units=self.units,
            use_bias=self.use_bias,
            kernel_initializer=self.kernel_initializer
        )
        self.o_dense = Dense(
            units=hidden_size,
            use_bias=self.use_bias,
            activation=self.activation,
            kernel_initializer=self.kernel_initializer
        )

        def initializer(shape, dtype=None):
            # real decay rates exp(-exp(nu_log)) uniform in [r_min, r_max]
            r_min, r_max = 0.9, 0.999
            u = np.random.random(size=shape[1])
            nu_log = np.log(-0.5 * np.log(u * (r_max**2 - r_min**2) + r_min**2))
            gamma_log = np.log(np.sqrt(1 - np.exp(-np.exp(nu_log))**2))
            return np.array([nu_log, gamma_log])

        self.params_log = self.add_weight(
            name='params_log', shape=(2, self.units), initializer=initializer
        )

    @recompute_grad
    def call(self, inputs, mask=None):
        u = self.i_dense(inputs)
        params = K.exp(self.params_log)
        nu, gamma = params[0], params[1]

        if self.unroll:
            L_in = K.int_shape(u)[1]
            assert L_in is not None, 'input_length cannot be None when unroll=True'
            log2_L = int(np.ceil(np.log2(L_in)))
        else:
            L_in = K.shape(u)[1]
            log2_L = K.log(K.cast(L_in, K.floatx())) / K.log(2.)
            log2_L = K.cast(tf.ceil(log2_L), 'int32')

        u = tf.pad(u, [[0, 0], [0, 2**log2_L - K.shape(u)[1]], [0, 0]])
        B, L, D = K.shape(u)[0], K.shape(u)[1], K.int_shape(u)[-1]

        def lru(i, x):
            # merge step of the Upper/Lower prefix scan (see lru.py)
            l = 2**i
            x = K.reshape(x, [B * L // l, l, D])
            x1, x2 = x[:, :l // 2], x[:, l // 2:]

            pos = K.arange(1, l // 2 + 1, dtype=K.floatx())
            nus = tf.einsum('n,d->nd', pos, nu)
            lambs = K.exp(-nus)

            x2 = x2 + lambs * x1[:, -1:]
            x = K.concatenate([x1, x2], axis=1)
            if (not self.unroll) and K.int_shape(u)[1] is not None:
                x = K.reshape(x, [B, L, D])

            return i + 1, x

        if self.unroll:
            x = u
            for i in range(log2_L):
                _, x = lru(i + 1, x)
        else:
            _, x = tf.while_loop(lambda i, x: i <= log2_L, lru, [1, u])

        x = x[:, :L_in] * gamma
        return self.o_dense(x)

    def get_config(self):
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'unroll': self.unroll,
            'kernel_initializer':
                initializers.serialize(self.kernel_initializer),
        }
        base_config = super(SLRU, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
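
# For reference, the serial recurrence that SLRU parallelizes, as a NumPy
# sketch (per-example shapes, names illustrative):
import numpy as np

def slru_serial(u, nu, gamma):
    """x[t] = exp(-nu) * x[t-1] + u[t]; the layer then returns gamma * x."""
    x = np.zeros_like(u)
    for t in range(len(u)):
        x[t] = np.exp(-nu) * (x[t - 1] if t else 0) + u[t]
    return gamma * x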