# self_example/TensorFlow_eaxmple/transformer_realize/model.py

import tensorflow as tf
from tensorflow.keras import layers, models, initializers


class PosEmbed(layers.Layer):
    def __init__(self, embed_dim=768, num_patches=196, name=None):
        super(PosEmbed, self).__init__(name=name)
        self.embed_dim = embed_dim
        self.num_patches = num_patches

    def build(self, input_shape):
        # Create two trainable weights: the class token (cls_token) and the
        # position embedding (pos_embed). The leading dimension of 1 is the
        # batch axis; with the defaults the shapes are [1, 1, 768] and [1, 197, 768].
        self.cls_token = self.add_weight(name="cls",
                                         shape=[1, 1, self.embed_dim],
                                         initializer=initializers.Zeros(),
                                         trainable=True,
                                         dtype=tf.float32)
        self.pos_embed = self.add_weight(name="pos_embed",
                                         shape=[1, self.num_patches + 1, self.embed_dim],
                                         initializer=initializers.RandomNormal(stddev=0.02),
                                         trainable=True,
                                         dtype=tf.float32)

    def call(self, inputs, **kwargs):
        # The batch size is needed because the layer receives a whole batch of
        # images, not a single image. tf.shape works even when the static
        # batch dimension is unknown (None).
        batch_size = tf.shape(inputs)[0]
        # [1, 1, 768] -> [B, 1, 768]
        # Broadcast cls_token along the batch axis so one copy is concatenated
        # with every example in the batch.
        cls_token = tf.broadcast_to(self.cls_token, shape=[batch_size, 1, self.embed_dim])
        # Concatenate the class token with the input patch embeddings.
        x = tf.concat([cls_token, inputs], axis=1)  # [B, 197, 768]
        # Add the position embedding.
        x = x + self.pos_embed
        return x
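
# Shape sketch for PosEmbed (an illustration, assuming the defaults above):
# an input of patch embeddings [B, num_patches, embed_dim] gains one class
# token plus a learned position embedding, e.g.
#     PosEmbed(embed_dim=768, num_patches=196)(x)   # x: [B, 196, 768]
# returns a tensor of shape [B, 197, 768].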


class attention(layers.Layer):
    # Shared initializers for the dense layers (the original referenced
    # self.k_ini / self.b_ini without defining them).
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 name=None):
        super(attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        B, N, C = inputs.shape
        kqv = self.kqv(inputs)                                  # [B, N, 3*C]
        # Use -1 for the batch axis so the reshape also works when B is None.
        kqv = tf.reshape(kqv, [-1, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])                # [3, B, heads, N, head_dim]
        q, k, v = kqv[0], kqv[1], kqv[2]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale   # [B, heads, N, N]
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        x = tf.matmul(drop, v)                                  # [B, heads, N, head_dim]
        x = tf.transpose(x, [0, 2, 1, 3])                       # [B, N, heads, head_dim]
        x = tf.reshape(x, [-1, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Encoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0., name=None):
        super(Encoder, self).__init__(name=name)
        self.norm0 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_0")
        self.attn = attention(dim, head_num, qkv_bias, qk_scale,
                              attn_drop_ratio=attn_drop_ratio,
                              proj_drop_ratio=drop_path_ratio,
                              name="multi_head_self_attention")
        self.drop1 = layers.Dropout(drop_ratio)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")

    def call(self, inputs, **kwargs):
        # Self-attention sub-block with a residual connection.
        x = self.norm0(inputs)
        x = self.attn(x)
        x = self.drop1(x)
        x = self.norm1(x)
        x = inputs + x
        # Feed-forward (MLP) sub-block with a residual connection.
        y = self.dense1(x)
        y = self.act(y)
        y = self.dense2(y)
        y = self.drop2(y)
        y = self.norm2(y)
        y = x + y
        return y
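
# Each Encoder block is shape-preserving: a [B, N, dim] input comes back as
# [B, N, dim], so the block can be stacked or, as in Transformer.call below,
# applied repeatedly in a loop. A rough sketch, assuming dim=768:
#     blk = Encoder(dim=768, head_num=8)
#     out = blk(tf.zeros([2, 197, 768]))   # -> shape (2, 197, 768)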


class Mask_attention(layers.Layer):
    # Shared initializers (the original referenced self.k_ini / self.b_ini
    # without defining them).
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 name=None):
        super(Mask_attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        B, N, C = inputs.shape
        kqv = self.kqv(inputs)
        kqv = tf.reshape(kqv, [-1, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])
        q, k, v = kqv[0], kqv[1], kqv[2]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale   # [B, heads, N, N]
        # Causal mask. tf.linalg.band_part(a, num_lower, num_upper) keeps
        # num_lower sub-diagonals and num_upper super-diagonals (a negative
        # value keeps the whole triangle), so band_part(ones, -1, 0) is a
        # lower-triangular matrix of ones. The mask must match the [N, N]
        # attention scores per head; masked (upper-triangular) positions are
        # pushed to a large negative value before the softmax so they receive
        # zero weight. The [N, N] mask broadcasts over batch and head axes.
        mask = tf.linalg.band_part(tf.ones((N, N)), -1, 0)
        attn = attn + (1.0 - mask) * -1e9
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        x = tf.matmul(drop, v)
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [-1, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
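
# Illustration of the causal mask for N = 3 (a sketch):
#     band_part(ones, -1, 0) = [[1, 0, 0],
#                               [1, 1, 0],
#                               [1, 1, 1]]
# Row i (the query position) may only attend to columns 0..i, i.e. each
# position sees itself and earlier positions, never later ones.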


class Decoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0., name=None):
        super(Decoder, self).__init__(name=name)
        self.mask_attention = Mask_attention(dim, head_num, qkv_bias,
                                             qk_scale, attn_drop_ratio=attn_drop_ratio,
                                             proj_drop_ratio=drop_path_ratio, name="Mask_self_attention")
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        self.attention = attention(dim, head_num, qkv_bias,
                                   qk_scale, attn_drop_ratio=attn_drop_ratio,
                                   proj_drop_ratio=drop_path_ratio, name="Multi_head_self_attention")
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")
        self.drop1 = layers.Dropout(drop_ratio)
        self.encoder = Encoder(dim, head_num=head_num,
                               qkv_bias=qkv_bias,
                               qk_scale=qk_scale,
                               drop_ratio=drop_ratio,
                               attn_drop_ratio=attn_drop_ratio,
                               drop_path_ratio=drop_path_ratio, name=None)
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm3 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_3")

    def call(self, inputs, input2, **kwargs):
        # Masked self-attention sub-block over the decoder input.
        x = self.norm1(inputs)
        y = self.mask_attention(x)
        y = self.drop1(y)
        y = self.norm2(y)
        y = inputs + y
        # Mix in the encoded second input, then attend over the sum.
        z = y + self.encoder(input2)
        z = self.attention(z)
        z = self.drop2(z)
        z = self.norm3(z)
        z = z + y
        # Feed-forward (MLP) sub-block with a residual connection.
        k = self.dense1(z)
        k = self.act(k)
        k = self.dense2(k)
        k = self.drop2(k)
        k = k + z
        return k


class Transformer(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, train_dim=224, label_dim=234, patch_size1=16, patch_size2=16, embed_dim=768, embed_dim2=768,
                 depth1=12, depth2=12, num_heads=12, qkv_bias=True, qk_scale=None,
                 drop_ratio=0., attn_drop_ratio=0., drop_path_ratio=0.,
                 representation_size=None, num_classes=1000, name="Transformer"):
        super(Transformer, self).__init__(name=name)
        self.depth1 = depth1
        self.depth2 = depth2
        # Note: patch_size1 / patch_size2 are passed to PosEmbed as the number
        # of patches (sequence length), not as the pixel size of a patch.
        self.PosEmbed1 = PosEmbed(embed_dim=embed_dim, num_patches=patch_size1, name="posEmbed_1")
        self.PosEmbed2 = PosEmbed(embed_dim=embed_dim, num_patches=patch_size2, name="posEmbed_2")
        self.encoder = Encoder(dim=embed_dim, head_num=num_heads,
                               qkv_bias=qkv_bias, qk_scale=qk_scale, drop_ratio=drop_ratio,
                               attn_drop_ratio=attn_drop_ratio, drop_path_ratio=drop_path_ratio, name="Encoder")
        self.decoder = Decoder(dim=embed_dim2, head_num=num_heads,
                               qkv_bias=qkv_bias, qk_scale=qk_scale, drop_ratio=drop_ratio,
                               attn_drop_ratio=attn_drop_ratio, drop_path_ratio=drop_path_ratio, name="Decoder")
        self.linear = layers.Dense(num_classes, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.softmax = layers.Activation("softmax")
        self.dropout1 = layers.Dropout(drop_ratio)

    def call(self, inputs1, input2, **kwargs):
        # Encoder branch: the same Encoder layer is applied depth1 times, so
        # its weights are shared across the stacked iterations.
        x = self.PosEmbed1(inputs1)
        x = self.dropout1(x)
        for _ in range(self.depth1):
            x = self.encoder(x)
            x = self.dropout1(x)
        # Decoder branch: likewise, the single Decoder layer is reused depth2 times.
        y = self.PosEmbed2(input2)
        y = self.dropout1(y)
        for _ in range(self.depth2):
            y = self.decoder(y, x)
            y = self.dropout1(y)
        # Classification head.
        z = self.linear(y)
        z = self.softmax(z)
        return z
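

# A minimal smoke test (a sketch, not part of the training pipeline). It
# assumes the two inputs are pre-computed patch embeddings of shape
# [batch, patch_size1, embed_dim] and [batch, patch_size2, embed_dim]; a real
# pipeline would produce these with a separate patch-embedding layer.
if __name__ == "__main__":
    model = Transformer(patch_size1=16, patch_size2=16, embed_dim=768,
                        depth1=2, depth2=2, num_heads=8, num_classes=10)
    dummy_src = tf.random.normal([2, 16, 768])   # encoder input
    dummy_tgt = tf.random.normal([2, 16, 768])   # decoder input
    out = model(dummy_src, dummy_tgt)
    print(out.shape)  # expected: (2, 17, 10) -- 16 patches + 1 cls token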