import tensorflow as tf
from tensorflow.keras import layers, models, initializers


class PosEmbed(layers.Layer):

    def __init__(self, embed_dim=768, num_patches=196, name=None):
        super(PosEmbed, self).__init__(name=name)
        self.embed_dim = embed_dim
        self.num_patches = num_patches

    def build(self, input_shape):
        # Create two trainable weights: one for cls_token and one for pos_embed.
        # The first dimension of each shape is the batch dimension; the remaining
        # shapes are 1x768 (cls_token) and 197x768 (pos_embed) for the defaults.
        # trainable=True marks them as learnable parameters.
        self.cls_token = self.add_weight(name="cls",
                                         shape=[1, 1, self.embed_dim],
                                         initializer=initializers.Zeros(),
                                         trainable=True,
                                         dtype=tf.float32)
        self.pos_embed = self.add_weight(name="pos_embed",
                                         shape=[1, self.num_patches + 1, self.embed_dim],
                                         initializer=initializers.RandomNormal(stddev=0.02),
                                         trainable=True,
                                         dtype=tf.float32)

    def call(self, inputs, **kwargs):
        # Get the batch size: images are fed in as whole batches, not one at a time.
        batch_size, _, _ = inputs.shape

        # [1, 1, 768] -> [B, 1, 768]
        # Broadcast cls_token along the batch dimension so it can be concatenated
        # with every sample in the batch.
        cls_token = tf.broadcast_to(self.cls_token, shape=[batch_size, 1, self.embed_dim])
        # Concatenate the class token with the input tokens.
        x = tf.concat([cls_token, inputs], axis=1)  # [B, 197, 768]
        # Add the position embedding.
        x = x + self.pos_embed

        return x

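def _posembed_shape_check():
    # Hedged usage sketch (not part of the original file): verifies that PosEmbed maps
    # [B, num_patches, embed_dim] -> [B, num_patches + 1, embed_dim]. Assumes the input
    # has already been patch-embedded; that step is not defined in this file.
    pe = PosEmbed(embed_dim=768, num_patches=196)
    x = tf.random.normal([2, 196, 768])
    return pe(x).shape  # expected: TensorShape([2, 197, 768])

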
class attention(layers.Layer):
    # Initializers for the qkv/out Dense layers (same pattern as the Encoder class below).
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 name=None):
        super(attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        B, N, C = inputs.shape
        # Project to q, k, v in a single Dense, then split into heads:
        # [B, N, 3*C] -> [B, N, 3, head_num, C // head_num] -> [3, B, head_num, N, C // head_num]
        kqv = self.kqv(inputs)
        kqv = tf.reshape(kqv, [B, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])
        q, k, v = kqv[0], kqv[1], kqv[2]
        # Scaled dot-product attention: [B, head_num, N, N]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        # Weighted sum of values, then merge the heads back to [B, N, C]
        x = tf.matmul(drop, v)
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

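def _attention_shape_walkthrough():
    # Hedged sketch (not part of the original file): traces the tensor shapes through the
    # attention layer above, assuming dim=768, head_num=12 and a sequence length of 197.
    # inputs:             [B, 197, 768]
    # kqv Dense(3*dim):   [B, 197, 2304] -> reshape [B, 197, 3, 12, 64] -> transpose [3, B, 12, 197, 64]
    # q @ k^T * scale:    [B, 12, 197, 197]
    # softmax(attn) @ v:  [B, 12, 197, 64] -> transpose/reshape -> [B, 197, 768]
    attn_layer = attention(dim=768, head_num=12, qkv_bias=True)
    x = tf.random.normal([2, 197, 768])
    return attn_layer(x).shape  # expected: TensorShape([2, 197, 768])

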
class Encoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0., name=None):
        super(Encoder, self).__init__(name=name)
        self.norm0 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_0")
        self.attn = attention(dim, head_num, qkv_bias, qk_scale,
                              attn_drop_ratio=attn_drop_ratio,
                              proj_drop_ratio=drop_path_ratio,
                              name="Multi_head_self_attention")
        self.drop1 = layers.Dropout(drop_ratio)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        # MLP block: expand to 4 * dim, apply the non-linearity, project back to dim.
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")

    def call(self, inputs, **kwargs):
        # Attention sub-block with a residual connection
        x = self.norm0(inputs)
        x = self.attn(x)
        x = self.drop1(x)
        x = self.norm1(x)
        x = inputs + x
        # MLP sub-block with a residual connection
        y = self.dense1(x)
        y = self.act(y)
        y = self.dense2(y)
        y = self.drop2(y)
        y = self.norm2(y)
        y = x + y
        return y


class Mask_attention(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False,
                 qk_scale=None,
                 attn_drop_ratio=0.,
                 proj_drop_ratio=0.,
                 name=None):
        super(Mask_attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        B, N, C = inputs.shape
        kqv = self.kqv(inputs)
        kqv = tf.reshape(kqv, [B, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])
        # tf.linalg.band_part(a, num_lower, num_upper):
        #   a:         the tensor to pass in
        #   num_lower: number of subdiagonals to keep; if negative, keep the entire lower triangle
        #   num_upper: number of superdiagonals to keep; if negative, keep the entire upper triangle
        # Keeping the lower triangle of an all-ones [N, N] matrix gives the causal mask;
        # it broadcasts over the batch and head dimensions of the attention logits.
        mask = tf.linalg.band_part(tf.ones((N, N)), -1, 0)
        q, k, v = kqv[0], kqv[1], kqv[2]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale  # [B, head_num, N, N]
        # Block attention to future positions by pushing their logits to a large negative value
        attn = attn * mask + (1.0 - mask) * -1e9
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        x = tf.matmul(drop, v)
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

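def _causal_mask_example():
    # Hedged illustration (not part of the original file): the lower-triangular causal mask
    # used above, shown for a sequence length of 4.
    # tf.linalg.band_part(tf.ones((4, 4)), -1, 0) ->
    # [[1., 0., 0., 0.],
    #  [1., 1., 0., 0.],
    #  [1., 1., 1., 0.],
    #  [1., 1., 1., 1.]]
    return tf.linalg.band_part(tf.ones((4, 4)), -1, 0)

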
class Decoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_ratio=0.,
                 attn_drop_ratio=0.,
                 drop_path_ratio=0., name=None):
        super(Decoder, self).__init__(name=name)
        self.mask_attention = Mask_attention(dim, 8, qkv_bias,
                                             qk_scale, attn_drop_ratio=attn_drop_ratio,
                                             proj_drop_ratio=drop_path_ratio, name="Mask_self_attention")
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        self.attention = attention(dim, 8, qkv_bias,
                                   qk_scale, attn_drop_ratio=attn_drop_ratio,
                                   proj_drop_ratio=drop_path_ratio, name="Multi_head_self_attention")
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")
        self.drop1 = layers.Dropout(drop_ratio)
        self.encoder = Encoder(dim, head_num=8,
                               qkv_bias=False,
                               qk_scale=None,
                               drop_ratio=0.,
                               attn_drop_ratio=0.,
                               drop_path_ratio=0., name=None)
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm3 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_3")

    def call(self, inputs, input2, **kwargs):
        # Masked self-attention sub-block over the decoder input, with a residual connection
        x = self.norm1(inputs)
        y = self.mask_attention(x)
        y = self.drop1(y)
        y = self.norm2(y)
        y = inputs + y
        # Cross sub-block: mix in the encoded memory (input2), attend, then add a residual
        z = y + self.encoder(input2)
        z = self.attention(z)
        z = self.drop2(z)
        z = self.norm3(z)
        z = z + y
        # MLP sub-block with a residual connection
        k = self.dense1(z)
        k = self.act(k)
        k = self.dense2(k)
        k = self.drop2(k)
        k = k + z
        return k


class Transformer(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, train_dim=224, label_dim=234, patch_size1=16, patch_size2=16, embed_dim=768, embed_dim2=768,
                 depth1=12, depth2=12, num_heads=12, qkv_bias=True, qk_scale=None,
                 drop_ratio=0., attn_drop_ratio=0., drop_path_ratio=0.,
                 representation_size=None, num_classes=1000, name="Transformer"):
        super(Transformer, self).__init__(name=name)
        self.depth1 = depth1
        self.depth2 = depth2
        self.PosEmbed1 = PosEmbed(embed_dim=embed_dim, num_patches=patch_size1, name="posEmbed1")
        self.PosEmbed2 = PosEmbed(embed_dim=embed_dim, num_patches=patch_size2, name="posEmbed2")
        # Note: the same Encoder / Decoder instance is reused at every depth step in call(),
        # so the weights are shared across layers.
        self.encoder = Encoder(dim=embed_dim, head_num=num_heads,
                               qkv_bias=False, qk_scale=None, drop_ratio=drop_ratio,
                               attn_drop_ratio=attn_drop_ratio, drop_path_ratio=drop_path_ratio, name="Encoder")
        self.decoder = Decoder(dim=embed_dim2, head_num=num_heads,
                               qkv_bias=False, qk_scale=None, drop_ratio=drop_ratio,
                               attn_drop_ratio=attn_drop_ratio, drop_path_ratio=drop_path_ratio, name="Decoder")
        self.linear = layers.Dense(num_classes, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.softmax = layers.Activation("softmax")
        self.dropout1 = layers.Dropout(drop_ratio)

    def call(self, inputs1, input2, **kwargs):
        # Encoder branch
        x = self.PosEmbed1(inputs1)
        x = self.dropout1(x)
        for _ in range(self.depth1):
            x = self.encoder(x)
            x = self.dropout1(x)
        # Decoder branch, conditioned on the encoder output
        y = self.PosEmbed2(input2)
        y = self.dropout1(y)
        for _ in range(self.depth2):
            y = self.decoder(y, x)
            y = self.dropout1(y)
        # Classification head
        z = self.linear(y)
        z = self.softmax(z)
        return z
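

# Hedged smoke test (not part of the original file). It assumes both inputs are already
# patch-embedded to [batch, num_patches, embed_dim]; the patch-embedding step itself is not
# defined in this file, and the small depths are chosen only to keep the test fast.
if __name__ == "__main__":
    model = Transformer(patch_size1=16, patch_size2=16, embed_dim=768, embed_dim2=768,
                        depth1=2, depth2=2, num_heads=12, num_classes=1000)
    src = tf.random.normal([2, 16, 768])  # PosEmbed1 is built with num_patches=patch_size1
    tgt = tf.random.normal([2, 16, 768])  # PosEmbed2 is built with num_patches=patch_size2
    out = model(src, tgt)
    print(out.shape)  # expected: (2, 17, 1000)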