import tensorflow as tf
from tensorflow.keras import layers, models, initializers


class PosEmbed(layers.Layer):
    def __init__(self, embed_dim=768, num_patches=196, name=None):
        super(PosEmbed, self).__init__(name=name)
        self.embed_dim = embed_dim
        self.num_patches = num_patches

    def build(self, input_shape):
        # Two trainable weights: the cls token and the position embedding.
        # The leading dimension of 1 is broadcast over the batch; the shapes are
        # [1, 1, embed_dim] and [1, num_patches + 1, embed_dim] (e.g. 1x768 and 197x768).
        self.cls_token = self.add_weight(name="cls",
                                         shape=[1, 1, self.embed_dim],
                                         initializer=initializers.Zeros(),
                                         trainable=True,
                                         dtype=tf.float32)
        self.pos_embed = self.add_weight(name="pos_embed",
                                         shape=[1, self.num_patches + 1, self.embed_dim],
                                         initializer=initializers.RandomNormal(stddev=0.02),
                                         trainable=True,
                                         dtype=tf.float32)

    def call(self, inputs, **kwargs):
        # Images arrive as a batch, so the cls token has to be replicated to the
        # actual batch size before it can be concatenated with the patch tokens.
        batch_size = tf.shape(inputs)[0]
        # [1, 1, 768] -> [B, 1, 768]
        cls_token = tf.broadcast_to(self.cls_token, shape=[batch_size, 1, self.embed_dim])
        # Prepend the cls token to the patch tokens: [B, 196, 768] -> [B, 197, 768]
        x = tf.concat([cls_token, inputs], axis=1)
        # Add the position embedding
        x = x + self.pos_embed
        return x


class attention(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False, qk_scale=None,
                 attn_drop_ratio=0., proj_drop_ratio=0., name=None):
        super(attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        # N = number of tokens, C = embedding dimension (the batch size may be dynamic)
        _, N, C = inputs.shape
        kqv = self.kqv(inputs)                                                 # [B, N, 3C]
        kqv = tf.reshape(kqv, [-1, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])                               # [3, B, heads, N, C/heads]
        q, k, v = kqv[0], kqv[1], kqv[2]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale                  # [B, heads, N, N]
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        x = tf.matmul(drop, v)                                                 # [B, heads, N, C/heads]
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [-1, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Encoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8, qkv_bias=False, qk_scale=None,
                 drop_ratio=0., attn_drop_ratio=0., drop_path_ratio=0., name=None):
        super(Encoder, self).__init__(name=name)
        self.norm0 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_0")
        self.attn = attention(dim, head_num, qkv_bias, qk_scale,
                              attn_drop_ratio=attn_drop_ratio,
                              proj_drop_ratio=drop_path_ratio,
                              name="Multi_head_self_attention")
        self.drop1 = layers.Dropout(drop_ratio)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")

    def call(self, inputs, **kwargs):
        # Multi-head self-attention sub-block with a residual connection
        x = self.norm0(inputs)
        x = self.attn(x)
        x = self.drop1(x)
        x = self.norm1(x)
        x = inputs + x
        # Feed-forward (MLP) sub-block with a residual connection
        y = self.dense1(x)
        y = self.act(y)
        y = self.dense2(y)
        y = self.drop2(y)
        y = self.norm2(y)
        y = x + y
        return y
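
# A minimal, illustrative shape check -- not part of the original model. It assumes
# the default ViT-Base sizes (196 patches of dimension 768) and shows how PosEmbed
# prepends the cls token before a single Encoder block is applied.
def _demo_pos_embed_and_encoder():
    dummy = tf.random.normal([2, 196, 768])                    # [B, num_patches, embed_dim]
    tokens = PosEmbed(embed_dim=768, num_patches=196)(dummy)   # -> [2, 197, 768]
    encoded = Encoder(dim=768, head_num=8)(tokens)             # -> [2, 197, 768]
    return tokens.shape, encoded.shape
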
class Mask_attention(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num, qkv_bias=False, qk_scale=None,
                 attn_drop_ratio=0., proj_drop_ratio=0., name=None):
        super(Mask_attention, self).__init__(name=name)
        self.head_num = head_num
        head_dim = dim // head_num
        self.scale = qk_scale or head_dim ** -0.5
        self.kqv = layers.Dense(3 * dim, use_bias=qkv_bias, name="qkv",
                                kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.attn_drop = layers.Dropout(attn_drop_ratio)
        self.proj = layers.Dense(dim, name="out",
                                 kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = layers.Dropout(proj_drop_ratio)

    def call(self, inputs, **kwargs):
        _, N, C = inputs.shape
        kqv = self.kqv(inputs)
        kqv = tf.reshape(kqv, [-1, N, 3, self.head_num, C // self.head_num])
        kqv = tf.transpose(kqv, [2, 0, 3, 1, 4])
        # tf.linalg.band_part(a, num_lower, num_upper)
        #   a:         the tensor to mask
        #   num_lower: number of sub-diagonals to keep; negative keeps the whole lower triangle
        #   num_upper: number of super-diagonals to keep; negative keeps the whole upper triangle
        # With (-1, 0) this produces a lower-triangular (causal) mask of shape [N, N],
        # which broadcasts over the batch and head dimensions of the attention scores.
        mask = tf.linalg.band_part(tf.ones((N, N)), -1, 0)
        q, k, v = kqv[0], kqv[1], kqv[2]
        attn = tf.matmul(q, k, transpose_b=True) * self.scale      # [B, heads, N, N]
        # Mask out future positions with a large negative value before the softmax,
        # so each token can only attend to itself and earlier tokens
        attn = attn + (1.0 - mask) * -1e9
        attn = tf.nn.softmax(attn, axis=-1)
        drop = self.attn_drop(attn)
        x = tf.matmul(drop, v)
        x = tf.transpose(x, [0, 2, 1, 3])
        x = tf.reshape(x, [-1, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Decoder(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, dim, head_num=8, qkv_bias=False, qk_scale=None,
                 drop_ratio=0., attn_drop_ratio=0., drop_path_ratio=0., name=None):
        super(Decoder, self).__init__(name=name)
        self.mask_attention = Mask_attention(dim, head_num, qkv_bias, qk_scale,
                                             attn_drop_ratio=attn_drop_ratio,
                                             proj_drop_ratio=drop_path_ratio,
                                             name="Mask_self_attention")
        self.norm1 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_1")
        self.attention = attention(dim, head_num, qkv_bias, qk_scale,
                                   attn_drop_ratio=attn_drop_ratio,
                                   proj_drop_ratio=drop_path_ratio,
                                   name="Self_attention")
        self.norm2 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_2")
        self.drop1 = layers.Dropout(drop_ratio)
        self.encoder = Encoder(dim, head_num=head_num, qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio,
                               drop_path_ratio=drop_path_ratio, name="Encoder")
        self.dense1 = layers.Dense(dim * 4, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.act = layers.Activation("relu")
        self.dense2 = layers.Dense(dim, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.drop2 = layers.Dropout(drop_ratio)
        self.norm3 = layers.LayerNormalization(epsilon=1e-6, name="LayerNorm_3")

    def call(self, inputs, input2, **kwargs):
        # Masked self-attention over the decoder stream, with a residual connection
        x = self.norm1(inputs)
        y = self.mask_attention(x)
        y = self.drop1(y)
        y = self.norm2(y)
        y = inputs + y
        # Mix in the encoder memory (input2), then apply another attention block
        z = y + self.encoder(input2)
        z = self.attention(z)
        z = self.drop2(z)
        z = self.norm3(z)
        z = z + y
        # Feed-forward (MLP) sub-block with a residual connection
        k = self.dense1(z)
        k = self.act(k)
        k = self.dense2(k)
        k = self.drop2(k)
        k = k + z
        return k
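
# Illustrative sketch of the causal mask used above -- not part of the original model.
# tf.linalg.band_part(tf.ones((n, n)), -1, 0) keeps only the lower triangle, so row i
# has ones in columns <= i and zeros elsewhere.
def _demo_causal_mask(n=4):
    mask = tf.linalg.band_part(tf.ones((n, n)), -1, 0)
    # For n = 4 this is:
    # [[1. 0. 0. 0.]
    #  [1. 1. 0. 0.]
    #  [1. 1. 1. 0.]
    #  [1. 1. 1. 1.]]
    return mask
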
class Transformer(layers.Layer):
    k_ini = initializers.GlorotUniform()
    b_ini = initializers.RandomNormal(stddev=1e-6)

    def __init__(self, train_dim=224, label_dim=234, patch_size1=16, patch_size2=16,
                 embed_dim=768, embed_dim2=768, depth1=12, depth2=12, num_heads=12,
                 qkv_bias=True, qk_scale=None, drop_ratio=0., attn_drop_ratio=0.,
                 drop_path_ratio=0., representation_size=None, num_classes=1000,
                 name="Transformer"):
        super(Transformer, self).__init__(name=name)
        self.depth1 = depth1
        self.depth2 = depth2
        # NOTE: patch_size1 / patch_size2 are used directly as the token counts for the
        # positional embeddings; no patch-embedding layer is defined in this file, so the
        # inputs are expected to be already-embedded sequences of that length.
        self.PosEmbed1 = PosEmbed(embed_dim=embed_dim, num_patches=patch_size1, name="posEmbed1")
        self.PosEmbed2 = PosEmbed(embed_dim=embed_dim2, num_patches=patch_size2, name="posEmbed2")
        self.encoder = Encoder(dim=embed_dim, head_num=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio,
                               drop_path_ratio=drop_path_ratio, name="Encoder")
        self.decoder = Decoder(dim=embed_dim2, head_num=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop_ratio=drop_ratio, attn_drop_ratio=attn_drop_ratio,
                               drop_path_ratio=drop_path_ratio, name="Decoder")
        self.linear = layers.Dense(num_classes, kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.softmax = layers.Activation("softmax")
        self.dropout1 = layers.Dropout(drop_ratio)

    def call(self, inputs1, input2, **kwargs):
        # Encoder stream: the same Encoder layer is applied depth1 times,
        # i.e. its weights are shared across iterations
        x = self.PosEmbed1(inputs1)
        x = self.dropout1(x)
        for _ in range(self.depth1):
            x = self.encoder(x)
            x = self.dropout1(x)
        # Decoder stream: the same Decoder layer is likewise applied depth2 times
        y = self.PosEmbed2(input2)
        y = self.dropout1(y)
        for _ in range(self.depth2):
            y = self.decoder(y, x)
            y = self.dropout1(y)
        # Classification head
        z = self.linear(y)
        z = self.softmax(z)
        return z
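
# Minimal smoke test of the full model -- an illustrative sketch, not part of the
# original code. It assumes both streams are fed as already-embedded token sequences
# of length patch_size1 / patch_size2 and dimension embed_dim (there is no
# patch-embedding layer in this file).
if __name__ == "__main__":
    model = Transformer(patch_size1=16, patch_size2=16, embed_dim=768,
                        depth1=2, depth2=2, num_heads=8, num_classes=10)
    enc_in = tf.random.normal([2, 16, 768])   # encoder stream: [B, tokens, embed_dim]
    dec_in = tf.random.normal([2, 16, 768])   # decoder stream: [B, tokens, embed_dim]
    out = model(enc_in, dec_in)               # -> [2, 17, 10] (cls token is prepended)
    print(out.shape)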