# Transformers-Part1

Attention Is All You Need

The server has been busy with long-running computations lately, which left me some spare time, so I decided to take a proper look at the famous Transformer. It turned out to be rather abstract, which I mostly attribute to my own fuzzy understanding of Python classes, instances, and functions, and to my unfamiliarity with PyTorch and basic deep-learning architectures. This is a good opportunity to study them properly. The plan: first briefly record the basic Transformer architecture, then dig deeper into a PyTorch-based Transformer implementation. There are bound to be mistakes in this post; if you spot one, please point it out…

# Transformers Model Architecture

(Figure: the Transformer model architecture)

At first glance the diagram looks intimidating, but broken into pieces it is fairly simple. The left half is the Encoder; the right half is the Decoder.

The Encoder takes the input data together with its positional encoding (in NLP applications); after the Embedding step the data enters the Encoder stack. The Encoder consists of N EncoderLayers connected in sequence, and each EncoderLayer is in turn built from smaller components: a Multi-Head Attention layer, Add & Norm, and a Feed Forward network.

The Decoder likewise takes input data and its positional encoding; the difference from the Encoder is that the Decoder also receives the Encoder's output. The Decoder consists of N DecoderLayers connected in sequence, built from the same kinds of components: Multi-Head Attention, Add & Norm, and Feed Forward.

There is a pile of intimidating terminology here, but the whole Transformer architecture is really just a number of small components assembled like building blocks.

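Before building these blocks by hand, it is worth noting that PyTorch already ships the full architecture as `nn.Transformer`. A minimal shape check (the hyperparameters below are arbitrary and only for illustration), assuming the inputs have already gone through embedding and positional encoding:

```python
import torch
import torch.nn as nn

# PyTorch's built-in nn.Transformer wires up the same Encoder/Decoder stacks;
# batch_first=True means all tensors are (batch, seq, dim)
model = nn.Transformer(d_model=128, nhead=8,
                       num_encoder_layers=2, num_decoder_layers=2,
                       batch_first=True)
src = torch.rand(3, 10, 128)  # source sequence (after embedding + positional encoding)
tgt = torch.rand(3, 7, 128)   # target sequence (after embedding + positional encoding)
out = model(src, tgt)
print(out.shape)              # torch.Size([3, 7, 128])
```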
# Multi-Head Attention

For an intuitive explanation of the attention mechanism, see 3Blue1Brown's video on attention (Chapter 6 of the Deep Learning series; an officially subtitled version is on Bilibili). The original attention mechanism computes the output of attention pooling as a weighted sum of the values. Many blog posts explain it the same way: given a Query, compute how relevant it is to each Key, then use those relevance scores to weight the corresponding Values, producing the final output. To me, attention is essentially a way of scoring the relevance between queries and key-value pairs. The interesting part happens when Query, Key, and Value all come from the same data: the model is then computing the relevance of the data with itself, which, loosely speaking, means it learns the structure contained in the data itself. This is self-attention, an attention mechanism that relates different positions within a single sequence.

$$Attention(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

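To make the formula concrete, here is a tiny worked example with made-up numbers (not from the original post): two queries, two keys, and two values with d_k = 2.

```python
import math
import torch

# Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V, computed by hand
Q = torch.tensor([[1.0, 0.0], [0.0, 1.0]])    # 2 query vectors, d_k = 2
K = torch.tensor([[1.0, 0.0], [1.0, 1.0]])    # 2 key vectors
V = torch.tensor([[10.0, 0.0], [0.0, 10.0]])  # 2 value vectors

scores = Q @ K.T / math.sqrt(K.size(-1))      # (2, 2) similarity matrix
weights = torch.softmax(scores, dim=-1)       # each row sums to 1
output = weights @ V                          # weighted sum of the values
print(weights)
print(output)
```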
A reference implementation of (self-)attention:

```python
import math
import torch
import torch.nn as nn


class SelfAttentionV1(nn.Module):
    def __init__(self, hidden_dim: int = 728):
        super().__init__()
        self.hidden_dim = hidden_dim

        # nn.Linear defines a learnable linear transformation (the weight matrix is initialized here)
        self.query_proj = nn.Linear(hidden_dim, hidden_dim)  # bias=False is sometimes used to speed things up
        self.key_proj = nn.Linear(hidden_dim, hidden_dim)
        self.value_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, X):
        # X shape: (batch_size, seq_len, hidden_dim)
        Q = self.query_proj(X)
        K = self.key_proj(X)
        V = self.value_proj(X)
        # Q, K, V shape: (batch, seq, hidden_dim)
        print(f"Q shape: {Q.size()}; K shape: {K.size()}, V shape: {V.size()}")

        # attention scores: (batch, seq, seq)
        attention_value = torch.matmul(
            Q, K.transpose(-1, -2)  # swap the last two dimensions
        )
        # attention weights: (batch, seq, seq)
        attention_weight = torch.softmax(
            attention_value / math.sqrt(self.hidden_dim),
            dim=-1
        )
        print(f"Weight shape: {attention_weight.size()}")

        output = torch.matmul(attention_weight, V)
        return output


# test
X = torch.rand(3, 2, 4)
print(X)

self_att_net = SelfAttentionV1(4)  # hidden_dim = 4
self_att_net(X)
```

From the code, the input is first passed through three linear transformations (see the CSDN post on Pytorch nn.Linear() in the references, which also covers how the weight matrix W is initialized), producing the Query, Key, and Value tensors; these are then combined according to the attention formula above.

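As a quick sanity check on what each of those projection layers holds (a minimal sketch, not part of the original code):

```python
import torch.nn as nn

proj = nn.Linear(4, 4)    # one of the Q/K/V projections for hidden_dim = 4
print(proj.weight.shape)  # torch.Size([4, 4]) -- the learnable weight matrix W
print(proj.bias.shape)    # torch.Size([4])    -- the learnable bias b
# y = x @ W^T + b, applied independently to every position of the sequence
```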
Multi-head attention was developed on top of self-attention and is a variant of it, intended to strengthen the model's expressiveness and generalization. It uses several independent attention heads, computes attention weights in each head separately, and then concatenates (or otherwise combines) the per-head results to obtain a richer representation.

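The mechanical core of "multi-head" is just a reshape: the hidden dimension is split into head_num chunks of head_dim each, attention is computed on every chunk independently, and the chunks are glued back together. A small sketch with assumed sizes (batch 3, sequence length 2, hidden dim 128, 8 heads):

```python
import torch

batch, seq_len, hidden_dim = 3, 2, 128
head_num = 8
head_dim = hidden_dim // head_num  # 16

X = torch.rand(batch, seq_len, hidden_dim)
# (batch, seq, hidden) -> (batch, head_num, seq, head_dim)
heads = X.view(batch, seq_len, head_num, head_dim).transpose(1, 2)
print(heads.shape)  # torch.Size([3, 8, 2, 16])

# and back: (batch, head_num, seq, head_dim) -> (batch, seq, hidden)
merged = heads.transpose(1, 2).contiguous().view(batch, seq_len, hidden_dim)
print(torch.equal(merged, X))  # True
```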
(Figure: multi-head attention)

```python
class MutiHeadSelfAttentionFormal(nn.Module):
    def __init__(self, dim, head_num, attention_dropout=0.1):
        super().__init__()
        self.hidden_dim = dim
        self.head_num = head_num
        self.head_dim = dim // head_num  # head_num * head_dim = dim

        self.q_proj = nn.Linear(dim, dim)  # i.e. (dim, head_num * head_dim); split into heads later
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)

        self.attention_dropout = nn.Dropout(attention_dropout)

    def forward(self, X, attention_mask=None):
        # X: (batch, seq, hidden_dim)
        batch, seq_len, _ = X.size()

        Q = self.q_proj(X)
        K = self.k_proj(X)
        V = self.v_proj(X)  # (b, s, h)

        # (b, s, h) -> (b, head_num, s, head_dim), with head_num * head_dim = h
        q_state = Q.view(batch, seq_len, self.head_num, self.head_dim).transpose(1, 2)
        k_state = K.view(batch, seq_len, self.head_num, self.head_dim).transpose(1, 2)
        v_state = V.view(batch, seq_len, self.head_num, self.head_dim).transpose(1, 2)

        # (b, head_num, s, s)
        attention_weight = torch.matmul(
            q_state, k_state.transpose(-1, -2)
        ) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attention_weight = attention_weight.masked_fill(
                attention_mask == 0, float('-1e20')
            )
            print(attention_weight.shape)

        attention_weight = torch.softmax(attention_weight, dim=-1)
        attention_weight = self.attention_dropout(attention_weight)

        output_mid = attention_weight @ v_state  # (b, head_num, s, head_dim)

        output_mid = output_mid.transpose(1, 2).contiguous()  # put the heads back side by side
        output_mid = output_mid.view(batch, seq_len, -1)

        output = self.out_proj(output_mid)
        return output


# test
attention_mask = (
    torch.tensor(
        [
            [0, 1],
            [0, 0],
            [1, 0]
        ])
    .unsqueeze(1)
    .unsqueeze(2)
    .expand(3, 8, 2, 2)
)

X = torch.rand(3, 2, 128)
net = MutiHeadSelfAttentionFormal(128, 8)
net(X, attention_mask)
```

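If you are on PyTorch 2.0 or later, the inner softmax(QK^T / sqrt(d)) V step is also available as the fused op `torch.nn.functional.scaled_dot_product_attention`. A quick check (no mask, no dropout) that it matches the manual computation used above:

```python
import math
import torch
import torch.nn.functional as F

q = torch.rand(3, 8, 2, 16)  # (batch, heads, seq, head_dim)
k = torch.rand(3, 8, 2, 16)
v = torch.rand(3, 8, 2, 16)

manual = torch.softmax(q @ k.transpose(-1, -2) / math.sqrt(q.size(-1)), dim=-1) @ v
fused = F.scaled_dot_product_attention(q, k, v)
print(torch.allclose(manual, fused, atol=1e-6))  # should print True (up to floating-point tolerance)
```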
# Transformers

The Transformer can thus be assembled from the following building blocks:

- **MHA**

```python
import torch
import math
import torch.nn as nn
import copy


# Multi-head attention (now taking separate query / key / value inputs)
class MutiHeadSelfAttentionFormal(nn.Module):
    def __init__(self, dim, head_num, attention_dropout=0.1):
        super().__init__()
        self.hidden_dim = dim
        self.head_num = head_num
        self.head_dim = dim // head_num  # head_num * head_dim = dim

        self.q_proj = nn.Linear(dim, dim)  # i.e. (dim, head_num * head_dim); split into heads later
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)

        self.attention_dropout = nn.Dropout(attention_dropout)

    def forward(self, query, key, value, attention_mask=None, mask_type="Encoder"):
        """
        In the Encoder (and in Decoder self-attention): query = key = value = X;
        in Decoder cross-attention: query = X, key = value = memory.
        """
        # query: (batch, q_seq, hidden_dim); key / value: (batch, kv_seq, hidden_dim)
        batch, seq_len, _ = query.size()
        kv_seq_len = key.size(1)  # may differ from seq_len in cross-attention

        Q = self.q_proj(query)
        K = self.k_proj(key)
        V = self.v_proj(value)  # (b, s, h)

        # (b, s, h) -> (b, head_num, s, head_dim), with head_num * head_dim = h
        q_state = Q.view(batch, seq_len, self.head_num, self.head_dim).transpose(1, 2)
        k_state = K.view(batch, kv_seq_len, self.head_num, self.head_dim).transpose(1, 2)
        v_state = V.view(batch, kv_seq_len, self.head_num, self.head_dim).transpose(1, 2)

        # (b, head_num, q_seq, kv_seq)
        attention_weight = torch.matmul(
            q_state, k_state.transpose(-1, -2)
        ) / math.sqrt(self.head_dim)

        if attention_mask is None:
            if mask_type == "Encoder":
                attention_mask = torch.ones_like(attention_weight)          # all-ones: nothing masked
            elif mask_type == "Decoder":
                attention_mask = torch.ones_like(attention_weight).tril()   # lower-triangular causal mask
        else:
            if mask_type == "Encoder":
                pass
            elif mask_type == "Decoder":
                attention_mask = attention_mask.tril()
        attention_weight = attention_weight.masked_fill(attention_mask == 0, float('-1e20'))
        # print(attention_weight.shape)

        attention_weight = torch.softmax(attention_weight, dim=-1)
        attention_weight = self.attention_dropout(attention_weight)

        output_mid = attention_weight @ v_state  # (b, head_num, q_seq, head_dim)

        output_mid = output_mid.transpose(1, 2).contiguous()  # put the heads back side by side
        output_mid = output_mid.view(batch, seq_len, -1)

        output = self.out_proj(output_mid)
        return output
```
- **Feed-forward network (FFN)**

$$FFN(x) = \mathrm{ReLU}(xW_1 + b_1)W_2 + b_2$$

```python
# FFN
class ffn(nn.Module):
    def __init__(self, dim, ffn_dropout_rate=0.1):
        super().__init__()
        # Position-wise Feed-Forward Network (FFN): up-projection -> activation -> down-projection
        self.up_proj = nn.Linear(dim, dim * 4)
        self.act_fn = nn.GELU()  # ReLU can be used instead
        self.down_proj = nn.Linear(dim * 4, dim)
        self.drop_ffn = nn.Dropout(ffn_dropout_rate)
        # self.ffn_ln = nn.LayerNorm(dim, eps=1e-7)  # smoothing; Add & Norm is handled outside

    def forward(self, X):
        up = self.up_proj(X)                      # up-projection
        up = self.act_fn(up)                      # activation (GELU)
        down = self.down_proj(self.drop_ffn(up))  # down-projection
        # down = self.drop_ffn(down)              # optional extra dropout

        # Add & Norm is applied by the surrounding residual block
        return down
```
- **Encoder**

Wiring the Encoder's components together:

```python
# Helper that clones a module N times
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
```

```python
# Add & Norm: a pre-norm residual connection (as in The Annotated Transformer,
# the sublayer is applied to the normalized input and added back to X)
class add_and_norm(nn.Module):
    def __init__(self, dim, add_dropout_rate=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(dim, eps=1e-7)
        self.add_dropout = nn.Dropout(add_dropout_rate)

    def forward(self, X, sublayer):
        return X + self.add_dropout(sublayer(self.norm(X)))
```

The Encoder structure:

```python
# Encoder
class EncoderLayer(nn.Module):
    def __init__(self, dim, self_attn, self_ffn):
        super().__init__()
        self.hidden_dim = dim
        self.self_attn = self_attn  # attention layer
        self.self_ffn = self_ffn    # ffn layer
        self.add_norm_layers = clones(add_and_norm(dim), 2)  # add & norm layers

    def forward(self, X, mask):
        # add_and_norm.forward expects the sublayer as a callable;
        # the attention layer needs several arguments (X and mask), hence the lambda
        s1 = self.add_norm_layers[0](X, lambda X: self.self_attn(X, X, X, mask))
        s2 = self.add_norm_layers[1](s1, self.self_ffn)  # FFN

        return s2


class Encoder(nn.Module):
    def __init__(self, dim, self_attn, self_ffn, N=6):
        """
        dim: input hidden dim;
        self_attn: MHA object;
        self_ffn: FFN object;
        N: number of layers
        """
        super().__init__()
        self.layer_list = clones(EncoderLayer(dim, self_attn, self_ffn), N)
        self.norm = nn.LayerNorm(dim, eps=1e-7)

    def forward(self, X, mask=None):
        "Pass the input (and mask) through each layer in turn."
        for m in self.layer_list:
            X = m(X, mask)  # each EncoderLayer feeds the next
            print(X.shape)

        output = self.norm(X)
        return output
```
- **Decoder**

The Decoder is structurally similar to the Encoder; only the way the components are combined differs (its masked self-attention uses the causal mask sketched just after this list):

```python
# Decoder
class DecoderLayer(nn.Module):
    def __init__(self, dim, self_attn, src_attn, self_ffn):
        super().__init__()
        self.hidden_dim = dim
        self.self_attn = self_attn  # masked self-attention over the target sequence
        self.src_att = src_attn     # cross-attention over the Encoder output (memory)
        self.self_ffn = self_ffn
        self.add_norm_layers = clones(add_and_norm(dim), 3)  # add & norm layers

    def forward(self, X, memory, src_mask=None, tgt_mask=None):
        m = memory
        # step 1: masked (causal) self-attention
        s1 = self.add_norm_layers[0](X, lambda X: self.self_attn(X, X, X, tgt_mask, mask_type="Decoder"))
        # step 2: cross-attention against the Encoder output
        s2 = self.add_norm_layers[1](s1, lambda X: self.src_att(X, m, m, src_mask))
        # step 3: FFN
        s3 = self.add_norm_layers[2](s2, self.self_ffn)

        return s3


class Decoder(nn.Module):
    def __init__(self, dim, self_attn, src_attn, self_ffn, N=6):
        """
        dim: input hidden dim;
        self_attn: MHA object for self-attention;
        src_attn: MHA object for cross-attention;
        self_ffn: FFN object;
        N: number of layers
        """
        super().__init__()
        self.layer_list = clones(DecoderLayer(dim, self_attn, src_attn, self_ffn), N)
        self.norm = nn.LayerNorm(dim, eps=1e-7)

    def forward(self, X, memory, src_mask=None, tgt_mask=None):
        "Pass the input (and masks) through each layer in turn."
        for m in self.layer_list:
            X = m(X, memory, src_mask=src_mask, tgt_mask=tgt_mask)  # each DecoderLayer feeds the next
            print(X.shape)

        output = self.norm(X)
        # the softmax here stands in for the generator (a linear projection to the vocabulary + softmax)
        return torch.softmax(output, dim=-1)
```
- **Test**

```python
# test NB!
c = copy.deepcopy

attention_mask = (
    torch.tensor(
        [
            [0, 1],
            [0, 0],
            [1, 0]
        ])
    .unsqueeze(1)
    .unsqueeze(2)
    .expand(3, 8, 2, 2)
)

X = torch.rand(3, 2, 128)
# module init
attn_net = MutiHeadSelfAttentionFormal(128, 8)
ffn_net = ffn(128)
Encoder_net = Encoder(128, c(attn_net), c(ffn_net))
Decoder_net = Decoder(128, c(attn_net), c(attn_net), c(ffn_net))

# run
E1 = Encoder_net(X, attention_mask)
D1 = Decoder_net(X, E1)
```

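One detail from the Decoder bullet above deserves spelling out: the causal mask built by the `mask_type == "Decoder"` branch. A small sketch of what that lower-triangular mask looks like for a length-4 sequence:

```python
import torch

seq_len = 4
causal = torch.ones(seq_len, seq_len).tril()
print(causal)
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])
# Positions where the mask is 0 are filled with -1e20 before the softmax,
# so position i can only attend to positions <= i.
```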
This gives us the basic Transformer architecture; the Embedding and Training parts will be examined in a later post.

To be continued…

# Reference

- The Annotated Transformer
- 注意力机制综述(图解完整版附代码)(知乎)
- Pytorch nn.Linear() 的基本用法与原理详解及全连接层简介 (CSDN)
- Transformer 源码详解(Pytorch 版本)(知乎)
- chaofa用代码打点酱油的个人空间 (哔哩哔哩)