Qwen2.5 Model Structure
What is self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) used for?
It is the output layer (the vocabulary projection): it maps the model's hidden-state vectors back into vocabulary space so the next token can be predicted.
# predicted logits, not yet passed through softmax
lm_logits = self.lm_head(hidden_states)  # shape: [B, L, vocab_size]
The logits are then passed through softmax to obtain a probability for each token.
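A minimal sketch of that last step. The sizes below are dummies (the real vocabulary size comes from config.vocab_size), and the random tensor stands in for the output of self.lm_head:

import torch

B, L, V = 2, 8, 1000                      # dummy sizes; the real V is config.vocab_size
lm_logits = torch.randn(B, L, V)          # stand-in for self.lm_head(hidden_states)

probs = torch.softmax(lm_logits, dim=-1)  # [B, L, V], each row sums to 1
next_token = probs[:, -1, :].argmax(dim=-1)  # [B], greedy choice for the next token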
class Qwen2Model(Qwen2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([
            Qwen2DecoderLayer(config) for _ in range(config.num_hidden_layers)
        ])  # the number of decoder layers is set by config.num_hidden_layers
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
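Putting these pieces together, the overall forward pass is roughly: embed the input ids, run them through every decoder layer, apply the final RMSNorm, then project with lm_head. A minimal sketch (the helper name forward_sketch is mine; the real implementation also handles attention masks, the KV cache, and rotary position embeddings):

import torch

def forward_sketch(model, lm_head, input_ids: torch.LongTensor) -> torch.Tensor:
    # model is assumed to be a Qwen2Model instance as defined above;
    # lm_head is the vocabulary projection discussed at the top of these notes
    hidden_states = model.embed_tokens(input_ids)   # [B, L] -> [B, L, hidden_size]
    for layer in model.layers:                      # config.num_hidden_layers blocks
        hidden_states = layer(hidden_states)[0]     # each layer returns a tuple
    hidden_states = model.norm(hidden_states)       # final RMSNorm
    return lm_head(hidden_states)                   # logits: [B, L, vocab_size]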
Structure of each decoder layer:
class Qwen2DecoderLayer(nn.Module):
    def __init__(self, config):
        ...
        self.self_attn = Qwen2Attention(config)
        self.mlp = Qwen2MLP(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, hidden_states, ...):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)           # pre-norm before attention
        hidden_states, _ = self.self_attn(hidden_states, ...)
        hidden_states = residual + hidden_states                      # residual connection

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)  # pre-norm before MLP
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states                      # residual connection
        return hidden_states, ...
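RMSNorm appears three times above (the final norm plus the two per-layer norms) but is never defined in these notes. A minimal sketch of the standard RMSNorm formulation: scale-only normalization over the hidden dimension, no mean subtraction, no bias (the HF implementation also upcasts to float32 for the statistics, which is omitted here):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square norm: divide by RMS of the hidden dim, then scale."""
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # 1 / sqrt(mean(x^2) + eps), computed over the last (hidden) dimension
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)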
What is self.mlp = Qwen2MLP(config)?
It plays the role of the Transformer feed-forward network (FFN), though with a gated activation rather than the classic two-layer form; see the sketch after the reference implementation below. For comparison, a standard Transformer FFN looks like this:
class TransformerFFN(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))
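The actual Qwen2MLP is not this ReLU FFN but a gated, SwiGLU-style variant: two parallel projections to intermediate_size (gate and up), a SiLU activation on the gate path, an elementwise product, then a down projection back to hidden_size, all without bias. A sketch in that spirit (the class name GatedMLP is mine, not taken from the Qwen2 source):

import torch
import torch.nn as nn

class GatedMLP(nn.Module):
    """SwiGLU-style FFN in the spirit of Qwen2MLP (sketch, not the exact source)."""
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SiLU(gate(x)) * up(x), then project back down to hidden_size
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))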