
Implementing a Transformer MoE in PyTorch

Preface:

This post focuses on implementing the model portion of a Transformer Mixture-of-Experts (MoE) in PyTorch.

 

There are two main architectures:

1. Multiple experts inside a single Transformer encoder: the FFN sublayer is replaced with an MoE.

2. An entire Transformer encoder as each expert.


Contents:

1: Multiple experts inside a Transformer encoder

2: An entire Transformer encoder as each expert


1: Multiple experts inside a Transformer encoder

The FFN sublayer of the Transformer encoder layer is replaced with an MoE, and each expert is itself an FFN.

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 24 11:45:03 2025

@author: chengxf2
"""
import torch
import torch.nn as nn
import torch.nn.functional as F


class Expert(nn.Module):
    """An FFN expert, analogous to the FFN in a Transformer encoder layer."""

    def __init__(self, d_model=512, hidden_dim=1024):
        super(Expert, self).__init__()
        self.input_projection = nn.Linear(d_model, hidden_dim)
        self.output_projection = nn.Linear(hidden_dim, d_model)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.input_projection(x)
        x = self.activation(x)
        output = self.output_projection(x)
        return output


class Router(nn.Module):
    """Router that dispatches tokens to experts."""

    def __init__(self, d_model, num_experts):
        super(Router, self).__init__()
        self.layer = nn.Linear(d_model, num_experts)

    def forward(self, x):
        z = self.layer(x)
        output = F.softmax(z, dim=-1)
        return output


class MoE(nn.Module):
    def __init__(self, d_model, num_experts, hidden_dim, top_k=2):
        super(MoE, self).__init__()
        self.experts = nn.ModuleList([Expert(d_model, hidden_dim) for _ in range(num_experts)])
        self.router = Router(d_model, num_experts)
        self.top_k = top_k

    def forward(self, x):
        # x has been flattened to (token_num, d_model), where token_num = batch_size * seq_len.
        routing_weights = self.router(x)  # (token_num, num_experts)
        topk_vals, topk_indices = torch.topk(routing_weights, self.top_k, dim=1)
        # Renormalize the top-k probabilities so they sum to 1 per token.
        topk_vals_normalized = topk_vals / topk_vals.sum(dim=1, keepdim=True)
        outputs = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            # expert_mask: (token_num, top_k), 1 where this expert was selected.
            expert_mask = (topk_indices == i).float()
            if expert_mask.any():
                expert_mask = expert_mask.unsqueeze(-1)  # (token_num, top_k, 1)
                # Zero out the tokens that were not routed to this expert.
                inputs_to_expert = torch.mul(x.unsqueeze(1), expert_mask)  # (token_num, top_k, d_model)
                expert_output = expert(inputs_to_expert)
                # Weight each output by its normalized routing probability and sum over the k slots.
                weighted_expert_outputs = expert_output * topk_vals_normalized.unsqueeze(-1)
                outputs += weighted_expert_outputs.sum(dim=1)
        return outputs


class TransformerEncoderLayerWithMoE(nn.Module):
    def __init__(self, d_model, nhead, num_experts, hidden_dim, dropout, top_k):
        super(TransformerEncoderLayerWithMoE, self).__init__()
        # batch_first=True so the attention consumes (batch, seq, d_model) input directly.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        # The MoE replaces the usual position-wise feed-forward sublayer.
        self.moe = MoE(d_model, num_experts, hidden_dim, top_k)
        self.dropout = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.self_attn(src, src, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout(src2)
        src = self.norm1(src)
        # Mixture of experts in place of the FFN: flatten tokens for the router.
        batch_size, seq_len, d_model = src.shape
        src = src.view(-1, d_model)
        src2 = self.moe(src)
        src = src + self.dropout(src2)
        src = self.norm2(src)
        src = src.view(batch_size, seq_len, d_model)
        return src


# Initialize the model and run a forward pass.
num_experts = 8
d_model = 512
nhead = 8
hidden_dim = 1024
dropout = 0.1
batch_size = 2
seq_len = 3
top_k = 2

x = torch.randn(batch_size, seq_len, d_model)
model = TransformerEncoderLayerWithMoE(d_model, nhead, num_experts, hidden_dim, dropout, top_k)
output = model(x)
print(output.shape)  # torch.Size([2, 3, 512])
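To make the routing arithmetic concrete, here is a minimal standalone sketch (toy numbers, not part of the model above) of what torch.topk plus renormalization does for a single token routed to 2 of 4 hypothetical experts:

import torch
import torch.nn.functional as F

# One token's router logits over 4 hypothetical experts.
logits = torch.tensor([[1.0, 3.0, 0.5, 2.0]])
probs = F.softmax(logits, dim=-1)  # full distribution over all experts
topk_vals, topk_idx = torch.topk(probs, k=2, dim=1)

# Renormalize the two selected probabilities so they sum to 1,
# exactly as MoE.forward does with topk_vals_normalized.
topk_norm = topk_vals / topk_vals.sum(dim=1, keepdim=True)

print(topk_idx)   # tensor([[1, 3]]) -> experts 1 and 3 are selected
print(topk_norm)  # tensor([[0.7311, 0.2689]]) -> mixing weights for the two expert outputs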


2: An entire Transformer encoder as each expert

In this approach, each expert is a complete Transformer encoder, rather than a replacement for the FFN inside a single encoder layer.

Step 1: Build an Expert Network

Step 2: Build the Mixture of Experts

Step 3: Initialize the model, the loss and optimizer

Step 4: Train the model

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 21 15:58:13 2025

@author: chengxf2
"""
import torch
import torch.nn as nn
import torch.nn.functional as F


class Expert(nn.Module):
    """An expert that is itself a full Transformer encoder."""

    def __init__(self, d_model, nhead, num_layers, input_dim, output_dim):
        super(Expert, self).__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.input_projection = nn.Linear(input_dim, d_model)
        self.output_projection = nn.Linear(d_model, output_dim)

    def forward(self, x):
        x = self.input_projection(x)
        # nn.TransformerEncoder expects (S, N, E); treat the flattened tokens
        # as batch elements with sequence length S=1.
        x = self.transformer_encoder(x.unsqueeze(0)).squeeze(0)
        output = self.output_projection(x)
        return output


class MixtureOfExperts(nn.Module):
    """Dense variant: every expert processes every token; the gate mixes their outputs."""

    def __init__(self, num_experts, d_model, nhead, num_layers, input_size, output_size):
        super(MixtureOfExperts, self).__init__()
        self.experts = nn.ModuleList([Expert(d_model, nhead, num_layers, input_size, output_size)
                                      for _ in range(num_experts)])
        self.gates = nn.Linear(input_size, num_experts)

    def forward(self, x):
        weights = F.softmax(self.gates(x), dim=1)  # (token_num, num_experts)
        # outputs: (token_num, output_size, num_experts)
        outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
        return (weights.unsqueeze(1) * outputs).sum(dim=2)


class Router(nn.Module):
    def __init__(self, input_dim=512, num_experts=8):
        super().__init__()
        self.layer = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        z = self.layer(x)
        output = F.softmax(z, dim=-1)
        return output


class MoE(nn.Module):
    """Sparse (top-k) variant: each token is routed to its top_k experts only."""

    def __init__(self, input_dim, output_dim, num_experts, d_model, nhead, num_layers, top_k=2):
        super().__init__()
        self.experts = nn.ModuleList([Expert(d_model, nhead, num_layers, input_dim, output_dim)
                                      for _ in range(num_experts)])
        self.router = Router(input_dim, num_experts)
        self.output_dim = output_dim
        self.top_k = top_k

    def forward(self, x):
        # x: (token_num, input_dim), already flattened from (batch_size, seq_len, d_model).
        token_num = x.size(0)
        routing_weights = self.router(x)
        topk_vals, topk_indices = torch.topk(routing_weights, self.top_k, dim=1)
        topk_vals_normalized = topk_vals / topk_vals.sum(dim=1, keepdim=True)
        outputs = torch.zeros(token_num, self.output_dim, device=x.device)
        for i, expert in enumerate(self.experts):
            # token_choice: (token_num, top_k), 1 where this expert was selected.
            token_choice = (topk_indices == i).float()
            if token_choice.any():
                d_model = x.size(1)
                # expert_mask: (token_num, top_k, d_model)
                expert_mask = token_choice.unsqueeze(-1).expand(-1, -1, d_model)
                inputs_to_expert = x.unsqueeze(1).repeat(1, self.top_k, 1) * expert_mask
                inputs_to_expert = inputs_to_expert.view(-1, d_model)
                expert_outputs = expert(inputs_to_expert).view(token_num, self.top_k, -1)
                # Weight outputs by normalized routing probability and sum across selected experts.
                weighted_expert_outputs = expert_outputs * topk_vals_normalized.unsqueeze(-1)
                outputs += weighted_expert_outputs.sum(dim=1)
        return outputs


def train():
    # Step 3: Initialize the model, the loss and optimizer.
    num_experts = 8
    d_model = 512
    nhead = 8
    num_layers = 3
    batch_size = 2
    seq_len = 3
    top_k = 2
    input_dim = 512
    output_dim = 512

    x = torch.randn(batch_size, seq_len, d_model)
    # Flatten to (batch_size * seq_len, d_model) for the router.
    x = x.view(-1, d_model)
    model = MoE(input_dim, output_dim, num_experts, d_model, nhead, num_layers, top_k)
    output = model(x)
    print(output.shape)  # torch.Size([6, 512])

    '''
    # Step 4: Train the model (assumes a dataloader providing input/target pairs).
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(100):
        for i, data in enumerate(dataloader):
            inputs, targets = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    '''


train()
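The commented-out Step 4 loop assumes a dataloader that the script never defines. A minimal sketch of one way to satisfy that assumption with random regression data (the dataset here is purely illustrative, invented for the demo; it reuses the MoE class defined above):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Hypothetical random data: 32 tokens of dimension 512, already flattened
# the same way train() flattens its input for the router.
inputs = torch.randn(32, 512)
targets = torch.randn(32, 512)
dataloader = DataLoader(TensorDataset(inputs, targets), batch_size=8, shuffle=True)

model = MoE(input_dim=512, output_dim=512, num_experts=8,
            d_model=512, nhead=8, num_layers=3, top_k=2)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(2):  # a couple of epochs, just to show the loop runs
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")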






 

Reference: Mixtral of experts | Mistral AI
