
2.3 Transformer Variants and Extensions: BERT, GPT, and Multimodal Models


Introduction

In deep learning and natural language processing, the Transformer architecture is without question one of the most revolutionary breakthroughs. Since Vaswani et al. introduced the original Transformer in 2017, Transformer-based variants have appeared in rapid succession and fundamentally reshaped the NLP landscape.

In this chapter, we take a close look at three of the most important families of Transformer variants: BERT, the GPT series, and multimodal models. These models have not only produced breakthrough results in academic research but have also been widely adopted in industry, becoming core components of modern AI systems.

BERT: The Bidirectional Encoder Revolution

The Core Idea of BERT

BERT (Bidirectional Encoder Representations from Transformers), introduced by Google in 2018, is distinguished above all by its bidirectional understanding of context. Unlike traditional left-to-right or right-to-left language models, BERT conditions on the context to both sides of a word simultaneously.

import torch
from transformers import BertForMaskedLM, BertTokenizer

class BERTExplainer:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # BertForMaskedLM adds the vocabulary-prediction head needed to fill [MASK]
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.eval()

    def demonstrate_masked_language_modeling(self, text):
        """Demonstrate BERT's masked language modeling ability."""
        # Replace one word in the text with [MASK]
        masked_text = text.replace("language", "[MASK]")

        # Encode the input
        inputs = self.tokenizer(masked_text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Vocabulary logits for every position: [batch, seq_len, vocab_size]
        logits = outputs.logits
        mask_token_index = torch.where(
            inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]

        # Logits at the [MASK] position
        mask_token_logits = logits[0, mask_token_index, :]

        # Find the top-5 candidate tokens
        top_tokens = torch.topk(mask_token_logits, 5, dim=1)

        print(f"Original text: {text}")
        print(f"Masked text:   {masked_text}")
        print("Top 5 predictions:")
        for i, (value, index) in enumerate(
                zip(top_tokens.values[0], top_tokens.indices[0])):
            token = self.tokenizer.decode([index])
            print(f"{i+1}. {token} (score: {value:.4f})")

# Usage example
explainer = BERTExplainer()
text = "Natural language processing is amazing."
explainer.demonstrate_masked_language_modeling(text)

BERT's Pre-training Tasks

BERT learns language representations through two key pre-training tasks:

1. Masked Language Modeling (MLM)

import torch
from transformers import BertForMaskedLM, BertTokenizer

class MLMDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        self.model.eval()

    def mlm_inference(self, text_with_mask):
        """Run MLM inference."""
        inputs = self.tokenizer(text_with_mask, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        mask_token_index = torch.where(
            inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]

        # Logits at each [MASK] position
        mask_logits = logits[0, mask_token_index, :]
        top_tokens = torch.topk(mask_logits, 3, dim=1)

        print(f"Input: {text_with_mask}")
        for i, (values, indices) in enumerate(zip(top_tokens.values, top_tokens.indices)):
            # Softmax over the top-3 logits: relative, not full-vocabulary, probabilities
            probs = torch.softmax(values, dim=0)
            for j in range(len(values)):
                token = self.tokenizer.decode([indices[j]])
                print(f"Mask {i+1}, candidate {j+1}: {token} (probability: {probs[j]:.4f})")

# Demonstrate MLM
mlm_demo = MLMDemonstration()
mlm_demo.mlm_inference("The weather today is [MASK] and sunny.")
2. Next Sentence Prediction (NSP)

import torch
from transformers import BertForNextSentencePrediction, BertTokenizer

class NSPDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        self.model.eval()

    def nsp_inference(self, sentence_a, sentence_b):
        """Next-sentence-prediction inference."""
        # Encode the sentence pair
        inputs = self.tokenizer(sentence_a, sentence_b, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)

        # In Hugging Face's convention, index 0 = "B is the next sentence"
        is_next = probabilities[0, 0].item()
        not_next = probabilities[0, 1].item()

        print(f"Sentence A: {sentence_a}")
        print(f"Sentence B: {sentence_b}")
        print(f"P(is next sentence):     {is_next:.4f}")
        print(f"P(is not next sentence): {not_next:.4f}")
        print(f"Prediction: {'is next sentence' if is_next > not_next else 'is not next sentence'}")

# Demonstrate NSP
nsp_demo = NSPDemonstration()
nsp_demo.nsp_inference(
    "The company reported strong earnings this quarter.",
    "As a result, the stock price increased significantly."
)

BERT Architecture Details

import torch
import torch.nn as nn

class SimplifiedBERT(nn.Module):
    """A simplified BERT implementation for teaching purposes."""
    def __init__(self, vocab_size=30522, hidden_size=768, num_layers=12,
                 num_attention_heads=12, intermediate_size=3072,
                 max_position_embeddings=512):
        super(SimplifiedBERT, self).__init__()
        self.embedding = BERTEmbedding(vocab_size, hidden_size, max_position_embeddings)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(hidden_size, num_attention_heads, intermediate_size)
            for _ in range(num_layers)
        ])

    def forward(self, input_ids, attention_mask=None):
        # Embedding layer
        hidden_states = self.embedding(input_ids)
        # Transformer encoder layers
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states

class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size, max_position_embeddings):
        super(BERTEmbedding, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        # For sentence-pair tasks (defined here but omitted from forward for simplicity)
        self.token_type_embeddings = nn.Embedding(2, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = words_embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size):
        super(TransformerEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(hidden_size, num_attention_heads)
        self.intermediate = IntermediateLayer(hidden_size, intermediate_size)
        self.output = OutputLayer(hidden_size, intermediate_size)

    def forward(self, hidden_states, attention_mask=None):
        # Self-attention
        attention_output = self.attention(hidden_states, attention_mask)
        # Feed-forward network with residual connection
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads

        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dense = nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, x):
        # [batch, seq, hidden] -> [batch, heads, seq, head_size]
        batch_size, seq_length, _ = x.size()
        x = x.view(batch_size, seq_length, self.num_attention_heads, self.attention_head_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_length, hidden_size = hidden_states.size()

        # Linear projections, then split into heads
        query_layer = self._split_heads(self.query(hidden_states))
        key_layer = self._split_heads(self.key(hidden_states))
        value_layer = self._split_heads(self.value(hidden_states))

        # Scaled dot-product attention, computed per head
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / torch.sqrt(
            torch.tensor(self.attention_head_size, dtype=torch.float32))

        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = torch.softmax(attention_scores, dim=-1)
        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge heads back: [batch, heads, seq, head_size] -> [batch, seq, hidden]
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        context_layer = context_layer.view(batch_size, seq_length, hidden_size)

        # Output projection
        context_layer = self.dense(context_layer)
        return context_layer

class IntermediateLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(IntermediateLayer, self).__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

class OutputLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(OutputLayer, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
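
A quick shape check helps confirm the pieces above fit together. The hyperparameters below are toy values chosen for speed, not the real BERT-base settings:

# Minimal shape check with toy hyperparameters (assumed, not BERT-base)
model = SimplifiedBERT(vocab_size=1000, hidden_size=64, num_layers=2,
                       num_attention_heads=4, intermediate_size=128)
input_ids = torch.randint(0, 1000, (2, 16))   # [batch=2, seq_len=16]
hidden_states = model(input_ids)
print(hidden_states.shape)                    # expected: torch.Size([2, 16, 64])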

The GPT Series: The Evolution of Autoregressive Generative Models

GPT Architecture Overview

The GPT (Generative Pre-trained Transformer) series adopts a decoder-only architecture and focuses on the autoregressive language modeling objective: predicting each token from the tokens that precede it.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class GPTDemonstrator:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def generate_text(self, prompt, max_length=100, temperature=0.7):
        """Generate text with GPT."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1)
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text

    def demonstrate_autoregressive_generation(self, prompt):
        """Walk through the autoregressive generation loop step by step."""
        print(f"Prompt: {prompt}")
        print("Generation steps:")
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated = input_ids.clone()
        with torch.no_grad():
            for step in range(5):  # demonstrate only the first 5 steps
                outputs = self.model(generated)
                next_token_logits = outputs.logits[:, -1, :]
                # Greedy decoding: pick the most likely next token
                next_token_probs = torch.softmax(next_token_logits, dim=-1)
                next_token_id = torch.argmax(next_token_probs, dim=-1)
                # Append the new token to the generated sequence
                generated = torch.cat([generated, next_token_id.unsqueeze(-1)], dim=-1)
                next_token = self.tokenizer.decode(next_token_id)
                print(f"Step {step+1}: appended token '{next_token}'")
                if next_token_id.item() == self.tokenizer.eos_token_id:
                    break
        full_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"\nFull generated text: {full_text}")

# Usage example
gpt_demo = GPTDemonstrator()
result = gpt_demo.generate_text("The future of artificial intelligence")
print("GPT output:", result)

# Demonstrate the autoregressive loop
gpt_demo.demonstrate_autoregressive_generation("In the world of machine learning")
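
The defining feature of this decoder-only design is the causal attention mask: position i may attend only to positions at or before i, which is what makes generation autoregressive. Below is a minimal, self-contained sketch of building and applying such a mask to raw attention scores; it illustrates the idea rather than GPT-2's exact internals:

import torch

def causal_attention_probs(scores):
    """Apply a causal mask to raw attention scores of shape [seq_len, seq_len]."""
    seq_len = scores.size(-1)
    # Upper-triangular positions (j > i) are "the future" and get -inf
    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    scores = scores.masked_fill(mask, float('-inf'))
    # After softmax, each row distributes weight only over past and current positions
    return torch.softmax(scores, dim=-1)

probs = causal_attention_probs(torch.randn(4, 4))
print(probs)  # row i has zeros at columns j > i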

Evolution of the GPT Series

graph TD
    A[GPT-1] --> B[GPT-2]
    B --> C[GPT-3]
    C --> D[GPT-3.5]
    D --> E[GPT-4]
    A --> A1[117M parameters]
    B --> B1[1.5B parameters]
    C --> C1[175B parameters]
    D --> D1[parameter count undisclosed]
    E --> E1[parameter count undisclosed, multimodal]
    A1 --> A2[Transformer decoder]
    B1 --> B2[zero-shot learning]
    C1 --> C2[in-context learning]
    D2[instruction tuning] --> D
    E2[multimodal capability] --> E

In-context Learning

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class InContextLearningDemo:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def few_shot_learning(self, examples, query):
        """Few-shot learning demonstration."""
        # Build the few-shot prompt
        prompt = self._build_few_shot_prompt(examples, query)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=len(inputs[0]) + 20,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):]  # return only the newly generated part

    def _build_few_shot_prompt(self, examples, query):
        """Build the few-shot prompt."""
        prompt = ""
        for example in examples:
            prompt += f"Input: {example['input']}\nOutput: {example['output']}\n\n"
        prompt += f"Input: {query}\nOutput:"
        return prompt

# Usage example
icl_demo = InContextLearningDemo()

# Define few-shot examples
examples = [
    {"input": "This movie was fantastic", "output": "positive"},
    {"input": "Terrible service, would not recommend", "output": "negative"},
    {"input": "The product is average, nothing special", "output": "neutral"},
]
query = "The food at this restaurant is delicious"
result = icl_demo.few_shot_learning(examples, query)
print(f"Query: {query}")
print(f"Model prediction: {result}")

Multimodal Models: Breakthroughs in Cross-modal Understanding

CLIP: Connecting Vision and Language

CLIP (Contrastive Language-Image Pre-training) uses contrastive learning to map images and text into a shared semantic space.

import torch
import requests
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

class CLIPDemonstrator:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    def image_text_similarity(self, image, text_candidates):
        """Compute the similarity between an image and several candidate texts."""
        inputs = self.processor(text=text_candidates, images=image,
                                return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Image-to-text similarity scores
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        return probs

    def zero_shot_image_classification(self, image, class_names):
        """Zero-shot image classification."""
        # Build text prompts
        text_descriptions = [f"a photo of a {class_name}" for class_name in class_names]
        # Compute similarities
        probs = self.image_text_similarity(image, text_descriptions)
        # Collect results
        results = []
        for i, class_name in enumerate(class_names):
            results.append({
                "class": class_name,
                "probability": probs[0][i].item()
            })
        # Sort by probability
        results.sort(key=lambda x: x["probability"], reverse=True)
        return results

# Usage example
def demo_clip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    clip_demo = CLIPDemonstrator()
    # Candidate classes
    class_names = ["cat", "dog", "car", "person", "building"]
    # Run zero-shot classification
    results = clip_demo.zero_shot_image_classification(image, class_names)
    print("Zero-shot image classification results:")
    for result in results:
        print(f"{result['class']}: {result['probability']:.4f}")

# Run the demo
demo_clip()

CLIP Architecture in Detail

import torch
import torch.nn as nn
import torch.nn.functional as F

class SimplifiedCLIP(nn.Module):
    """A simplified CLIP implementation."""
    def __init__(self, embed_dim=512, image_encoder=None, text_encoder=None):
        super(SimplifiedCLIP, self).__init__()
        self.image_encoder = image_encoder or SimpleImageEncoder(embed_dim)
        self.text_encoder = text_encoder or SimpleTextEncoder(embed_dim)
        # Learnable temperature parameter
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1 / 0.07)))

    def forward(self, images, texts):
        # Encode images and text
        image_features = self.image_encoder(images)
        text_features = self.text_encoder(texts)

        # Normalize the features
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)

        # Similarity matrix
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()
        return logits_per_image, logits_per_text

class SimpleImageEncoder(nn.Module):
    """A simplified image encoder (CNN-based)."""
    def __init__(self, embed_dim):
        super(SimpleImageEncoder, self).__init__()
        self.conv_layers = nn.Sequential(
            # input: 3 x 224 x 224
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 64 x 112 x 112
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 128 x 56 x 56
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),   # 256 x 28 x 28
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))   # 512 x 1 x 1
        )
        self.projection = nn.Linear(512, embed_dim)

    def forward(self, x):
        features = self.conv_layers(x)
        features = features.view(features.size(0), -1)
        return self.projection(features)

class SimpleTextEncoder(nn.Module):
    """A simplified text encoder (Transformer-based)."""
    def __init__(self, embed_dim, vocab_size=10000, max_length=77):
        super(SimpleTextEncoder, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads=8)
            for _ in range(6)
        ])
        self.ln_final = nn.LayerNorm(embed_dim)

    def forward(self, text):
        batch_size, seq_len = text.shape
        # Position indices
        positions = torch.arange(seq_len, device=text.device).unsqueeze(0)
        # Embeddings
        token_embeddings = self.token_embedding(text)
        position_embeddings = self.position_embedding(positions)
        x = token_embeddings + position_embeddings
        # Transformer blocks
        for block in self.transformer_blocks:
            x = block(x)
        # Use the last token's feature as the text representation
        x = self.ln_final(x)
        return x[:, -1, :]

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(TransformerBlock, self).__init__()
        # batch_first=True so inputs are [batch, seq, embed]
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # Feed-forward network with residual connection
        mlp_output = self.mlp(x)
        x = self.ln2(x + mlp_output)
        return x
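
The forward pass above produces similarity logits but no training objective. CLIP is trained with a symmetric cross-entropy (InfoNCE-style) loss over those logits, treating the matching image-text pair in each batch as the positive. A minimal sketch, reusing the SimplifiedCLIP defined above:

import torch
import torch.nn.functional as F

def clip_contrastive_loss(logits_per_image, logits_per_text):
    """Symmetric contrastive loss: the i-th image matches the i-th text."""
    batch_size = logits_per_image.size(0)
    targets = torch.arange(batch_size, device=logits_per_image.device)
    loss_i = F.cross_entropy(logits_per_image, targets)  # image -> text direction
    loss_t = F.cross_entropy(logits_per_text, targets)   # text -> image direction
    return (loss_i + loss_t) / 2

# Toy shape check with random inputs (untrained model, illustration only)
model = SimplifiedCLIP(embed_dim=64)
images = torch.randn(4, 3, 224, 224)
texts = torch.randint(0, 10000, (4, 77))
logits_i, logits_t = model(images, texts)
print(clip_contrastive_loss(logits_i, logits_t))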

BLIP: Bootstrapping Language-Image Pre-training

import torch
import requests
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
)

class BLIPDemo:
    def __init__(self):
        # Captioning model
        self.caption_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base")
        self.caption_model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base")
        # Dedicated VQA model (the captioning checkpoint is not trained for QA)
        self.vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    def generate_image_caption(self, image):
        """Generate an image caption."""
        inputs = self.caption_processor(image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.caption_model.generate(**inputs)
        caption = self.caption_processor.decode(outputs[0], skip_special_tokens=True)
        return caption

    def visual_question_answering(self, image, question):
        """Visual question answering."""
        inputs = self.vqa_processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.vqa_model.generate(**inputs)
        answer = self.vqa_processor.decode(outputs[0], skip_special_tokens=True)
        return answer

# Usage example
def demo_blip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    blip_demo = BLIPDemo()
    # Generate an image caption
    caption = blip_demo.generate_image_caption(image)
    print(f"Caption: {caption}")
    # Visual question answering
    question = "What animals are in this image?"
    answer = blip_demo.visual_question_answering(image, question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

# Run the demo
demo_blip()

Mixture of Experts (MoE)

MoE Fundamentals

Mixture-of-experts models expand model capacity by introducing multiple "expert" networks while keeping the computation per token roughly constant: a gating network routes each token to a small subset of experts.

import torch
import torch.nn as nn
import torch.nn.functional as F

class MixtureOfExperts(nn.Module):
    """A mixture-of-experts layer."""
    def __init__(self, input_dim, expert_dim, num_experts=8, capacity_factor=1.0):
        super(MixtureOfExperts, self).__init__()
        self.num_experts = num_experts
        # Unused in this simplified version; real systems use it to cap tokens per expert
        self.capacity_factor = capacity_factor

        # Expert networks
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, expert_dim),
                nn.GELU(),
                nn.Linear(expert_dim, input_dim)
            ) for _ in range(num_experts)
        ])
        # Gating network
        self.gate = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        batch_size, seq_len, hidden_dim = x.shape
        # Flatten tokens
        x_flat = x.reshape(-1, hidden_dim)

        # Gating weights
        gate_logits = self.gate(x_flat)               # [batch*seq_len, num_experts]
        gate_weights = F.softmax(gate_logits, dim=-1)

        # Select the top-k experts (k=2 is a common choice)
        top_k = 2
        top_k_weights, top_k_indices = torch.topk(gate_weights, top_k, dim=-1)
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)

        # Accumulate expert outputs
        output = torch.zeros_like(x_flat)

        # Route tokens to experts
        for expert_idx in range(self.num_experts):
            # Tokens that selected this expert among their top-k
            expert_mask = (top_k_indices == expert_idx).any(dim=-1)
            if expert_mask.any():
                expert_input = x_flat[expert_mask]
                expert_output = self.experts[expert_idx](expert_input)

                # Gather each token's gating weight for this expert
                weights_for_expert = torch.zeros(expert_mask.sum(), device=x.device)
                for k in range(top_k):
                    mask_at_k = top_k_indices[expert_mask, k] == expert_idx
                    weights_for_expert[mask_at_k] = top_k_weights[expert_mask, k][mask_at_k]

                # Weighted sum into the output
                output[expert_mask] += expert_output * weights_for_expert.unsqueeze(-1)

        # Restore the original shape
        output = output.reshape(batch_size, seq_len, hidden_dim)
        return output

class MoETransformerBlock(nn.Module):
    """A Transformer block with an MoE feed-forward layer."""
    def __init__(self, hidden_dim, num_heads, num_experts=8):
        super(MoETransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.moe = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        # Self-attention
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # MoE feed-forward network
        moe_output = self.moe(x)
        x = self.ln2(x + moe_output)
        return x

# Usage example
def demo_moe():
    batch_size, seq_len, hidden_dim = 2, 10, 512
    num_experts = 4

    # Build an MoE layer
    moe_layer = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
    # Random input
    x = torch.randn(batch_size, seq_len, hidden_dim)
    # Forward pass
    output = moe_layer(x)

    print(f"Input shape:  {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"MoE parameter count: {sum(p.numel() for p in moe_layer.parameters())}")

demo_moe()
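
One practical issue the layer above ignores is load balancing: left alone, the gate tends to collapse onto a few favored experts. A common remedy, used for example in the Switch Transformer, is an auxiliary loss that pushes each expert's share of routed tokens toward uniform. A sketch of that loss, written to consume the gate_weights and top_k_indices computed inside MixtureOfExperts (exposing them from forward is left as an exercise):

import torch
import torch.nn.functional as F

def load_balancing_loss(gate_weights, top_k_indices, num_experts):
    """Auxiliary loss encouraging uniform expert utilization.

    gate_weights:  softmax gate probabilities, [num_tokens, num_experts]
    top_k_indices: expert indices each token was routed to, [num_tokens, top_k]
    """
    # f_i: fraction of routed token slots dispatched to expert i
    dispatch = F.one_hot(top_k_indices, num_experts).sum(dim=1).float()
    f = dispatch.sum(dim=0) / dispatch.sum()
    # P_i: mean gate probability assigned to expert i
    p = gate_weights.mean(dim=0)
    # Minimized when both distributions are uniform (scaled by num_experts, as in Switch)
    return num_experts * torch.sum(f * p)

# During training one would add it to the task loss, e.g.:
# total_loss = task_loss + aux_weight * load_balancing_loss(gate_weights, top_k_indices, num_experts)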

Hands-on: Building a Multimodal Question-Answering System

import torch
import requests
from transformers import (
    CLIPModel, CLIPProcessor,
    BlipForQuestionAnswering, BlipProcessor,
    AutoModel, AutoTokenizer
)
from PIL import Image

class MultimodalQASystem:
    """A multimodal question-answering system."""
    def __init__(self):
        # CLIP for image-text matching
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # BLIP for visual question answering
        self.blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        # Text model for textual QA
        self.text_model = AutoModel.from_pretrained("bert-base-uncased")
        self.text_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def answer_question(self, image, question, context=None):
        """Answer a question about an image or a text passage."""
        question_type = self._classify_question_type(question)
        if question_type == "visual" and image is not None:
            return self._visual_qa(image, question)
        elif question_type == "textual" and context is not None:
            return self._text_qa(context, question)
        else:
            return self._general_qa(question)

    def _classify_question_type(self, question):
        """Classify the question type with simple keyword matching."""
        visual_keywords = ["image", "picture", "photo", "see", "look", "color", "shape"]
        textual_keywords = ["text", "document", "article", "passage", "read"]
        question_lower = question.lower()
        if any(keyword in question_lower for keyword in visual_keywords):
            return "visual"
        elif any(keyword in question_lower for keyword in textual_keywords):
            return "textual"
        else:
            return "general"

    def _visual_qa(self, image, question):
        """Visual question answering with BLIP."""
        inputs = self.blip_processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.blip_model.generate(**inputs)
        answer = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
        return answer

    def _text_qa(self, context, question):
        """Textual QA (toy span extraction, for illustration only).

        A production system would use a model with a trained QA head
        (e.g., a SQuAD-finetuned checkpoint); the heuristic below simply
        picks a short span around the highest-activation token in the
        raw BERT hidden states.
        """
        inputs = self.text_tokenizer(question, context, return_tensors="pt",
                                     max_length=512, truncation=True)
        with torch.no_grad():
            outputs = self.text_model(**inputs)
        # Mean hidden activation per token as a crude relevance score
        scores = outputs.last_hidden_state[0].mean(dim=-1)
        best_idx = torch.argmax(scores[1:-1]) + 1  # skip [CLS] and [SEP]
        answer_tokens = inputs["input_ids"][0][best_idx:best_idx + 5]
        answer = self.text_tokenizer.decode(answer_tokens, skip_special_tokens=True)
        return answer

    def _general_qa(self, question):
        """General QA from a tiny hard-coded knowledge base."""
        # A knowledge base or external API could be integrated here;
        # this simplified version returns canned answers.
        knowledge_base = {
            "what is ai": "Artificial Intelligence is the simulation of human intelligence in machines.",
            "how does machine learning work": "Machine learning uses algorithms to parse data, learn from it, and make predictions.",
            "what is deep learning": "Deep learning is a subset of machine learning using neural networks with multiple layers."
        }
        question_lower = question.lower()
        for key in knowledge_base:
            if key in question_lower:
                return knowledge_base[key]
        return "I'm sorry, I don't have enough information to answer that question."

# Usage example
def demo_multimodal_qa():
    qa_system = MultimodalQASystem()

    # Visual QA example
    try:
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        question = "What animals are in this image?"
        answer = qa_system.answer_question(image, question)
        print("Visual QA:")
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")
    except Exception:
        print("Could not load the sample image; skipping the visual QA demo")

    # Textual QA example
    context = """Artificial intelligence (AI) is intelligence demonstrated by machines,
    as opposed to natural intelligence displayed by animals including humans.
    Leading AI textbooks define the field as the study of intelligent agents:
    any system that perceives its environment and takes actions that maximize
    its chance of achieving its goals."""
    question = "What is artificial intelligence?"
    answer = qa_system.answer_question(None, question, context)
    print("Textual QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

    # General QA example
    question = "How does machine learning work?"
    answer = qa_system.answer_question(None, question)
    print("General QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")

demo_multimodal_qa()

Performance Optimization and Best Practices

Model Compression Techniques

import torch
import torch.nn.utils.prune as prune
from transformers import AutoModel, AutoTokenizer
from opacus import PrivacyEngine

class ModelOptimizer:
    """Utility class for model optimization."""

    @staticmethod
    def quantize_model(model):
        """Dynamically quantize a model to reduce memory use."""
        quantized_model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8)
        return quantized_model

    @staticmethod
    def prune_model(model, pruning_rate=0.2):
        """Prune a model to reduce its parameter count."""
        # Simple global magnitude pruning over all Linear weights
        parameters_to_prune = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                parameters_to_prune.append((module, 'weight'))
        prune.global_unstructured(
            parameters_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=pruning_rate,
        )
        return model

    @staticmethod
    def apply_differential_privacy(model, optimizer, train_loader,
                                   noise_multiplier, max_grad_norm):
        """Wrap model, optimizer, and loader for differentially private training."""
        privacy_engine = PrivacyEngine()
        model, optimizer, train_loader = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=train_loader,
            noise_multiplier=noise_multiplier,
            max_grad_norm=max_grad_norm,
        )
        return model, optimizer, train_loader

# Optimization demo
def optimization_demo():
    # Load the original model
    model = AutoModel.from_pretrained("bert-base-uncased")
    original_size = sum(p.numel() for p in model.parameters())
    print(f"Original parameter count: {original_size}")

    # Quantize the model
    quantized_model = ModelOptimizer.quantize_model(model)
    quantized_size = sum(p.numel() for p in quantized_model.parameters())
    # Note: dynamic quantization packs Linear weights into int8 buffers that no
    # longer show up in .parameters(), so this count understates the real footprint;
    # comparing serialized file sizes is a fairer measure.
    print(f"Parameter count after quantization: {quantized_size}")
    print(f"Reduction: {(1 - quantized_size / original_size) * 100:.2f}%")

    # Pruning (note: should be applied after actual training)
    # pruned_model = ModelOptimizer.prune_model(model, pruning_rate=0.2)

optimization_demo()
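
Because dynamic quantization hides the packed weights from .parameters(), a fairer comparison is the serialized size on disk. A small helper sketch; the function name and the temporary-file approach are illustrative conveniences, not a standard API:

import os
import tempfile
import torch

def serialized_size_mb(model):
    """Save a model's state_dict to a temp file and report its size in MB."""
    with tempfile.NamedTemporaryFile(delete=False) as f:
        torch.save(model.state_dict(), f.name)
        size_mb = os.path.getsize(f.name) / (1024 ** 2)
    os.remove(f.name)
    return size_mb

# Example (assuming `model` and `quantized_model` from optimization_demo):
# print(f"fp32 model:      {serialized_size_mb(model):.1f} MB")
# print(f"quantized model: {serialized_size_mb(quantized_model):.1f} MB")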

Summary

In this chapter, we explored the most important variants and extensions of the Transformer architecture:

  1. BERT: achieves deep contextual understanding through a bidirectional encoder and the MLM and NSP pre-training tasks
  2. GPT series: uses an autoregressive, decoder-only architecture and excels at text generation
  3. Multimodal models: such as CLIP and BLIP, enable cross-modal understanding between vision and language
  4. Mixture-of-experts models: scale model capacity through expert routing while preserving computational efficiency

These models have not only achieved research breakthroughs but have also demonstrated enormous value in industrial applications. Understanding their principles and implementation details is essential for building state-of-the-art AI systems.

In the next chapter, we will look at how to implement text classification with PyTorch and TensorFlow, further consolidating our understanding of these models and how to apply them.
