
MRPC Semantic Similarity Detection with BERT-family Models (From 0 to -1 Series)

Introduction

BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained model developed by Google. It is a deep bidirectional model built on the Transformer architecture: it is pre-trained on large-scale text corpora without supervision and then fine-tuned for individual NLP tasks. BERT has been hugely successful across a range of natural language processing tasks and has become an important milestone in the NLP field.

Semantic similarity detection is an important NLP task: given two text segments, the goal is to judge how semantically similar they are. The model takes the two segments as input and outputs a similarity score expressing the degree of semantic similarity between them. The task is useful in many NLP applications, such as information retrieval, question answering, and automatic summarization.

BERT has been applied very successfully to this task. The two text segments are concatenated and fed into the model, which produces a joint semantic representation; a fine-tuned classification head (or other task-specific output layer) then judges their similarity. Because BERT has strong representational power and effectively captures the semantic relationship between texts, it achieves very good results on semantic similarity detection.
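To make that pipeline concrete, here is a minimal sketch of the pair-encoding scheme (separate from the training code later in this post). It packs the two segments into a single [CLS] A [SEP] B [SEP] sequence and reads a paraphrase probability off a two-label classification head. The checkpoint name textattack/bert-base-uncased-MRPC is just one publicly shared MRPC fine-tune used for illustration, and the label order is an assumption.

# Minimal sketch: scoring one sentence pair with a BERT sequence classifier.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "textattack/bert-base-uncased-MRPC"  # illustrative checkpoint, not this post's model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

# Passing two texts builds the joint input: [CLS] A [SEP] B [SEP]
enc = tokenizer("The company posted record profits.",
                "Profits at the company hit an all-time high.",
                return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits           # shape (1, 2)
# Index 1 is assumed to be the "paraphrase" label here
prob = torch.softmax(logits, dim=-1)[0, 1].item()
print(f"paraphrase probability: {prob:.3f}")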

Data Preprocessing

Analyzing the basic data structure

Quality	#1 ID	#2 ID	#1 String	#2 String
1	702876	702977	Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .	Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
0	2108705	2108831	Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .	Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .

The data consists of five columns. The two ID columns are of no obvious use for this task, so they are removed with a Python script:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: bruanlin
# datetime: 2025/4/23 13:44
# software: PyCharm
# project: [Bert-mrpc]-[data_trasform]
""" Code Describe : nothing!!
"""
import pandas as pd
from typing import Tuple
import logging
import os
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_mrpc_data(file_path: str) -> Tuple[pd.DataFrame, int]:
    """Load the MRPC dataset."""
    try:
        # Declare column dtypes explicitly to avoid type-inference errors
        dtype_spec = {
            '#1 ID': 'string',
            '#2 ID': 'string',
            '#1 String': 'string',
            '#2 String': 'string',
            'Quality': 'int8'
        }
        # Read large files in chunks (useful when memory is tight, e.g. on Colab)
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=dtype_spec,
            usecols=['Quality', '#1 String', '#2 String'],  # keep only the columns we need
            chunksize=1000,
            on_bad_lines='warn'  # skip malformed lines
        )
        df = pd.concat(chunks)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except pd.errors.ParserError as e:
        logger.error(f"TSV parsing error: {str(e)}")
        raise

    # Record the initial row count
    original_rows = df.shape[0]

    # Data cleaning
    df = df.dropna(subset=['#1 String', '#2 String'])  # drop rows with missing sentences
    df = df.rename(columns={
        'Quality': 'label',
        '#1 String': 'sentence1',
        '#2 String': 'sentence2'
    })

    # Log the cleaning result
    cleaned_rows = df.shape[0]
    logger.info(
        f"Cleaning done: original rows={original_rows}, valid rows={cleaned_rows}, "
        f"dropped rows={original_rows - cleaned_rows}"
    )
    return df.reset_index(drop=True), (original_rows - cleaned_rows)


def save_processed_data(df: pd.DataFrame,
                        output_dir="processed_data",
                        file_name="mrpc_processed",
                        formats='tsv') -> None:
    """Save the processed data."""
    # Create the output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Make sure the column order is correct
    df = df[['label', 'sentence1', 'sentence2']]

    # Save in the requested format
    try:
        full_path = os.path.join(output_dir, f"{file_name}.{formats}")
        if formats == 'tsv':
            # TSV (the default format)
            df.to_csv(
                full_path,
                sep='\t',
                index=False,
                header=True,    # adjust as needed
                quoting=3,      # consistent with the raw data
                escapechar='\\'
            )
        elif formats == 'csv':
            # CSV (with header row)
            df.to_csv(
                full_path,
                index=False,
                quotechar='"',
                escapechar='\\'
            )
        else:
            raise ValueError(f"Unsupported format: {formats}")
        print(f"Saved {full_path} ({os.path.getsize(full_path) / 1024:.1f}KB)")
    except Exception as e:
        print(f"Failed to save {formats} format: {str(e)}")


# Usage example
if __name__ == "__main__":
    try:
        train_df, train_dropped = load_mrpc_data("DATASET/MRPC/train.tsv")
        dev_df, dev_dropped = load_mrpc_data("DATASET/MRPC/dev.tsv")

        # Show the sample structure
        print("\nTraining set sample:")
        print(train_df[['label', 'sentence1', 'sentence2']].head(3))

        # Label distribution
        print("\nLabel distribution:")
        print(train_df['label'].value_counts(normalize=True))

        # Save the data
        try:
            save_processed_data(train_df, "./DATASET/train_processed", "train_processed_mrpc", "tsv")
            save_processed_data(dev_df, "./DATASET/dev_processed", "dev_processed_mrpc", "tsv")
        except Exception as e:
            logger.error(f"Failed to save data: {e}")
    except Exception as e:
        logger.error("Data loading failed; check the file path and format")

After removing the ID columns, the data looks like this:

label	sentence1	sentence2
1	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	The foodservice pie business does not fit our long-term growth strategy .
0	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .

Creating the Dataset and DataLoader

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Hyperparameters
Max_length = 128
Batch_size = 32
Lr = 2e-5
Epochs = 10
Model_name = 'bert-base-uncased'
train_tsv_path = "/kaggle/input/mrpc-bert/train_processed_mrpc.tsv"
dev_tsv_path = "/kaggle/input/mrpc-bert/dev_processed_mrpc.tsv"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading
def load_mrpc_data(file_path):
    """Load and preprocess a processed MRPC TSV file."""
    try:
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=str,
            usecols=['label', 'sentence1', 'sentence2'],  # key change
            chunksize=1000,
            on_bad_lines='warn'
        )
        df = pd.concat(chunks)
        # Type conversion
        df['label'] = pd.to_numeric(df['label'], errors='coerce').astype('Int8')
        df = df.dropna().reset_index(drop=True)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None  # return an explicit empty value on failure

# Custom Dataset
class MRPCDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Pull out a single row
        row = self.data.iloc[index]
        sentence1 = str(row['sentence1'])
        sentence2 = str(row['sentence2'])
        label = int(row['label'])

        # Tokenize and encode the sentence pair
        encoding = self.tokenizer.encode_plus(
            text=sentence1,
            text_pair=sentence2,
            add_special_tokens=True,    # add [CLS], [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',        # return PyTorch tensors
            return_token_type_ids=True,
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize components
tokenizer = BertTokenizer.from_pretrained(
    Model_name,
    cache_dir="./huggingface_models",                 # cache directory
    mirror='https://mirror.sjtu.edu.cn/huggingface'   # SJTU mirror
)
df = load_mrpc_data(train_tsv_path)
dataset = MRPCDataset(df, tokenizer, max_len=Max_length)

# Create the DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=Batch_size,
    shuffle=True,
    num_workers=4,    # multi-process loading
    pin_memory=True   # speed up host-to-GPU transfer
)

# Load the validation set
dev_df = load_mrpc_data(dev_tsv_path)
dev_dataset = MRPCDataset(dev_df, tokenizer, Max_length)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=Batch_size,
    shuffle=False,
    num_workers=2
)
Training

Training setup

Training platform	Kaggle GPU T4
Optimizer	AdamW
Max_length	128
Batch_size	32
Lr	2e-5
Epochs	30

# ==================== Create training components ===============
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained(
    Model_name,
    num_labels=2,          # binary classification task
    force_download=True,   # force a fresh download
    mirror='https://mirror.sjtu.edu.cn/huggingface',
    cache_dir="./huggingface_models"
)
model.to(device)

# =============== Optimizer ==================
# Optimizer initialization (revised)
optimizer = AdamW(model.parameters(), lr=Lr, weight_decay=0.01)
total_steps = len(dataloader) * Epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Training function

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Gradient clipping to stabilize training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    return total_loss / len(dataloader)

Evaluation function

from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy, f1


if __name__ == "__main__":
    # Inspect the first batch
    sample_batch = next(iter(dataloader))
    print("Batch tensor shapes:")
    print(f"Input IDs: {sample_batch['input_ids'].shape}")
    print(f"Attention Mask: {sample_batch['attention_mask'].shape}")
    print(f"Token Type IDs: {sample_batch['token_type_ids'].shape}")
    print(f"Labels: {sample_batch['label'].shape}")

    # Decode an example
    print("\nDecoded first sample:")
    print(tokenizer.decode(sample_batch['input_ids'][0]))

    print("======== training model ===========")
    # Main training loop
    # List for storing per-epoch metrics
    metrics_data = []
    best_f1 = 0
    for epoch in range(Epochs):
        print(f"\nEpoch {epoch + 1}/{Epochs}")
        print("-" * 40)

        # Training phase
        train_loss = train_epoch(model, dataloader, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Validation phase
        val_loss, val_acc, val_f1 = evaluate(model, dev_dataloader, device)
        print(f"Val Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f} | F1: {val_f1:.4f}")

        # Record metrics
        metrics_data.append({
            'Epoch': epoch + 1,
            'Train Loss': round(train_loss, 4),
            'Val Loss': round(val_loss, 4),
            'Accuracy': round(val_acc, 4),
            'F1 Score': round(val_f1, 4)
        })
        metrics_df = pd.DataFrame([metrics_data[-1]])  # latest row only
        if epoch == 0:
            metrics_df.to_excel("training_metrics.xlsx", index=False)
        else:
            with pd.ExcelWriter("training_metrics.xlsx", mode='a', engine='openpyxl',
                                if_sheet_exists='overlay') as writer:
                metrics_df.to_excel(writer, index=False, header=False, startrow=epoch + 1)

        # Save the best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            model.save_pretrained("./best_model")
            tokenizer.save_pretrained("./best_model")
            print(f"New best model saved with F1: {val_f1:.4f}")

    print("\nTraining completed!")
    print(f"Best Validation F1: {best_f1:.4f}")

Overall Results and Experiment Summary

Using the pre-trained bert-base-uncased model, testing on the MRPC dataset gives the following results:

Train Loss	0.0024
Val Loss	1.6600
Accuracy	0.8247
F1	0.8811
Best Validation F1	0.8862

Next, we analyze and compare existing methods, put together a brief summary report, and look for ways to improve performance (the method does not have to beat every alternative; getting close is enough), building a table that compares our method's performance against the others.
Existing methods for similarity detection

Model comparison

[Figure: model comparison]
The training results and hyperparameter settings were as follows:
[Figure: training results and hyperparameter settings]

Summary comparison

[Figure: performance comparison between our method and existing methods]

Optimizations and Improvements

  • Data: data augmentation
  • Training: layer-wise learning rates with AdamW
  • Training: hyperparameter search
  • Model: model ensembling

Data Augmentation

[Figure: data augmentation setup]
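The post documents its augmentation setup only in the figure above, so the exact technique is not recoverable. As a stand-in, the sketch below implements one common lightweight scheme (random word deletion plus an adjacent-word swap), driven by the same aug_prob knob that the Optuna search further down tunes; the function augment_sentence and its details are illustrative assumptions, not the post's actual pipeline.

import random

def augment_sentence(sentence: str, aug_prob: float = 0.2,
                     rng: random.Random = None) -> str:
    """Illustrative augmentation: randomly delete one word and/or swap an
    adjacent word pair, each with probability aug_prob."""
    rng = rng or random.Random()
    words = sentence.split()
    if len(words) < 4:               # too short to perturb safely
        return sentence
    out = list(words)
    if rng.random() < aug_prob:      # random word deletion
        out.pop(rng.randrange(len(out)))
    if rng.random() < aug_prob:      # adjacent-word swap
        i = rng.randrange(len(out) - 1)
        out[i], out[i + 1] = out[i + 1], out[i]
    return " ".join(out)

# Example: augment sentence1 while keeping sentence2 and the label intact
print(augment_sentence("Yucaipa owned Dominick 's before selling the chain .",
                       aug_prob=0.3, rng=random.Random(42)))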

Layer-wise Learning Rates

def create_model(model_name, num_labels=2):
    """Create a model together with layer-wise parameter groups."""
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )
    # Parameter grouping
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if "embeddings" in n and not any(nd in n for nd in no_decay)],
            "lr": config['lr'] * 0.1,
            "weight_decay": 0.0
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if "classifier" in n],
            "lr": config['lr'] * 10,
            "weight_decay": 0.01
        }
    ]
    return model, optimizer_grouped_parameters

● Grouping logic
  ○ Parameters excluded from weight decay
    ■ Bias parameters: offsets need no regularization (an overly large L2 penalty reduces model flexibility)
    ■ LayerNorm parameters: normalization layers already carry their own scaling, so extra regularization can distort the learned distribution
  ○ Embedding layer: "lr": config['lr'] * 0.1, "weight_decay": 0.0
  ○ Classifier head: "lr": config['lr'] * 10, "weight_decay": 0.01 (see the usage sketch after this list)
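One caveat with create_model as written: it returns groups only for the embedding and classifier parameters, so passing that list straight to the optimizer would leave the encoder blocks without updates. Below is a minimal usage sketch (assuming a config dict holding the same lr used elsewhere in this post) that adds a hypothetical third group covering the remaining parameters at the base learning rate:

from torch.optim import AdamW  # torch's AdamW, as in the training section

config = {'lr': 2e-5}  # assumed; matches the Lr used earlier

model, grouped_params = create_model('bert-base-uncased')

# Hypothetical third group: everything not already covered, at the base lr
covered = {id(p) for g in grouped_params for p in g['params']}
grouped_params.append({
    "params": [p for p in model.parameters() if id(p) not in covered],
    "lr": config['lr'],
    "weight_decay": 0.01
})

optimizer = AdamW(grouped_params)  # each group keeps its own lr / weight_decay
for g in optimizer.param_groups:
    print(f"lr={g['lr']:.0e}, weight_decay={g['weight_decay']}, tensors={len(g['params'])}")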

Hyperparameter Search

import numpy as np
import optuna

def objective(trial):
    """Optuna objective for hyperparameter optimization."""
    # Suggested search ranges
    config.update({
        'lr': trial.suggest_float('lr', 1e-6, 5e-5, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'aug_prob': trial.suggest_float('aug_prob', 0.1, 0.3)
    })

    # Initialize the set of models
    models = []
    optimizers = []
    schedulers = []
    for model_name in config['model_names']:
        model, params = create_model(model_name)
        model.to(device)
        optimizer = AdamW(params)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            num_training_steps=len(train_loader) * config['epochs']
        )
        models.append(model)
        optimizers.append(optimizer)
        schedulers.append(scheduler)

    # Training loop
    best_f1 = 0
    for epoch in range(config['epochs']):
        for model, optimizer, scheduler in zip(models, optimizers, schedulers):
            train_epoch(model, train_loader, optimizer, scheduler, device)
        metrics = evaluate_ensemble(models, dev_loader, device)
        trial.report(metrics['f1'], epoch)
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
        if trial.should_prune():
            raise optuna.TrialPruned()
    return best_f1

Model ensemble evaluation

# Ensemble configuration (these keys live in the same config dict used above)
config.update({
    "model_names": [
        'bert-base-uncased',
        'roberta-base',
        'google/electra-small-discriminator'
    ],
    "ensemble_weights": [0.4, 0.3, 0.3]
})

def evaluate_ensemble(models, dataloader, device):
    """Evaluate the weighted model ensemble."""
    all_logits = []
    true_labels = []
    for model_idx, model in enumerate(models):
        model.eval()
        model_logits = []
        with torch.no_grad():
            for batch in tqdm(dataloader):
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                outputs = model(**inputs)
                model_logits.append(outputs.logits.cpu().numpy())
                # Collect the labels once, during the first model's pass
                if model_idx == 0:
                    true_labels.extend(batch['label'].cpu().numpy())
        all_logits.append(np.concatenate(model_logits))

    # Weighted averaging of the per-model logits
    weighted_logits = np.zeros_like(all_logits[0])
    for i, weight in enumerate(config['ensemble_weights']):
        weighted_logits += all_logits[i] * weight
    predictions = np.argmax(weighted_logits, axis=1)
    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'f1': f1_score(true_labels, predictions)
    }
