
BERT-Based MRPC Semantic Similarity Detection (From 0 to -1 Series)

Introduction

BERT (Bidirectional Encoder Representations from Transformers) is a pretrained model developed by Google. It is a deep bidirectional model built on the Transformer architecture: it is first pretrained on large-scale text corpora without supervision and then fine-tuned for specific NLP tasks. BERT has achieved great success across many natural language processing tasks and has become an important milestone in the NLP field.

Semantic similarity detection is an important NLP task that aims to judge the degree of semantic similarity between two text fragments. Given two text fragments as input, the model outputs a similarity score expressing how close their meanings are. The task is useful in many NLP applications, such as information retrieval, question answering, and automatic summarization.

BERT has been applied to semantic similarity detection with great success. The two text fragments are concatenated into a single input sequence and fed to the BERT model, which produces a joint semantic representation; a fine-tuning head or task-specific output layer then makes the similarity judgment. Because BERT has strong semantic modeling capacity and effectively captures the semantic relationship between texts, it achieves very good results on this task.
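Concretely, the tokenizer packs a pair into a single [CLS] sentence1 [SEP] sentence2 [SEP] sequence, with segment IDs distinguishing the two parts. A minimal sketch of this (assuming the Hugging Face transformers library and bert-base-uncased; the example sentences are made up):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode a sentence pair the way BERT expects: [CLS] A [SEP] B [SEP]
enc = tokenizer.encode_plus(
    text="He bought the company in 1995.",                  # hypothetical example
    text_pair="The company was acquired by him in 1995.",
    add_special_tokens=True,
    return_token_type_ids=True,  # 0 for segment A, 1 for segment B
)
print(tokenizer.decode(enc['input_ids']))
# -> [CLS] he bought the company in 1995. [SEP] the company was acquired by him in 1995. [SEP]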

Data Preprocessing

Analyzing the basic structure of the data

Quality #1 ID #2 ID #1 String #2 String

1 702876 702977 Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence . Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .

0 2108705 2108831 Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion . Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .

The data consists of five columns. The two ID columns have no obvious use for this task, so they are stripped out with a Python script:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: bruanlin
# datetime: 2025/4/23 13:44
# software: PyCharm
# project: [Bert-mrpc]-[data_trasform]
""" Code Describe: nothing!! """
import pandas as pd
from typing import Tuple
import logging
import os
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_mrpc_data(file_path: str) -> Tuple[pd.DataFrame, int]:
    """Load the MRPC dataset."""
    try:
        # Explicitly specify column dtypes to prevent type-inference errors
        dtype_spec = {
            '#1 ID': 'string',
            '#2 ID': 'string',
            '#1 String': 'string',
            '#2 String': 'string',
            'Quality': 'int8'
        }
        # Read the file in chunks (useful when memory is limited, e.g. on Colab)
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=dtype_spec,
            usecols=['Quality', '#1 String', '#2 String'],  # only the columns we need
            chunksize=1000,
            on_bad_lines='warn'  # skip malformed lines with a warning
        )
        df = pd.concat(chunks)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except pd.errors.ParserError as e:
        logger.error(f"TSV parsing error: {str(e)}")
        raise

    # Record the initial row count
    original_rows = df.shape[0]

    # Data cleaning
    df = df.dropna(subset=['#1 String', '#2 String'])  # drop rows with missing sentences
    df = df.rename(columns={
        'Quality': 'label',
        '#1 String': 'sentence1',
        '#2 String': 'sentence2'
    })

    # Log the cleaning result
    cleaned_rows = df.shape[0]
    logger.info(
        f"Cleaning finished: original rows={original_rows}, valid rows={cleaned_rows}, "
        f"dropped rows={original_rows - cleaned_rows}"
    )
    return df.reset_index(drop=True), (original_rows - cleaned_rows)


def save_processed_data(df: pd.DataFrame,
                        output_dir="processed_data",
                        file_name="mrpc_processed",
                        formats='tsv') -> None:
    """Save the processed data."""
    # Create the output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Ensure the column order is correct
    df = df[['label', 'sentence1', 'sentence2']]

    # Save in the requested format
    try:
        full_path = os.path.join(output_dir, f"{file_name}.{formats}")
        if formats == 'tsv':
            # TSV format (the default)
            df.to_csv(
                full_path,
                sep='\t',
                index=False,
                header=True,   # adjust as needed
                quoting=3,     # csv.QUOTE_NONE, consistent with the raw data
                escapechar='\\'
            )
        elif formats == 'csv':
            # CSV format (with column names)
            df.to_csv(
                full_path,
                index=False,
                quotechar='"',
                escapechar='\\'
            )
        else:
            raise ValueError(f"Unsupported format: {formats}")
        print(f"Saved {full_path} ({os.path.getsize(full_path) / 1024:.1f}KB)")
    except Exception as e:
        print(f"Failed to save {formats} format: {str(e)}")


# Usage example
if __name__ == "__main__":
    try:
        train_df, train_dropped = load_mrpc_data("DATASET/MRPC/train.tsv")
        dev_df, dev_dropped = load_mrpc_data("DATASET/MRPC/dev.tsv")

        # Show the sample structure
        print("\nTraining set sample:")
        print(train_df[['label', 'sentence1', 'sentence2']].head(3))

        # Label distribution
        print("\nLabel distribution:")
        print(train_df['label'].value_counts(normalize=True))

        # Save the data
        try:
            save_processed_data(train_df, "./DATASET/train_processed",
                                "train_processed_mrpc", "tsv")
            save_processed_data(dev_df, "./DATASET/dev_processed",
                                "dev_processed_mrpc", "tsv")
        except Exception as e:
            logger.error(f"Failed to save data: {e}")
    except Exception as e:
        logger.error("Failed to load data; check the file path and format")

The processed experimental data looks like this:

label sentence1 sentence2

1 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . The foodservice pie business does not fit our long-term growth strategy .
0 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .

Creating the Dataset and DataLoader

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Hyperparameter settings
Max_length = 128
Batch_size = 32
Lr = 2e-5
Epochs = 10
Model_name = 'bert-base-uncased'
train_tsv_path = "/kaggle/input/mrpc-bert/train_processed_mrpc.tsv"
dev_tsv_path = "/kaggle/input/mrpc-bert/dev_processed_mrpc.tsv"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading
def load_mrpc_data(file_path):
    """Load and preprocess a processed MRPC TSV file."""
    try:
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=str,
            usecols=['label', 'sentence1', 'sentence2'],  # key change
            chunksize=1000,
            on_bad_lines='warn'
        )
        df = pd.concat(chunks)
        # Type conversion
        df['label'] = pd.to_numeric(df['label'], errors='coerce').astype('Int8')
        df = df.dropna().reset_index(drop=True)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None  # return an explicit empty value on failure


# Custom Dataset
class MRPCDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Extract a single row
        row = self.data.iloc[index]
        sentence1 = str(row['sentence1'])
        sentence2 = str(row['sentence2'])
        label = int(row['label'])

        # Tokenize and encode the sentence pair
        encoding = self.tokenizer.encode_plus(
            text=sentence1,
            text_pair=sentence2,
            add_special_tokens=True,   # add [CLS], [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',       # return PyTorch tensors
            return_token_type_ids=True,
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Initialize components
tokenizer = BertTokenizer.from_pretrained(
    Model_name,
    cache_dir="./huggingface_models",                # cache directory
    mirror='https://mirror.sjtu.edu.cn/huggingface'  # SJTU mirror
)
df = load_mrpc_data(train_tsv_path)
dataset = MRPCDataset(df, tokenizer, max_len=Max_length)

# Create the DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=Batch_size,
    shuffle=True,
    num_workers=4,    # multi-process loading
    pin_memory=True   # speed up host-to-GPU transfers
)

# Load the validation set
dev_df = load_mrpc_data(dev_tsv_path)
dev_dataset = MRPCDataset(dev_df, tokenizer, Max_length)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=Batch_size,
    shuffle=False,
    num_workers=2
)
Training
Training setup
Training platform	Kaggle GPU T4
Optimizer	AdamW
Max_length	128
Batch_size	32
Lr	2e-5
Epochs	30

# ==================== Build training components ===============
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained(
    Model_name,
    num_labels=2,         # binary classification
    force_download=True,  # force a fresh download
    mirror='https://mirror.sjtu.edu.cn/huggingface',
    cache_dir="./huggingface_models"
)
model.to(device)

# =============== Optimizer ==================
# Revised optimizer initialization (AdamW with weight decay)
optimizer = AdamW(model.parameters(), lr=Lr, weight_decay=0.01)
total_steps = len(dataloader) * Epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
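With num_warmup_steps=0, this schedule simply decays the learning rate linearly from Lr down to 0 over total_steps. A small self-contained check of that behavior (a dummy one-parameter optimizer; the step counts here are illustrative):

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Dummy one-parameter optimizer, just to observe the schedule
param = torch.nn.Parameter(torch.zeros(1))
opt = AdamW([param], lr=2e-5)
sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=0, num_training_steps=100)

for step in range(100):
    opt.step()
    sched.step()
    if step % 25 == 0:
        print(step, sched.get_last_lr())  # lr shrinks linearly toward 0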

Training function

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip gradients to stabilize training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    return total_loss / len(dataloader)

Evaluation function

from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy, f1


if __name__ == "__main__":
    # Inspect the first batch
    sample_batch = next(iter(dataloader))
    print("Batch tensor shapes:")
    print(f"Input IDs: {sample_batch['input_ids'].shape}")
    print(f"Attention Mask: {sample_batch['attention_mask'].shape}")
    print(f"Token Type IDs: {sample_batch['token_type_ids'].shape}")
    print(f"Labels: {sample_batch['label'].shape}")

    # Decode one sample as a sanity check
    print("\nDecoded first sample:")
    print(tokenizer.decode(sample_batch['input_ids'][0]))

    print("======== training model ===========")
    # Main training loop
    metrics_data = []  # metric history
    best_f1 = 0
    for epoch in range(Epochs):
        print(f"\nEpoch {epoch + 1}/{Epochs}")
        print("-" * 40)

        # Training phase
        train_loss = train_epoch(model, dataloader, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Validation phase
        val_loss, val_acc, val_f1 = evaluate(model, dev_dataloader, device)
        print(f"Val Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f} | F1: {val_f1:.4f}")

        # Record the metrics
        metrics_data.append({
            'Epoch': epoch + 1,
            'Train Loss': round(train_loss, 4),
            'Val Loss': round(val_loss, 4),
            'Accuracy': round(val_acc, 4),
            'F1 Score': round(val_f1, 4)
        })
        metrics_df = pd.DataFrame([metrics_data[-1]])  # latest row only
        if epoch == 0:
            metrics_df.to_excel("training_metrics.xlsx", index=False)
        else:
            # Append the new row below the existing ones
            with pd.ExcelWriter("training_metrics.xlsx", mode='a', engine='openpyxl',
                                if_sheet_exists='overlay') as writer:
                metrics_df.to_excel(writer, index=False, header=False,
                                    startrow=epoch + 1)

        # Save the best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            model.save_pretrained("./best_model")
            tokenizer.save_pretrained("./best_model")
            print(f"New best model saved with F1: {val_f1:.4f}")

    print("\nTraining completed!")
    print(f"Best Validation F1: {best_f1:.4f}")

Overall Results and Summary

Using the pretrained bert-base-uncased model, evaluation on the MRPC dataset gives the following results:

Train Loss	0.0024
Val Loss	1.6600
Accuracy	0.8247
F1	0.8811
Best Validation F1	0.8862

Next, analyze and compare against existing methods, write up a short summary report, and look for ways to improve performance (the goal is not to beat every method, just to get close), building a table that compares our method's performance with the others'.
Existing methods for similarity detection

Model comparison

[Figure: model comparison]
The training results and hyperparameter settings are as follows:
[Figure: training results and hyperparameter settings]

Summary comparison


[Figure: performance comparison with existing methods]

Optimizations and Improvements

● Data: data augmentation
● Training: layer-wise learning rates with AdamW
● Training: hyperparameter search
● Model: model ensembling

Data Augmentation

[Figure: data augmentation]
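The text does not spell out the augmentation scheme, so as one plausible sketch: a cheap random word-dropout/adjacent-swap applied to sentences with probability aug_prob (the same knob searched by Optuna below). The function name and logic here are illustrative assumptions, not the original implementation:

import random

def augment_sentence(sentence: str, aug_prob: float = 0.2) -> str:
    """Randomly drop a word or swap adjacent words -- a simple,
    roughly label-preserving augmentation (illustrative sketch)."""
    words = sentence.split()
    out, i = [], 0
    while i < len(words):
        r = random.random()
        if r < aug_prob / 2 and len(words) > 3:
            i += 1                                # drop this word
        elif r < aug_prob and i + 1 < len(words):
            out.extend([words[i + 1], words[i]])  # swap with the next word
            i += 2
        else:
            out.append(words[i])
            i += 1
    return " ".join(out)

print(augment_sentence("The dollar was flat on the session .", aug_prob=0.3))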

Layer-wise Learning Rates

from transformers import AutoModelForSequenceClassification

def create_model(model_name, num_labels=2):
    """Create a model with layer-wise parameter groups."""
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )
    # Parameter grouping
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {   # embedding layers: lower learning rate, no weight decay
            "params": [p for n, p in model.named_parameters()
                       if "embeddings" in n and not any(nd in n for nd in no_decay)],
            "lr": config['lr'] * 0.1,
            "weight_decay": 0.0
        },
        {   # classifier head: higher learning rate
            "params": [p for n, p in model.named_parameters()
                       if "classifier" in n],
            "lr": config['lr'] * 10,
            "weight_decay": 0.01
        }
    ]
    return model, optimizer_grouped_parameters

● Layering logic (see the sanity check below)
  ○ Parameters excluded from weight decay
    ■ Bias parameters: offsets need no regularization (an overly strong L2 penalty reduces model flexibility)
    ■ LayerNorm parameters: normalization layers carry their own scaling, and extra regularization can distort the learned distribution
  ○ Embedding-layer setting: "lr": config['lr'] * 0.1, "weight_decay": 0.0
  ○ Classifier setting: "lr": config['lr'] * 10, "weight_decay": 0.01
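As a quick sanity check of the grouping (assuming config = {'lr': 2e-5} to match the training setup above), the groups can be passed straight to AdamW, where each group's own 'lr' and 'weight_decay' override the optimizer defaults:

from torch.optim import AdamW

config = {'lr': 2e-5}  # assumed here, matching the training setup above
model, groups = create_model('bert-base-uncased')
optimizer = AdamW(groups)  # per-group 'lr' / 'weight_decay' take precedence

for i, g in enumerate(optimizer.param_groups):
    n_params = sum(p.numel() for p in g['params'])
    print(f"group {i}: lr={g['lr']:.0e}, weight_decay={g['weight_decay']}, params={n_params}")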

Hyperparameter Search

import optuna

def objective(trial):
    """Optuna objective for hyperparameter optimization."""
    # Suggested search ranges
    config.update({
        'lr': trial.suggest_float('lr', 1e-6, 5e-5, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'aug_prob': trial.suggest_float('aug_prob', 0.1, 0.3)
    })

    # Initialize the model ensemble
    models, optimizers, schedulers = [], [], []
    for model_name in config['model_names']:
        model, params = create_model(model_name)
        model.to(device)
        optimizer = AdamW(params)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            num_training_steps=len(train_loader) * config['epochs']
        )
        models.append(model)
        optimizers.append(optimizer)
        schedulers.append(scheduler)

    # Training loop
    best_f1 = 0
    for epoch in range(config['epochs']):
        for model, optimizer, scheduler in zip(models, optimizers, schedulers):
            train_epoch(model, train_loader, optimizer, scheduler, device)
        metrics = evaluate_ensemble(models, dev_loader, device)
        trial.report(metrics['f1'], epoch)
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
        if trial.should_prune():
            raise optuna.TrialPruned()
    return best_f1

Model ensemble evaluation
"model_names": ['bert-base-uncased','roberta-base','google/electra-small-discriminator'],
"ensemble_weights": [0.4, 0.3, 0.3]def evaluate_ensemble(models, dataloader, device):"""集成模型评估"""all_logits = []true_labels = []for model in models:model.eval()model_logits = []with torch.no_grad():for batch in tqdm(dataloader):inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}outputs = model(**inputs)model_logits.append(outputs.logits.cpu().numpy())if len(true_labels) == 0:true_labels.extend(batch['label'].cpu().numpy())all_logits.append(np.concatenate(model_logits))# 加权集成weighted_logits = np.zeros_like(all_logits[0])for i, weight in enumerate(config['ensemble_weights']):weighted_logits += all_logits[i] * weightpredictions = np.argmax(weighted_logits, axis=1)return {'accuracy': accuracy_score(true_labels, predictions),'f1': f1_score(true_labels, predictions)}