当前位置: 首页 > news >正文

Python 实现将 COQE 数据转换成字符串格式

问题描述:

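原始数据存储在一个 .txt 文件中,存储格式如下(示例在抓取时丢失,以下根据代码的解析逻辑整理): 每个句子占一行,由制表符(\t)分隔成"句子"和"标签"两个字段; 句子行之后紧跟一行或多行五元组标注,每行形如 `[[sub];[obj];[asp];[opi];[polarity]]`,其中每个词写成 `下标&词`(路径中包含 Camera 的数据集使用 `&&` 分隔),词与词之间用空格分隔,polarity 取值为 1、0、-1、2 或为空。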

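如何将按照上面格式存储的内容,转换成下面的格式(方便 DiaCOQE 处理): 每个句子输出一行,形如 `句子####[('sub', 'obj', 'asp', 'opi', 'Better'), ...]`,即句子文本后接 `####`,再接由五元组组成的列表; 每个片段中的词去掉下标后直接拼接,极性 1、0、-1、2 分别映射为 Better、Equal、Worse、Different,极性为空时输出空字符串。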

问题解决:

from pdb import set_trace as stop
import os
import re
from tqdm import trange
generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
Unicoqe_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devss.txt"

# Read the whole input up front so the progress bar below knows the line count.
# The data is Chinese text, so the encoding is pinned instead of relying on the
# platform default.
with open(generated_path, 'r', encoding='utf-8') as f:
    raw_data = f.readlines()

# Maps the raw polarity code found in a label to the category name written out.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str, cur_path):
    """Join the words of one bracketed span into a single string.

    ``cur_str`` is a bracketed, whitespace-separated list of ``index<sep>word``
    tokens, e.g. ``[3&foo 4&bar]``. The separator is ``&&`` when ``cur_path``
    contains ``Camera`` and ``&`` otherwise. Tokens without any ``&`` carry no
    word and contribute nothing to the result.

    Raises:
        ValueError: if a token containing ``&`` does not split into exactly
            an index and a word with the selected separator.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    words = []
    for pair in cur_str.strip()[1:-1].split():
        if '&' in pair:
            _index, word = pair.split(split_sign)
            words.append(word)
    return ''.join(words)


def _format_record(text_line, label_line):
    """Render one sentence line plus its accumulated label lines for output.

    ``label_line`` is the concatenation of ``'\n' + raw_line`` for every label
    line of the sentence, which is why individual labels are separated by a
    blank line (``'\n\n'``) after stripping.

    Raises:
        ValueError: if a label does not have five ``;``-separated parts, a
            span token is malformed, or the polarity code is unknown.
    """
    sent = text_line.strip().split("\t")[0]
    span_words = []
    for label_i in label_line.strip().split('\n\n'):
        label_i = label_i.strip()
        if not label_i:
            # Blank lines between or after labels produce empty entries.
            continue
        try:
            sub, obj, asp, opi, polarity = label_i[1:-1].split(';')
            sub_span = obtain_span_str(sub, generated_path)
            obj_span = obtain_span_str(obj, generated_path)
            asp_span = obtain_span_str(asp, generated_path)
            opi_span = obtain_span_str(opi, generated_path)
            # "[]" (length 2) means no polarity was annotated.
            por_span = '' if len(polarity) == 2 else polarity_dict[polarity[1:-1]]
        except (ValueError, KeyError) as err:
            # Fail loudly with the offending sentence instead of continuing
            # with unbound or stale span values.
            raise ValueError(
                "malformed label {!r} for sentence line {!r}".format(label_i, text_line)
            ) from err
        span_words.append((sub_span, obj_span, asp_span, opi_span, por_span))
    return sent + "####" + '[' + ', '.join(str(span) for span in span_words) + "]\n"


with open(Unicoqe_path, 'w', encoding='utf-8') as fw:
    text_line, label_line = '', ''
    for line_id in trange(len(raw_data), desc="procesing data ……"):
        cur_line = raw_data[line_id]
        if len(cur_line.split('\t')) != 2:
            # Not a "sentence<TAB>label" line, so it is a label line
            # belonging to the current sentence.
            label_line += '\n' + cur_line
            continue
        # A new sentence starts: flush the previous one, if any.
        if text_line != '':
            fw.write(_format_record(text_line, label_line))
        text_line, label_line = cur_line, ''
    # Flush the last sentence; nothing is written for an input with no sentences.
    if text_line != '':
        fw.write(_format_record(text_line, label_line))

优化版代码如下:

import os
import re
from tqdm import trange
from pdb import set_trace as stop  # noqa: F401  (kept available for ad-hoc debugging; no longer called)

# Maps the raw polarity code found in a label to the category name written out.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str: str, cur_path: str) -> str:
    """Join the words of one bracketed span into a single string.

    ``cur_str`` is a bracketed, whitespace-separated list of ``index<sep>word``
    tokens, e.g. ``[3&foo 4&bar]``. The separator is ``&&`` when ``cur_path``
    contains ``Camera`` and ``&`` otherwise. Tokens without any ``&`` carry no
    word and contribute nothing to the result.

    Raises:
        ValueError: if a token containing ``&`` does not split into exactly
            an index and a word with the selected separator.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    words = []
    for pair in cur_str.strip()[1:-1].split():
        if '&' in pair:
            _index, word = pair.split(split_sign)
            words.append(word)
    return ''.join(words)


def process_label_line(label_list, generated_path: str, text_line: str) -> list:
    """Convert raw label strings into ``(sub, obj, asp, opi, polarity)`` tuples.

    Args:
        label_list: label strings of the form ``[[sub];[obj];[asp];[opi];[pol]]``.
            Blank entries are skipped.
        generated_path: path of the source file; only used by
            ``obtain_span_str`` to pick the token separator.
        text_line: the sentence line the labels belong to, used for error
            context only.

    Returns:
        One 5-tuple of strings per non-blank label. The polarity is mapped
        through ``polarity_dict``, or is ``''`` when the label's polarity is
        the empty ``[]``.

    Raises:
        ValueError: if a label does not have five ``;``-separated parts, a
            span token is malformed, or the polarity code is unknown.
    """
    span_words = []
    for label_i in label_list:
        label_i = label_i.strip()
        if not label_i:
            # Blank lines between or after labels produce empty entries.
            continue
        try:
            sub, obj, asp, opi, polarity = label_i[1:-1].split(';')
            sub_span = obtain_span_str(sub, generated_path)
            obj_span = obtain_span_str(obj, generated_path)
            asp_span = obtain_span_str(asp, generated_path)
            opi_span = obtain_span_str(opi, generated_path)
            # "[]" (length 2) means no polarity was annotated.
            por_span = '' if len(polarity) == 2 else polarity_dict[polarity[1:-1]]
        except (ValueError, KeyError) as err:
            # Fail loudly with the offending sentence instead of continuing
            # with unbound or stale span values.
            raise ValueError(
                "malformed label {!r} for sentence line {!r}".format(label_i, text_line)
            ) from err
        span_words.append((sub_span, obj_span, asp_span, opi_span, por_span))
    return span_words


def _format_record(text_line, label_line, source_path):
    """Render one sentence line plus its accumulated label lines for output.

    ``label_line`` is the concatenation of ``'\n' + raw_line`` for every label
    line of the sentence, which is why individual labels are separated by a
    blank line (``'\n\n'``) after stripping.
    """
    sent = text_line.strip().split("\t")[0]
    label_list = label_line.strip().split('\n\n')
    span_words = process_label_line(label_list, source_path, text_line)
    return sent + "####" + '[' + ', '.join(str(span) for span in span_words) + "]\n"


def process_file(read_file, write_file):
    """Convert a COQE annotation file into ``sentence####[quintuples]`` lines.

    A line with exactly two tab-separated fields starts a new sentence; every
    other line is treated as a label line of the current sentence. Label lines
    that appear before the first sentence are discarded.

    Args:
        read_file: path of the annotation file to read. It is also what
            selects the token separator (see ``obtain_span_str``).
        write_file: path of the output file; it is overwritten.

    Raises:
        ValueError: if a label of some sentence is malformed.
    """
    # The data is Chinese text, so the encoding is pinned instead of relying
    # on the platform default.
    with open(read_file, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()

    with open(write_file, 'w', encoding='utf-8') as fw:
        text_line, label_line = '', ''
        for line_id in trange(len(raw_data), desc="procesing data ……"):
            cur_line = raw_data[line_id]
            if len(cur_line.split('\t')) != 2:
                label_line += '\n' + cur_line
                continue
            # A new sentence starts: flush the previous one, if any.
            if text_line != '':
                fw.write(_format_record(text_line, label_line, read_file))
            text_line, label_line = cur_line, ''
        # Flush the last sentence; nothing is written for an input with no
        # sentences.
        if text_line != '':
            fw.write(_format_record(text_line, label_line, read_file))


if __name__ == "__main__":
    generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
    write_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devs3.txt"
    process_file(generated_path, write_path)

http://www.dtcms.com/a/253761.html

相关文章:

  • ollama在win10中使用
  • 前端面试专栏-主流框架:10. React状态管理方案(Redux、Mobx、Zustand)
  • 错误监控----比如实现sentry一些思路
  • web和uniapp接入腾讯云直播
  • 腾讯云TCCA认证考试报名 - TDSQL数据库交付运维工程师(MySQL版)
  • Matlab学习笔记
  • 解决idea无法正常加载lombok包
  • CTF解题:[NSSCTF 2022 Spring Recruit]弱类型比较绕过
  • TikTok 矩阵如何快速涨粉
  • MySQL存储引擎深度解析:InnoDB、MyISAM、MEMORY 与 ARCHIVE 的全面对比与选型建议
  • YOLOv11改进系列---Conv篇---2024最新深度可分卷积与多尺度卷积结合的模块MSCB助力yolov11有效涨点
  • 微信中 qrcode 生成二维码长按无效果的解决方案
  • python函数(II)
  • Jira 需求处理全流程解析:从入门到实践
  • google ADK Agent间传参数
  • 利用cpolar实现Talebook数字图书馆的实时访问
  • 最新期刊影响因子,基本包含全部期刊
  • Linux内核编译、安装与回退完全指南:从配置到安全回滚
  • 【论文阅读笔记】《CodeS: Towards Building Open-source Language Models for Text-to-SQL 》
  • 【图像处理基石】什么是EIS和OIS?
  • Vue3 + TypeScript合并两个列表到目标列表,并且进行排序,数组合并、集合合并、列表合并、list合并
  • 力扣-416.分割等和子集
  • ArkUI-X跨平台技术落地-华为运动健康(二)
  • k8s中pod有哪些状态?
  • python学智能算法(十二)|机器学习朴素贝叶斯方法初步-拉普拉斯平滑计算条件概率
  • 深度学习:人工神经网络之参数初始化和神经网络搭建
  • Transformer-BiGRU、Transformer、CNN-BiGRU、BiGRU、CNN五模型多变量时序预测
  • 深入ZGC并发处理的原理
  • docker中部署gitlab
  • 实时中值滤波 + 低通滤波 示例程序(STM32环境)