当前位置: 首页 > news >正文

Python 实现将 COQE 数据转换成字符串格式

问题描述:

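原始数据存储在一个 .txt 文件中(原文此处的示例图未能保留)。根据下面的代码,其存储格式为:每条样本先是一行以制表符(\t)分成两列的句子行,随后是若干行五元组标注,每行形如 `[[6&索 7&尼];[];[10&性 11&能];[13&好];[1]]`(示意),五个字段以分号分隔,依次为 sub、obj、asp、opi 和极性;极性取值 1、0、-1、2 分别对应 Better、Equal、Worse、Different。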

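如何将按照上面格式存储的内容,修改成下面的格式(方便 DiaCOQE 处理):每条样本输出一行,形如 `句子####[('sub', 'obj', 'asp', 'opi', 'Better'), ...]`,即句子与五元组列表之间用 `####` 连接,每个五元组中的词已去掉位置下标并拼接成字符串。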

问题解决:

from pdb import set_trace as stop
import os
import re
from tqdm import trange
generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
Unicoqe_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devss.txt"

# Read the whole annotation file up front; the data is Chinese text, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(generated_path, 'r', encoding='utf-8') as f:
    raw_data = f.readlines()

# Maps the polarity code stored in the last field of a label to its name.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str, cur_path):
    """Return the words of one bracketed span field joined without spaces.

    ``cur_str`` looks like ``[6&w1 7&w2]``: whitespace-separated
    ``index<sep>word`` tokens inside one pair of brackets.  The separator is
    ``&&`` when ``'Camera'`` occurs in ``cur_path`` and ``&`` otherwise.
    Tokens that do not contain the separator contribute nothing.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    words = []
    for pair in cur_str.strip()[1:-1].split():
        # partition() cuts at the first separator only, so a word that itself
        # contains the separator no longer breaks the (index, word) split.
        _, _, word = pair.partition(split_sign)
        words.append(word)
    return ''.join(words)


def _label_to_quintuple(label, text_line):
    """Turn one ``[[sub];[obj];[asp];[opi];[polarity]]`` label into a 5-tuple."""
    fields = label[1:-1].split(';')
    if len(fields) < 5:
        raise ValueError(
            "expected 5 ';'-separated fields in label {!r} (sentence line {!r})"
            .format(label, text_line)
        )
    sub, obj, asp, opi, polarity = fields[:5]
    if len(polarity) == 2:
        # "[]" means the label carries no polarity.
        por_span = ''
    else:
        try:
            por_span = polarity_dict[polarity[1:-1]]
        except KeyError:
            # Fail loudly with context instead of opening a debugger and then
            # continuing with values left over from the previous label.
            raise ValueError(
                "unknown polarity {!r} in label {!r} (sentence line {!r})"
                .format(polarity, label, text_line)
            )
    return (
        obtain_span_str(sub, generated_path),
        obtain_span_str(obj, generated_path),
        obtain_span_str(asp, generated_path),
        obtain_span_str(opi, generated_path),
        por_span,
    )


def _write_record(fw, text_line, label_lines):
    """Write ``sentence####[quintuples]`` for one sentence and its labels."""
    sent = text_line.strip().split("\t")[0]
    span_words = [
        _label_to_quintuple(label.strip(), text_line)
        for label in label_lines
        if label.strip()  # blank lines carry no label
    ]
    fw.write(sent + "####")
    fw.write('[' + ', '.join(str(span) for span in span_words) + "]\n")


with open(Unicoqe_path, 'w', encoding='utf-8') as fw:
    text_line, label_lines = '', []
    for line_id in trange(len(raw_data), desc="procesing data ……"):
        cur_line = raw_data[line_id]
        if len(cur_line.split('\t')) != 2:
            # Anything that is not a two-column sentence line is a label line
            # belonging to the most recent sentence.
            label_lines.append(cur_line)
        else:
            if text_line != '':
                _write_record(fw, text_line, label_lines)
            text_line, label_lines = cur_line, []
    # Flush the last sentence; skipped when the input held no sentence at all.
    if text_line != '':
        _write_record(fw, text_line, label_lines)

优化版代码如下:

import os
import re
from tqdm import trange
from pdb import set_trace as stop  # kept for ad-hoc debugging; not called by the code below

# Maps the polarity code stored in the last field of a label to its name.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str, cur_path):
    """Return the words of one bracketed span field joined without spaces.

    Args:
        cur_str: One field such as ``[6&w1 7&w2]``: whitespace-separated
            ``index<sep>word`` tokens inside one pair of brackets.
        cur_path: Path of the data file; the separator is ``&&`` when
            ``'Camera'`` occurs in it and ``&`` otherwise.

    Returns:
        The concatenated words.  Tokens without the separator contribute
        nothing, and ``[]`` yields the empty string.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    words = []
    for pair in cur_str.strip()[1:-1].split():
        # partition() cuts at the first separator only, so a word that itself
        # contains the separator no longer breaks the (index, word) split.
        _, _, word = pair.partition(split_sign)
        words.append(word)
    return ''.join(words)


def process_label_line(label_list, generated_path, text_line):
    """Convert raw label strings into ``(sub, obj, asp, opi, polarity)`` tuples.

    Args:
        label_list: Label strings shaped like
            ``[[sub];[obj];[asp];[opi];[polarity]]``.  Blank entries are skipped.
        generated_path: Data file path, forwarded to ``obtain_span_str`` to
            choose the token separator.
        text_line: The sentence line the labels belong to; used only in
            error messages.

    Returns:
        A list with one 5-tuple of strings per non-blank label.

    Raises:
        ValueError: If a label does not have exactly five ``;``-separated
            fields or its polarity code is not in ``polarity_dict``.
    """
    span_words = []
    for label_i in label_list:
        label = label_i.strip()
        if not label:
            continue
        fields = label[1:-1].split(';')
        if len(fields) != 5:
            raise ValueError(
                "expected 5 ';'-separated fields in label {!r} (sentence line {!r})"
                .format(label, text_line)
            )
        sub, obj, asp, opi, polarity = fields
        if len(polarity) == 2:
            # "[]" means the label carries no polarity.
            por_span = ''
        else:
            try:
                por_span = polarity_dict[polarity[1:-1]]
            except KeyError:
                # Fail loudly with context instead of opening a debugger and
                # then continuing with unbound or stale span values.
                raise ValueError(
                    "unknown polarity {!r} in label {!r} (sentence line {!r})"
                    .format(polarity, label, text_line)
                )
        span_words.append((
            obtain_span_str(sub, generated_path),
            obtain_span_str(obj, generated_path),
            obtain_span_str(asp, generated_path),
            obtain_span_str(opi, generated_path),
            por_span,
        ))
    return span_words


def _format_record(text_line, label_lines, source_path):
    """Build the ``sentence####[quintuples]`` output line for one sentence."""
    sent = text_line.strip().split("\t")[0]
    span_words = process_label_line(label_lines, source_path, text_line)
    return sent + "####" + '[' + ', '.join(str(span) for span in span_words) + "]\n"


def _convert_lines(lines, source_path):
    """Group label lines under their sentence line and yield output lines."""
    text_line, label_lines = '', []
    for cur_line in lines:
        if len(cur_line.split('\t')) != 2:
            # Anything that is not a two-column sentence line is a label line
            # belonging to the most recent sentence.
            label_lines.append(cur_line)
        else:
            if text_line != '':
                yield _format_record(text_line, label_lines, source_path)
            text_line, label_lines = cur_line, []
    # Flush the last sentence; skipped when the input held no sentence at all.
    if text_line != '':
        yield _format_record(text_line, label_lines, source_path)


def process_file(read_file, write_file):
    """Convert the COQE annotation file ``read_file`` into ``write_file``.

    Each sentence of the input becomes one ``sentence####[quintuples]`` line.
    The token separator is chosen from ``read_file`` itself, so the function
    no longer depends on a ``generated_path`` global defined by the caller.
    Both files are opened as UTF-8.
    """
    with open(read_file, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()
    with open(write_file, 'w', encoding='utf-8') as fw:
        progress = trange(len(raw_data), desc="procesing data ……")
        fw.writelines(
            _convert_lines((raw_data[line_id] for line_id in progress), read_file)
        )


if __name__ == "__main__":
    generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
    write_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devs3.txt"
    process_file(generated_path, write_path)

相关文章:

  • ollama在win10中使用
  • 前端面试专栏-主流框架:10. React状态管理方案(Redux、Mobx、Zustand)
  • 错误监控----比如实现sentry一些思路
  • web和uniapp接入腾讯云直播
  • 腾讯云TCCA认证考试报名 - TDSQL数据库交付运维工程师(MySQL版)
  • Matlab学习笔记
  • 解决idea无法正常加载lombok包
  • CTF解题:[NSSCTF 2022 Spring Recruit]弱类型比较绕过
  • TikTok 矩阵如何快速涨粉
  • MySQL存储引擎深度解析:InnoDB、MyISAM、MEMORY 与 ARCHIVE 的全面对比与选型建议
  • YOLOv11改进系列---Conv篇---2024最新深度可分卷积与多尺度卷积结合的模块MSCB助力yolov11有效涨点
  • 微信中 qrcode 生成二维码长按无效果的解决方案
  • python函数(II)
  • Jira 需求处理全流程解析:从入门到实践
  • google ADK Agent间传参数
  • 利用cpolar实现Talebook数字图书馆的实时访问
  • 最新期刊影响因子,基本包含全部期刊
  • Linux内核编译、安装与回退完全指南:从配置到安全回滚
  • 【论文阅读笔记】《CodeS: Towards Building Open-source Language Models for Text-to-SQL 》
  • 【图像处理基石】什么是EIS和OIS?
  • 美女做丝袜广告视频网站/批量查询神马关键词排名
  • slim编辑器Wordpress/保定seo排名
  • 网站开发制作平台/做优化关键词
  • wordpress sinaapp/南京seo排名公司
  • 云南省网站开发/武汉seo网络优化公司
  • 做行业门户网站注意什么/老王搜索引擎入口