python实现将COQE数据转换成字符串的格式
问题描述:
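原始数据存储在一个 .txt 文件中。从下面的代码可以看出,存储格式如下:每条样本以一行“句子<TAB>第二个字段”开头(恰好两个以制表符分隔的字段),其后跟若干行五元组标注,每行形如:
[[下标&词 下标&词 ...];[...];[...];[...];[极性]]
其中五个方括号依次对应 sub、obj、asp、opi 和极性;极性取 1、0、-1、2 之一,也可以为空(即 []);路径中含有 Camera 的数据集使用 && 而不是 & 作为下标与词之间的分隔符。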
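如何将按照上面格式存储的内容,修改成下面的格式(方便 DiaCOQE 处理)?即每条样本输出一行:
句子####[('sub', 'obj', 'asp', 'opi', 'Better')]
其中极性 1、0、-1、2 分别被映射为 Better、Equal、Worse、Different,极性为空时输出空字符串。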
问题解决:
from pdb import set_trace as stop
import os
import re
from tqdm import trange
generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
Unicoqe_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devss.txt"

# Load the whole annotated file up front; the conversion below walks it by index.
# The encoding is explicit so the result does not depend on the platform default.
with open(generated_path, 'r', encoding='utf-8') as f:
    raw_data = list(f)

# Maps the raw polarity label found in an annotation to its textual name.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str, cur_path):
    """Return the words of one bracketed span concatenated into one string.

    Args:
        cur_str: A span such as ``[3&w1 4&w2]``: whitespace-separated
            ``index<sep>word`` tokens wrapped in one pair of brackets.
        cur_path: Path of the data file; when it contains ``Camera`` the
            separator is ``&&``, otherwise ``&``.

    Returns:
        The words joined without any separator (``''`` for an empty span).
        A token that does not contain the separator contributes nothing.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    tokens = cur_str.strip()[1:-1].split()
    # partition() splits on the FIRST separator only, so a word that itself
    # contains '&' is kept intact instead of breaking a two-value unpack.
    return ''.join(token.partition(split_sign)[2] for token in tokens)


def _labels_to_quintuples(label_lines, text_line):
    """Convert raw label lines into (sub, obj, asp, opi, polarity) tuples."""
    quintuples = []
    for raw_label in label_lines:
        raw_label = raw_label.strip()
        if not raw_label:
            # Blank lines between annotations carry no label.
            continue
        fields = raw_label[1:-1].split(';')
        if len(fields) < 5:
            raise ValueError(
                "malformed label {!r} for sentence {!r}".format(raw_label, text_line)
            )
        sub, obj, asp, opi, polarity = fields[:5]
        if len(polarity) == 2:
            # "[]" means no polarity was annotated.
            por_span = ''
        else:
            try:
                por_span = polarity_dict[polarity[1:-1]]
            except KeyError as err:
                raise ValueError(
                    "unknown polarity {!r} for sentence {!r}".format(polarity, text_line)
                ) from err
        quintuples.append((
            obtain_span_str(sub, generated_path),
            obtain_span_str(obj, generated_path),
            obtain_span_str(asp, generated_path),
            obtain_span_str(opi, generated_path),
            por_span,
        ))
    return quintuples


def _write_record(fw, text_line, label_lines):
    """Write one ``sentence####[quintuple, ...]`` line to *fw*."""
    sent = text_line.strip().split("\t")[0]
    span_words = _labels_to_quintuples(label_lines, text_line)
    fw.write(sent + "####")
    fw.write('[' + ', '.join(str(span) for span in span_words) + "]\n")


with open(Unicoqe_path, 'w', encoding='utf-8') as fw:
    text_line, label_lines = '', []
    for line_id in trange(len(raw_data), desc="procesing data ……"):
        cur_line = raw_data[line_id]
        if len(cur_line.split('\t')) != 2:
            # Anything that is not a two-field sentence line is a label line
            # belonging to the sentence currently being collected.
            label_lines.append(cur_line)
            continue
        if text_line:
            # A new sentence starts: flush the previous one first.
            _write_record(fw, text_line, label_lines)
        text_line, label_lines = cur_line, []
    if text_line:
        # Flush the last sentence; nothing is written for an empty input.
        _write_record(fw, text_line, label_lines)
优化版代码如下:
import os
import re
from tqdm import trange
from pdb import set_trace as stop  # kept importable for ad-hoc debugging sessions

# Maps the raw polarity label found in an annotation to its textual name.
polarity_dict = {'1': 'Better', '0': 'Equal', '-1': 'Worse', '2': 'Different'}


def obtain_span_str(cur_str, cur_path):
    """Return the words of one bracketed span concatenated into one string.

    Args:
        cur_str: A span such as ``[3&w1 4&w2]``: whitespace-separated
            ``index<sep>word`` tokens wrapped in one pair of brackets.
        cur_path: Path of the data file; when it contains ``Camera`` the
            separator is ``&&``, otherwise ``&``.

    Returns:
        The words joined without any separator (``''`` for an empty span).
        A token that does not contain the separator contributes nothing.
    """
    split_sign = '&&' if 'Camera' in cur_path else '&'
    tokens = cur_str.strip()[1:-1].split()
    # partition() splits on the FIRST separator only, so a word that itself
    # contains '&' is kept intact instead of breaking a two-value unpack.
    return ''.join(token.partition(split_sign)[2] for token in tokens)


def process_label_line(label_list, generated_path, text_line):
    """Convert raw label strings into (sub, obj, asp, opi, polarity) tuples.

    Args:
        label_list: Label strings, each shaped like
            ``[[...];[...];[...];[...];[polarity]]``. Blank entries are skipped.
        generated_path: Path of the data file, forwarded to
            ``obtain_span_str`` to select the token separator.
        text_line: The sentence line the labels belong to; only used to give
            context in error messages.

    Returns:
        A list of 5-tuples of strings, one per non-blank label.

    Raises:
        ValueError: If a label does not have exactly five ``;``-separated
            fields or its polarity is not a key of ``polarity_dict``.
    """
    span_words = []
    for label_i in label_list:
        label_i = label_i.strip()
        if not label_i:
            # A sentence without annotations yields no quintuple.
            continue
        fields = label_i[1:-1].split(';')
        if len(fields) != 5:
            raise ValueError(
                "malformed label {!r} for sentence {!r}".format(label_i, text_line)
            )
        sub, obj, asp, opi, polarity = fields
        if len(polarity) == 2:
            # "[]" means no polarity was annotated.
            por_span = ''
        else:
            try:
                por_span = polarity_dict[polarity[1:-1]]
            except KeyError as err:
                # Fail loudly with context instead of continuing with
                # undefined or stale span values.
                raise ValueError(
                    "unknown polarity {!r} for sentence {!r}".format(polarity, text_line)
                ) from err
        span_words.append((
            obtain_span_str(sub, generated_path),
            obtain_span_str(obj, generated_path),
            obtain_span_str(asp, generated_path),
            obtain_span_str(opi, generated_path),
            por_span,
        ))
    return span_words


def _format_record(text_line, label_lines, data_path):
    """Build one ``sentence####[quintuple, ...]`` output line."""
    sent = text_line.strip().split("\t")[0]
    span_words = process_label_line(label_lines, data_path, text_line)
    return sent + "####" + '[' + ', '.join(str(span) for span in span_words) + "]\n"


def process_file(read_file, write_file):
    """Convert the annotated file *read_file* and write the result to *write_file*.

    A line with exactly two tab-separated fields starts a new sentence; every
    other line is treated as a label line of the sentence collected so far.
    Each sentence is written as ``sentence####[(sub, obj, asp, opi, polarity), ...]``.
    Nothing is written when the input contains no sentence line.

    Args:
        read_file: Path of the input file; it also selects the token separator.
        write_file: Path of the output file (overwritten).
    """
    with open(read_file, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()

    with open(write_file, 'w', encoding='utf-8') as fw:
        text_line, label_lines = '', []
        for line_id in trange(len(raw_data), desc="procesing data ……"):
            cur_line = raw_data[line_id]
            if len(cur_line.split('\t')) != 2:
                label_lines.append(cur_line)
                continue
            if text_line:
                # A new sentence starts: flush the previous one first.
                # read_file (not a module-level global) selects the separator.
                fw.write(_format_record(text_line, label_lines, read_file))
            text_line, label_lines = cur_line, []
        if text_line:
            # Flush the last sentence of the file.
            fw.write(_format_record(text_line, label_lines, read_file))


if __name__ == "__main__":
    generated_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/dev.txt"
    write_path = "/home/qtxu/DiaCOQE/data/Ele-COQE/devs3.txt"
    process_file(generated_path, write_path)