Python —— 真题九
问题1. 数据统计。要求:分别统计两个文件中出现次数最多的10个词语,作为主题词,要求词语不少于2个字符,打印输出在屏幕上,输出示例如下:(示例词语非答案)
2019:改革:10,企业:9,...(略),深化:2
2018:改革:11,效益:7,...(略),深化:1
注意:输出格式采用英文冒号和英文逗号,标点符号前后无空格,各词语间用逗号分隔,最后一个词语后无逗号。
import jieba
# Problem 1, method 1 (plain script): for each year's file, count every
# jieba-segmented word that is at least two characters long and print the
# ten most frequent ones as "word:count" pairs on a single line.
# NOTE: open() is called without an explicit encoding, so the platform
# default applies; pass encoding=... if the data files are stored otherwise.

# ---- 2019 ----
# The with-statement closes the file as soon as its lines have been read.
with open(r'C:\Users\Administrator\Desktop\data2019.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
# Sort the (word, count) pairs by count, highest first.
L.sort(key=lambda x:x[1],reverse = True)
# Slice rather than index over range(10), so a file with fewer than ten
# distinct words cannot raise IndexError.
List = ['{}:{}'.format(word, count) for word, count in L[:10]]
print('2019:'+','.join(List))

# ---- 2018 ----
with open(r'C:\Users\Administrator\Desktop\data2018.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
L.sort(key=lambda x:x[1],reverse = True)
List = ['{}:{}'.format(word, count) for word, count in L[:10]]
print('2018:'+','.join(List))
法二:函数法
import jieba
def file(data):
    """Print the ten most frequent words of one data file.

    Reads ``data{}.txt`` (with ``data`` substituted into the name) from the
    Administrator desktop, segments every line with ``jieba.lcut``, counts the
    words that are at least two characters long and prints one line of the
    form ``<data>:word:count,word:count,...``.

    Args:
        data: Value inserted into the file name and used as the output label
            (the script calls this with 2019 and 2018).

    Returns:
        None. The result is written to standard output.

    Errors raised by ``open`` (for example a missing file) propagate to the
    caller.
    """
    # NOTE: no explicit encoding is passed, so the platform default applies.
    # The with-statement closes the file once its lines have been read.
    with open(r'C:\Users\Administrator\Desktop\data{}.txt'.format(data),'r') as f:
        lines = f.readlines()
    d = {}
    for line in lines:
        for i in jieba.lcut(line):
            if len(i) >= 2:
                d[i] = d.get(i,0) + 1
    L = list(d.items())
    # Sort the (word, count) pairs by count, highest first.
    L.sort(key=lambda x:x[1],reverse = True)
    # Slice rather than index over range(10), so fewer than ten distinct
    # words cannot raise IndexError.
    List = ['{}:{}'.format(word, count) for word, count in L[:10]]
    print('{}:'.format(data)+','.join(List))


file(2019)
file(2018)
法三
import jieba
def file(data):
    """Print the ten most frequent words of one data file, item by item.

    Reads ``data{}.txt`` (with ``data`` substituted into the name) from the
    Administrator desktop, segments every line with ``jieba.lcut`` and counts
    the words that are at least two characters long. The output line
    ``<data>:word:count,word:count,...`` is produced with successive
    ``print`` calls whose ``end`` argument supplies the separating commas.

    Args:
        data: Value inserted into the file name and used as the output label
            (the script calls this with 2019 and 2018).

    Returns:
        None. The result is written to standard output.

    Errors raised by ``open`` (for example a missing file) propagate to the
    caller.
    """
    # NOTE: no explicit encoding is passed, so the platform default applies.
    # The with-statement closes the file once its lines have been read.
    with open(r'C:\Users\Administrator\Desktop\data{}.txt'.format(data),'r') as f:
        lines = f.readlines()
    d = {}
    for line in lines:
        for i in jieba.lcut(line):
            if len(i) >= 2:
                d[i] = d.get(i,0) + 1
    L = list(d.items())
    # Sort the (word, count) pairs by count, highest first.
    L.sort(key=lambda x:x[1],reverse = True)
    # Slice so that fewer than ten distinct words cannot raise IndexError.
    top = L[:10]
    print(str(data)+':',end='')
    if not top:
        # Nothing to list: still terminate the output line.
        print()
    for k, (word, count) in enumerate(top):
        if k < len(top) - 1:
            # Every item except the last is followed by a comma.
            print('{}:{}'.format(word, count),end=',')
        else:
            # The last item ends the line instead of adding a comma.
            print('{}:{}'.format(word, count))


file(2019)
file(2018)
法四:字符串切片
import jieba
def file(data):
    """Print the ten most frequent words of one data file via string slicing.

    Reads ``data{}.txt`` (with ``data`` substituted into the name) from the
    Administrator desktop, segments every line with ``jieba.lcut`` and counts
    the words that are at least two characters long. Each ``word:count`` pair
    is appended to a string together with a trailing comma, and the final
    comma is removed with the slice ``[:-1]`` before printing.

    Args:
        data: Value inserted into the file name and used as the output label
            (the script calls this with 2019 and 2018).

    Returns:
        None. The result is written to standard output.

    Errors raised by ``open`` (for example a missing file) propagate to the
    caller.
    """
    # NOTE: no explicit encoding is passed, so the platform default applies.
    # The with-statement closes the file once its lines have been read.
    with open(r'C:\Users\Administrator\Desktop\data{}.txt'.format(data),'r') as f:
        lines = f.readlines()
    d = {}
    for line in lines:
        for i in jieba.lcut(line):
            if len(i) >= 2:
                d[i] = d.get(i,0) + 1
    L = list(d.items())
    # Sort the (word, count) pairs by count, highest first.
    L.sort(key=lambda x:x[1],reverse = True)
    print(str(data)+':',end='')
    result = ""
    # Slice rather than index over range(10), so fewer than ten distinct
    # words cannot raise IndexError.
    for word, count in L[:10]:
        result += '{}:{}'.format(word, count) + ','
    # Drop the trailing comma left by the loop (an empty string stays empty).
    print(result[:-1])


file(2019)
file(2018)
问题2. 数据关联。要求:对比两组主题词的差异,输出两组的共有词语和各自的特有词语。输出示例如下:(示例词语非答案)
共有词语:改革,...(略),深化
2019特有:企业,...(略),加强
2018特有:效益,...(略),创新
注意:输出格式采用英文冒号和英文逗号,标点符号前后无空格,各词语间用逗号分隔,最后一个词语后无逗号。
import jieba
# Problem 2, method 1 (list removal): build the ten most frequent words of
# each year, collect the words that appear in both lists, then remove those
# from each list so that only the year-specific words remain.
# NOTE: open() is called without an explicit encoding, so the platform
# default applies; pass encoding=... if the data files are stored otherwise.

# ---- 2019 topic words ----
with open(r'C:\Users\Administrator\Desktop\data2019.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
# Sort the (word, count) pairs by count, highest first.
L.sort(key=lambda x:x[1],reverse = True)
# Slice rather than index over range(10), so fewer than ten distinct words
# cannot raise IndexError.
L_2019 = [word for word, _ in L[:10]]

# ---- 2018 topic words ----
with open(r'C:\Users\Administrator\Desktop\data2018.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
L.sort(key=lambda x:x[1],reverse = True)
L_2018 = [word for word, _ in L[:10]]

# Words common to both years, in 2019 rank order.
L_C = [m for m in L_2019 if m in L_2018]
# Year-specific words: remove every common word from both lists.
for n in L_C:
    L_2019.remove(n)
    L_2018.remove(n)
# NOTE(review): the labels below use a full-width colon, as in the sample
# output; the task note asks for ASCII punctuation - confirm which is expected.
print('共有词语:'+','.join(L_C))
print('2019特有:{}'.format(','.join(L_2019)))
print('2018特有:{}'.format(','.join(L_2018)))
法二:set集合
import jieba
# Problem 2, method 2 (sets): build the ten most frequent words of each
# year, then use set intersection and difference to obtain the common and
# the year-specific words.
# NOTE: open() is called without an explicit encoding, so the platform
# default applies; pass encoding=... if the data files are stored otherwise.

# ---- 2019 topic words ----
with open(r'C:\Users\Administrator\Desktop\data2019.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
# Sort the (word, count) pairs by count, highest first.
L.sort(key=lambda x:x[1],reverse = True)
# Slice rather than index over range(10), so fewer than ten distinct words
# cannot raise IndexError.
L_2019 = [word for word, _ in L[:10]]

# ---- 2018 topic words ----
with open(r'C:\Users\Administrator\Desktop\data2018.txt','r') as f:
    lines = f.readlines()
d = {}
for line in lines:
    for i in jieba.lcut(line):
        if len(i) >= 2:
            d[i] = d.get(i,0) + 1
L = list(d.items())
L.sort(key=lambda x:x[1],reverse = True)
L_2018 = [word for word, _ in L[:10]]

# Sets have no defined iteration order, so each result is put back into the
# frequency-rank order of its source list to keep the output reproducible.
# NOTE(review): the labels below use a full-width colon, as in the sample
# output; the task note asks for ASCII punctuation - confirm which is expected.
print('共有词语:'+ ','.join(sorted(set(L_2019) & set(L_2018), key=L_2019.index)))
print('2019特有:'+','.join(sorted(set(L_2019) - set(L_2018), key=L_2019.index)))
print('2018特有:'+','.join(sorted(set(L_2018) - set(L_2019), key=L_2018.index)))