
5 - TF·IDF Keyword Algorithm

Introduction
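TF·IDF scores a word by multiplying its term frequency (how often it appears within one document) by its inverse document frequency (how rare it is across the whole corpus). The implementation below uses the variant tf-idf = tf * log(D / (df + 1)), where D is the number of documents and df is the number of documents containing the word. A minimal worked example with hypothetical counts:

import math

# Hypothetical counts: the word appears 3 times in a 100-word document,
# and occurs in 5 out of the 50 documents in the corpus.
tf = 3 / 100                      # term frequency within the document
idf = math.log(50 / (5 + 1))      # inverse document frequency, log(D / (df + 1))
print(tf * idf)                   # roughly 0.0636, the word's tf-idf weight in this document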


import jieba
import math
import os
import json
from collections import defaultdict

"""
Computing and using tf-idf
"""

# Count tf and idf values
def build_tf_idf_dict(corpus):
    tf_dict = defaultdict(dict)  # key: document index, value: dict of how often each word occurs in that document
    idf_dict = defaultdict(set)  # key: word, value: set of document indexes, used to count how many documents contain the word
    for doc_index, text_words in enumerate(corpus):
        for word in text_words:
            if word not in tf_dict[doc_index]:
                tf_dict[doc_index][word] = 0
            tf_dict[doc_index][word] += 1
            idf_dict[word].add(doc_index)
    idf_dict = dict([(key, len(value)) for key, value in idf_dict.items()])
    return tf_dict, idf_dict

# Compute tf-idf from the tf and idf values
def calculate_tf_idf(tf_dict, idf_dict):
    tf_idf_dict = defaultdict(dict)
    for doc_index, word_tf_count_dict in tf_dict.items():
        for word, tf_count in word_tf_count_dict.items():
            tf = tf_count / sum(word_tf_count_dict.values())
            # tf-idf = tf * log(D / (df + 1))
            tf_idf_dict[doc_index][word] = tf * math.log(len(tf_dict) / (idf_dict[word] + 1))
    return tf_idf_dict

# Input corpus: a list of strings
# ["xxxxxxxxx", "xxxxxxxxxxxxxxxx", "xxxxxxxx"]
def calculate_tfidf(corpus):
    # Segment the texts first
    corpus = [jieba.lcut(text) for text in corpus]
    tf_dict, idf_dict = build_tf_idf_dict(corpus)
    tf_idf_dict = calculate_tf_idf(tf_dict, idf_dict)
    return tf_idf_dict

# Show the top-k keywords of each document/category from the tf-idf dictionary
def tf_idf_topk(tfidf_dict, paths=[], top=10, print_word=True):
    topk_dict = {}
    for doc_index, text_tfidf_dict in tfidf_dict.items():
        word_list = sorted(text_tfidf_dict.items(), key=lambda x: x[1], reverse=True)
        topk_dict[doc_index] = word_list[:top]
        if print_word:
            print(doc_index, paths[doc_index])
            for i in range(top):
                print(word_list[i])
            print("----------")
    return topk_dict

def main():
    dir_path = r"category_corpus/"
    corpus = []
    paths = []
    for path in os.listdir(dir_path):
        path = os.path.join(dir_path, path)
        if path.endswith("txt"):
            corpus.append(open(path, encoding="utf8").read())
            paths.append(os.path.basename(path))
    tf_idf_dict = calculate_tfidf(corpus)
    tf_idf_topk(tf_idf_dict, paths)

if __name__ == "__main__":
    main()
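The scripts in the next section import this file as calculate_tfidf.py. As a quick sanity check it can also be run on a small in-memory corpus; a minimal sketch (the three example sentences are made up):

from calculate_tfidf import calculate_tfidf, tf_idf_topk

# Three tiny made-up "documents"; in practice these would be the category corpus files.
toy_corpus = [
    "今天天气很好，适合出门散步",
    "机器学习和深度学习都依赖大量数据",
    "深度学习模型的训练需要大量数据和算力",
]
tfidf_dict = calculate_tfidf(toy_corpus)                        # {doc_index: {word: tf-idf score}}
tf_idf_topk(tfidf_dict, paths=["doc0", "doc1", "doc2"], top=3)  # print each document's top-3 words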
Applications of TF·IDF


import jieba
import math
import os
import json
from calculate_tfidf import calculate_tfidf, tf_idf_topk
"""
A simple search engine based on tf-idf
"""
jieba.initialize()
# Load the documents (think of them as web pages) and compute a tf-idf dictionary for each page
def load_data(file_path):
    corpus = []
    with open(file_path, encoding="utf8") as f:
        documents = json.loads(f.read())
        for document in documents:
            corpus.append(document["title"] + "\n" + document["content"])
    tf_idf_dict = calculate_tfidf(corpus)
    return tf_idf_dict, corpus

def search_engine(query, tf_idf_dict, corpus, top=3):
    query_words = jieba.lcut(query)
    res = []
    for doc_id, tf_idf in tf_idf_dict.items():
        score = 0
        for word in query_words:
            score += tf_idf.get(word, 0)
        res.append([doc_id, score])
    res = sorted(res, reverse=True, key=lambda x: x[1])
    for i in range(top):
        doc_id = res[i][0]
        print(corpus[doc_id])
        print("--------------")
    return res

if __name__ == "__main__":
    path = "news.json"
    tf_idf_dict, corpus = load_data(path)
    while True:
        query = input("请输入您要搜索的内容:")
        search_engine(query, tf_idf_dict, corpus)
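The news.json file is not included with the post, but the same functions work on any list of strings. A minimal sketch on a made-up three-document corpus, assuming the script above has been saved as search_engine.py (a hypothetical file name):

from calculate_tfidf import calculate_tfidf
from search_engine import search_engine   # hypothetical module name for the script above

# Three made-up "news pages" in the same title\ncontent format used by load_data.
toy_corpus = [
    "央行宣布降息\n央行今日宣布下调贷款市场报价利率，市场反应积极",
    "国足公布集训名单\n国家队公布了新一期世预赛集训名单，多名新人入选",
    "新款旗舰手机发布\n某品牌今日发布新一代旗舰手机，主打影像能力",
]
tf_idf_dict = calculate_tfidf(toy_corpus)
search_engine("降息 利率", tf_idf_dict, toy_corpus, top=1)   # prints the best-matching page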


import jieba
import math
import os
import random
import re
import json
from collections import defaultdict
from calculate_tfidf import calculate_tfidf, tf_idf_topk
"""
Simple text summarization based on tf-idf
"""
jieba.initialize()

# Load the documents (think of them as web pages) and compute a tf-idf dictionary for each page
def load_data(file_path):
    corpus = []
    with open(file_path, encoding="utf8") as f:
        documents = json.loads(f.read())
        for document in documents:
            assert "\n" not in document["title"]
            assert "\n" not in document["content"]
            corpus.append(document["title"] + "\n" + document["content"])
    tf_idf_dict = calculate_tfidf(corpus)
    return tf_idf_dict, corpus

# Compute the abstract of a single article
# Input: the article's tf-idf dictionary and the article text
# top is the number of sentences to select
# Articles with very short bodies are skipped, since summarizing them is of little value
def generate_document_abstract(document_tf_idf, document, top=3):
    sentences = re.split("？|！|。", document)
    # Skip articles with five sentences or fewer
    if len(sentences) <= 5:
        return None
    result = []
    for index, sentence in enumerate(sentences):
        sentence_score = 0
        words = jieba.lcut(sentence)
        for word in words:
            sentence_score += document_tf_idf.get(word, 0)
        sentence_score /= (len(words) + 1)
        result.append([sentence_score, index])
    result = sorted(result, key=lambda x: x[0], reverse=True)
    # The highest-scoring sentences may be, say, the 10th, 6th and 3rd;
    # reorder the selected sentences by their position in the text (3, 6, 10) so the abstract reads naturally
    important_sentence_indexs = sorted([x[1] for x in result[:top]])
    return "。".join([sentences[index] for index in important_sentence_indexs])

# Generate abstracts for all articles
def generate_abstract(tf_idf_dict, corpus):
    res = []
    for index, document_tf_idf in tf_idf_dict.items():
        title, content = corpus[index].split("\n")
        abstract = generate_document_abstract(document_tf_idf, content)
        if abstract is None:
            continue
        corpus[index] += "\n" + abstract
        res.append({"标题": title, "正文": content, "摘要": abstract})
    return res

if __name__ == "__main__":
    path = "news.json"
    tf_idf_dict, corpus = load_data(path)
    res = generate_abstract(tf_idf_dict, corpus)
    writer = open("abstract.json", "w", encoding="utf8")
    writer.write(json.dumps(res, ensure_ascii=False, indent=2))
    writer.close()
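As with the search engine, the pipeline can be exercised without news.json. A minimal sketch on a made-up corpus, assuming the script above has been saved as text_abstract.py (a hypothetical file name); only the first article has enough sentences to receive an abstract:

from calculate_tfidf import calculate_tfidf
from text_abstract import generate_abstract   # hypothetical module name for the script above

# Made-up articles in the title\ncontent format expected by the code above.
toy_corpus = [
    "数据与深度学习\n深度学习需要大量数据。数据是模型训练的基础。没有足够的数据，模型难以学到规律。数据的质量同样重要。清洗与标注会消耗大量时间。合理的数据策略能提升最终效果。",
    "股市行情\n今天股市大涨。投资者情绪乐观。成交量明显放大。",
    "周末出游\n周末适合短途旅行。山区空气清新。记得提前预订住宿。",
]
tf_idf_dict = calculate_tfidf(toy_corpus)
for item in generate_abstract(tf_idf_dict, toy_corpus):
    print(item["标题"], "->", item["摘要"])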


#coding:utf8
import jieba
import math
import os
import json
from collections import defaultdict
from calculate_tfidf import calculate_tfidf, tf_idf_topk
"""
Text similarity computation based on tf-idf
"""
jieba.initialize()

# Load the documents (think of them as web pages) and compute a tf-idf dictionary for each page
# Then take each document's highest-scoring words (top 5 here) to build an important-word vocabulary
# The vocabulary is used afterwards to vectorize the texts
def load_data(file_path):
    corpus = []
    with open(file_path, encoding="utf8") as f:
        documents = json.loads(f.read())
        for document in documents:
            corpus.append(document["title"] + "\n" + document["content"])
    tf_idf_dict = calculate_tfidf(corpus)
    topk_words = tf_idf_topk(tf_idf_dict, top=5, print_word=False)
    vocab = set()
    for words in topk_words.values():
        for word, score in words:
            vocab.add(word)
    print("关键词词表大小:", len(vocab))
    return tf_idf_dict, list(vocab), corpus

# passage is a text string
# vocab is a list of words
# Vectorization: for each important word, compute its frequency in the passage
def doc_to_vec(passage, vocab):
    vector = [0] * len(vocab)
    passage_words = jieba.lcut(passage)
    for index, word in enumerate(vocab):
        vector[index] = passage_words.count(word) / len(passage_words)
    return vector

# Compute the vectors of all documents in advance
def calculate_corpus_vectors(corpus, vocab):
    corpus_vectors = [doc_to_vec(c, vocab) for c in corpus]
    return corpus_vectors

# Cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    x_dot_y = sum([x * y for x, y in zip(vector1, vector2)])
    sqrt_x = math.sqrt(sum([x ** 2 for x in vector1]))
    sqrt_y = math.sqrt(sum([x ** 2 for x in vector2]))
    if sqrt_x == 0 or sqrt_y == 0:
        return 0
    return x_dot_y / (sqrt_x * sqrt_y + 1e-7)

# Given an input text, find the most similar documents
def search_most_similar_document(passage, corpus_vectors, vocab):
    input_vec = doc_to_vec(passage, vocab)
    result = []
    for doc_index, vector in enumerate(corpus_vectors):
        score = cosine_similarity(input_vec, vector)
        result.append([doc_index, score])
    result = sorted(result, reverse=True, key=lambda x: x[1])
    return result[:4]

if __name__ == "__main__":
    path = "news.json"
    tf_idf_dict, vocab, corpus = load_data(path)
    corpus_vectors = calculate_corpus_vectors(corpus, vocab)
    passage = "魔兽争霸"
    for doc_index, score in search_most_similar_document(passage, corpus_vectors, vocab):
        print("相似文章:\n", corpus[doc_index].strip())
        print("得分:", score)
        print("--------------")
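Since cosine similarity only compares the direction of the frequency vectors, documents of very different lengths can still be matched. A small sanity check, reusing the cosine_similarity function defined above (the vectors are made up):

# Three toy frequency vectors over a 3-word vocabulary.
v1 = [0.2, 0.0, 0.1]
v2 = [0.4, 0.0, 0.2]   # same direction as v1, twice the length
v3 = [0.0, 0.3, 0.0]   # shares no words with v1
print(cosine_similarity(v1, v2))   # close to 1.0: identical word distribution
print(cosine_similarity(v1, v3))   # 0.0: no overlapping words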
Advantages of TF·IDF
  • 1. Good interpretability
    The extracted keywords are easy to inspect
    Even when a prediction is wrong, the cause is easy to trace
  • 2. Fast to compute
    Word segmentation accounts for most of the time; the rest is simple counting
  • 3. Little dependence on labeled data
    Part of the work can be done with unlabeled corpora
  • 4. Combines well with many other algorithms
    The scores can be treated as word weights
Disadvantages of TF·IDF

1. Heavily affected by the quality of word segmentation
2. No notion of semantic similarity between words
3. No word-order information (it is a bag-of-words model)
4. Limited in scope; it cannot handle complex tasks such as machine translation or entity mining
5. Results are strongly affected by class imbalance
6. The distribution of samples within a class is not taken into account

