
[Machine Learning] 10 - Classifying the Watermelon Dataset with the ID3 Decision Tree Algorithm


Information gain is used to select the best splitting feature, and the tree is built recursively until all samples at the current node belong to the same class or no candidate features remain, in which case the majority class is returned. Each test sample is then classified by recursively traversing the tree, and prediction accuracy is computed as the evaluation metric.
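For reference, the two quantities computed in the code below are the Shannon entropy of a sample set $D$ and the information gain of splitting $D$ on a feature $a$ whose values partition $D$ into subsets $D^v$:

$$\mathrm{Ent}(D) = -\sum_{k} p_k \log_2 p_k, \qquad \mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

where $p_k$ is the proportion of samples in $D$ belonging to class $k$. ID3 picks the feature with the largest $\mathrm{Gain}(D, a)$ at each node.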

Program code:

import math
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random

file_path = '西瓜数据集.xlsx'
label_list = []
data = pd.read_excel(file_path, skiprows=1, nrows=17)
data_dict = data.to_dict(orient='list')
data_list = list(zip(data['色泽'], data['根蒂'], data['敲声'], data['纹理'], data['脐部'], data['触感'], data['好瓜']))
train_set, test_set = train_test_split(data_list, test_size=2, random_state=random.randint(1, 1000))

print("data_list:", len(data_list), data_list)
print("train_list:", len(train_set), train_set)
print("test_list:", len(test_set), test_set)

keys = []
for index in data.keys():
    keys.append(index)
keys.remove('编号')
keys.remove('好瓜')
print('keys:', keys)

def calculate_entropy(data, label_index):
    # Shannon entropy of the class labels in `data` (base-2 log).
    labels = [entry[label_index] for entry in data]
    unique_labels = set(labels)
    entropy = 0
    for label in unique_labels:
        prob = labels.count(label) / len(labels)
        entropy -= prob * math.log(prob, 2)
    return entropy
'''
def calculate_feature_entropy(data, feature_index, label_index):
    feature_values = set([entry[feature_index] for entry in data])
    entropy = 0
    for value in feature_values:
        subset_data = [entry for entry in data if entry[feature_index] == value]
        prob = len(subset_data) / len(data)
        entropy += prob * calculate_entropy(subset_data, label_index)
    return abs(entropy)
'''
def choose_best_feature(data, features, label_index):
    # Pick the feature with the largest information gain over `data`.
    base_entropy = calculate_entropy(data, label_index)
    best_info_gain = 0.0
    best_feature = None
    for feature in features:
        new_entropy = 0.0
        # `features` holds column indices; features.index(feature) equals the true
        # column only while the full list [0, 1, 2, 3, 4, 5] is intact.
        values = set(data[i][features.index(feature)] for i in range(len(data)))
        for value in values:
            subset_data = [data[i] for i in range(len(data)) if data[i][features.index(feature)] == value]
            prob = len(subset_data) / float(len(data))
            new_entropy += prob * calculate_entropy(subset_data, label_index)
        info_gain = base_entropy - new_entropy
        print(f"Feature: {feature}, Info Gain: {info_gain}")
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = feature
    return best_feature

def id3_decision_tree(data, features, label_index):
    # All samples share one class: return that class as a leaf.
    if len(set(data[i][label_index] for i in range(len(data)))) == 1:
        return data[0][label_index]
    # No candidate features left: return a fallback class.
    # Note: data.count(x) counts whole rows (tuples), not labels, so the choice
    # among the remaining classes is effectively arbitrary.
    if len(features) == 0:
        max_class = max(set(entry[label_index] for entry in data), key=lambda x: data.count(x))
        return max_class
    root_feature = choose_best_feature(data, features, label_index)
    # No feature yields positive information gain: same fallback as above.
    if root_feature is None:
        max_class = max(set(entry[label_index] for entry in data), key=lambda x: data.count(x))
        return max_class
    print(root_feature)  # the chosen column index (the bare numbers in the output)
    tree = {root_feature: {}}
    # Split on each observed value of the chosen feature and recurse on the subset.
    for value in set(data[i][features.index(root_feature)] for i in range(len(data))):
        subset_data = [data[i] for i in range(len(data)) if data[i][features.index(root_feature)] == value]
        subset_features = [feat for feat in features if feat != root_feature]
        tree[root_feature][value] = id3_decision_tree(subset_data, subset_features, label_index)
    return tree

# Test
#'色泽','根蒂','敲声','纹理','脐部','触感'
selected_features = [0,1,2,3,4,5]
label_index = 6
decision_tree = id3_decision_tree(train_set, selected_features, label_index)
#print(decision_tree)
print(json.dumps(decision_tree, indent=2, ensure_ascii=False))

def predict(tree, sample):
    # Walk the tree: each internal node is {feature_index: {value: subtree}}.
    if isinstance(tree, dict):
        feature, subtree = next(iter(tree.items()))
        value = sample[feature]
        if value in subtree:
            return predict(subtree[value], sample)
        # An unseen feature value falls through and yields None.
    else:
        return tree  # leaf: class label

def evaluate(tree, test_set, label_index):
    correct_predictions = 0
    total_samples = len(test_set)
    for sample in test_set:
        prediction = predict(tree, sample)
        if prediction == sample[label_index]:
            correct_predictions += 1
    accuracy = correct_predictions / total_samples
    return accuracy

test_accuracy = evaluate(decision_tree, test_set, label_index)
print("Test Accuracy: {:.2%}".format(test_accuracy))

Output:

data_list: 17 [('青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'), ('乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'), ('浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'), ('乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'), ('乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是'), ('乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否'), ('青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'), ('浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'), ('浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'), ('青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'), ('浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'), ('乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'), ('浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'), ('青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否')]
train_list: 15 [('青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否'), ('青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'), ('乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'), ('乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是'), ('乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'), ('乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'), ('乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'), ('青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'), ('浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'), ('浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'), ('青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'), ('浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'), ('乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否')]
test_list: 2 [('青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'), ('浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否')]
keys: ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
Feature: 0, Info Gain: 0.0894822670191825
Feature: 1, Info Gain: 0.19400203331720878
Feature: 2, Info Gain: 0.24045976745187625
Feature: 3, Info Gain: 0.3219280948873624
Feature: 4, Info Gain: 0.3367826633222949
Feature: 5, Info Gain: 0.02584103752696809
4
Feature: 0, Info Gain: 0.3166890883150208
Feature: 1, Info Gain: 0.6500224216483541
Feature: 2, Info Gain: 0.19087450462110933
Feature: 3, Info Gain: 0.6500224216483541
Feature: 5, Info Gain: 0.0
1
Feature: 0, Info Gain: 0.0
Feature: 1, Info Gain: 0.19087450462110944
Feature: 2, Info Gain: 0.4591479170272448
Feature: 3, Info Gain: 0.08170416594551044
Feature: 5, Info Gain: 0.0
2
Feature: 0, Info Gain: 0.12255624891826566
Feature: 1, Info Gain: 0.0
Feature: 3, Info Gain: 0.0
Feature: 5, Info Gain: 0.12255624891826566
0
Feature: 1, Info Gain: 0.0
Feature: 3, Info Gain: 0.0
Feature: 5, Info Gain: 0.0
{
"4": {
"凹陷": {
"1": {
"稍蜷": "否",
"蜷缩": "是"
}
},
"稍凹": {
"2": {
"浊响": {
"0": {
"乌黑": "否",
"青绿": "是"
}
},
"沉闷": "否"
}
},
"平坦": "否"
}
}
Test Accuracy: 100.00%

Process finished with exit code 0
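The keys "4", "1", "2", "0" in the printed tree are column indices into each sample tuple (脐部, 根蒂, 敲声, 色泽). Purely for readability, a small helper (a sketch, assuming decision_tree and keys from the code above are available) can map them back to feature names before printing:

# Sketch: replace numeric feature indices in the learned tree with the
# corresponding column names from `keys`; leaves ('是'/'否') are returned as-is.
def rename_tree(tree, keys):
    if not isinstance(tree, dict):
        return tree  # leaf: class label
    renamed = {}
    for feature_index, branches in tree.items():
        renamed[keys[feature_index]] = {
            value: rename_tree(subtree, keys) for value, subtree in branches.items()
        }
    return renamed

print(json.dumps(rename_tree(decision_tree, keys), indent=2, ensure_ascii=False))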
