当前位置：首页 > news >正文

基于随机森林的糖尿病预测模型研究应用(python)

news 2025/7/15 3:48:35

基于随机森林的糖尿病预测模型研究应用

1、导入糖尿病数据集

In [14]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
data=pd.read_csv('./糖尿病数据集.csv',encoding="gbk")
data.head()#查看前五行数据

Out[14]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

In [2]:

data.tail()

Out[2]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

2、糖尿病样本统计分析

提取进行样本分析的特征

In [2]:

##写一个类方法做一个数据转换操作，将1转换成糖尿病患者，0转换成正常人
data2=data.copy()
def tn_ftn(Outcome):if Outcome==1:return '糖尿病患者'else:return '正常人'
data2['result']=data2['Outcome'].apply(tn_ftn)##目标变量
y1=data2['result']
data2['age_groups'] = pd.cut(data2['Age'], bins=[0, 20, 40, 60,80,100],right=False)##分箱操作

In [3]:

age_felie=data2.groupby(['age_groups','Outcome'])['result'].count().reset_index()
age_felie['age_groups']=['(0,20]正常人','(0,20]糖尿病患者','(20,40]正常人','(20,40]糖尿病患者','(40,60]正常人','(40,60]糖尿病患者','(60,80]正常人','(60,80]糖尿病患者','(80,100]正常人','(80,100]糖尿病患者']
age_felie

Out[3]:

	age_groups	Outcome	result
0	(0,20]正常人	0	0
1	(0,20]糖尿病患者	1	0
2	(20,40]正常人	0	401
3	(20,40]糖尿病患者	1	160
4	(40,60]正常人	0	76
5	(40,60]糖尿病患者	1	99
6	(60,80]正常人	0	22
7	(60,80]糖尿病患者	1	9
8	(80,100]正常人	0	1
9	(80,100]糖尿病患者	1	0

In [4]:

fl=data2.groupby(['age_groups'])['Age'].count()
fl

Out[4]:

age_groups
[0, 20)        0
[20, 40)     561
[40, 60)     175
[60, 80)      31
[80, 100)      1
Name: Age, dtype: int64

In [5]:

age_felie['age_groups']

Out[5]:

0        (0,20]正常人
1      (0,20]糖尿病患者
2       (20,40]正常人
3     (20,40]糖尿病患者
4       (40,60]正常人
5     (40,60]糖尿病患者
6       (60,80]正常人
7     (60,80]糖尿病患者
8      (80,100]正常人
9    (80,100]糖尿病患者
Name: age_groups, dtype: object

一、糖尿病患者在各年龄阶段的年龄占比

In [14]:

from pyecharts.charts import Pie
from pyecharts import options as opts
# 绘制饼图
pie = Pie()
pie.add("", [list(z) for z in zip(age_felie['age_groups'].values.tolist(), list(age_felie['result']))],radius=[20,200])
pie.set_global_opts(legend_opts=opts.LegendOpts(orient="vertical", pos_bottom="50%", pos_left="75%"))
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c} \n ({d}%)"))
pie.render('各年龄阶段糖尿病患者人数.html')
# pie.render_notebook()

Out[14]:

二、各年龄阶段人数

In [13]:

from pyecharts import options as opts
from pyecharts.charts import Bar# 假设age_felie已经定义并包含'age_groups'和'result'列
y_data = age_felie['result'].values
x_data = age_felie['age_groups'].values# 初始化图表配置
init_opts = opts.InitOpts(width='1200px', height='800px')# 创建柱状图
bar = (Bar(init_opts).add_xaxis(x_data.tolist()).add_yaxis('糖尿病患者/正常人', y_data.tolist(), label_opts=opts.LabelOpts(position='insideTop')).set_global_opts(title_opts=opts.TitleOpts(title='各年龄阶段人数'),xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=20, color='skyblue')),yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=20, color='skyblue')))
)# 渲染到HTML文件
bar.render('各年龄阶段人数.html')
# bar.render_notebook()

Out[13]:

3、查看数据的描述性信息及相关性

数据的形状

In [15]:

data.shape

Out[15]:

(768, 9)

数据的标签

In [16]:

# 查看标签分布 
print("数据集一共多少条:",data.shape[0])
print("\n")
print("糖尿病数据标签的分布:\n")
print(data.Outcome.value_counts()) ##0代表正常人，1代表患者人数

数据集一共多少条: 768糖尿病数据标签的分布:0    500
1    268
Name: Outcome, dtype: int64

描述信息

In [17]:

data.describe().round(2)##保留两位小数

Out[17]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
count	768.00	768.00	768.00	768.00	768.00	768.00	768.00	768.00	768.00
mean	3.85	120.89	69.11	20.54	79.80	31.99	0.47	33.24	0.35
std	3.37	31.97	19.36	15.95	115.24	7.88	0.33	11.76	0.48
min	0.00	0.00	0.00	0.00	0.00	0.00	0.08	21.00	0.00
25%	1.00	99.00	62.00	0.00	0.00	27.30	0.24	24.00	0.00
50%	3.00	117.00	72.00	23.00	30.50	32.00	0.37	29.00	0.00
75%	6.00	140.25	80.00	32.00	127.25	36.60	0.63	41.00	1.00
max	17.00	199.00	122.00	99.00	846.00	67.10	2.42	81.00	1.00

In [18]:

#相关性
data.corr().round(2)

Out[18]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
Pregnancies	1.00	0.13	0.14	-0.08	-0.07	0.02	-0.03	0.54	0.22
Glucose	0.13	1.00	0.15	0.06	0.33	0.22	0.14	0.26	0.47
BloodPressure	0.14	0.15	1.00	0.21	0.09	0.28	0.04	0.24	0.07
SkinThickness	-0.08	0.06	0.21	1.00	0.44	0.39	0.18	-0.11	0.07
Insulin	-0.07	0.33	0.09	0.44	1.00	0.20	0.19	-0.04	0.13
BMI	0.02	0.22	0.28	0.39	0.20	1.00	0.14	0.04	0.29
DiabetesPedigreeFunction	-0.03	0.14	0.04	0.18	0.19	0.14	1.00	0.03	0.17
Age	0.54	0.26	0.24	-0.11	-0.04	0.04	0.03	1.00	0.24
Outcome	0.22	0.47	0.07	0.07	0.13	0.29	0.17	0.24	1.00

In [19]:

#相关性热力图
#忽略警告
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(data.corr(),cmap="Blues",annot=True)

Out[19]:

<Axes: >

4、数据预处理

一、缺失值——均值填充

In [20]:

#使用seaborn库绘图
import seaborn as sns
sns.set_style('whitegrid',{'font.sans-serif':['simhei','Arial']})
plt.figure(figsize=(30, 30))
g = sns.pairplot(data,x_vars=['Pregnancies','Glucose','BloodPressure','SkinThickness'],y_vars=['Age'],palette='Set1',hue='Outcome')
g = g.map_offdiag(plt.scatter)
plt.suptitle('各年龄阶段的其他特征情况1', verticalalignment='bottom' , y=1,color="skyblue",size=20)
plt.show()#0为正常人，1为患有糖尿病

<Figure size 3000x3000 with 0 Axes>

In [21]:

#使用seaborn库绘图
sns.set_style('whitegrid',{'font.sans-serif':['simhei','Arial']})
plt.figure(figsize=(30, 30))
g = sns.pairplot(data,x_vars=['Insulin','BMI','DiabetesPedigreeFunction'],y_vars=['Age'],palette='Set1',hue='Outcome')
g = g.map_offdiag(plt.scatter)
plt.suptitle('各年龄阶段的其他特征情况2', verticalalignment='bottom' , y=1,color="skyblue",size=20)
plt.show()#0为正常人，1为患有糖尿病

<Figure size 3000x3000 with 0 Axes>

可以观察到'Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI'上都含有0值，

从现实的实际情况来说，'Pregnancies'列含有0值是正常的，那么我们将其他列含有的0值视为缺失值，现在进行转换，

将'Glucose','BloodPressure','SkinThickness','Insulin','BMI'上所有列含有的0值填充为NaN值，进行查看空缺值

步骤：

1、缺失值检查

2、填充缺失值

1、缺失值检查

第一步：将Glucose、BloodPressure、SkinThickness、Insulin、BMI中的0替换成NaN值

第二步：使用data.info()检查缺失值

第一步：将Glucose、BloodPressure、SkinThickness、Insulin、BMI中的0替换成NaN值

In [15]:

column = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[column] = data[column].replace(0,np.nan)

第二步：使用data.info()检查缺失值

In [23]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):#   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  0   Pregnancies               768 non-null    int64  1   Glucose                   763 non-null    float642   BloodPressure             733 non-null    float643   SkinThickness             541 non-null    float644   Insulin                   394 non-null    float645   BMI                       757 non-null    float646   DiabetesPedigreeFunction  768 non-null    float647   Age                       768 non-null    int64  8   Outcome                   768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 KB

可以很清楚的观察到糖尿病数据集中Glucose含有5条缺失值，BloodPressure含有35条缺失值，

SkinThickness含有227条缺失值，Insulin含有374条缺失值，BMI含有11条缺失值

即缺失值数据条数从多到少排序为：Insulin、SkinThickness、BloodPressure、BMI、Glucose

2、填充缺失值

填充原因：由上述的糖尿病数据相关性可知，目标变量与特征变量之间都存在一定的相关性，

故如果删除缺失值的话，会可能导致统计效力下降，模型的准确性和泛化能力也会受到影响

In [16]:

data['Glucose'].fillna(data.Glucose.mean().round(0),inplace=True)
data['BloodPressure'].fillna(data.BloodPressure.mean().round(0),inplace=True)
data['SkinThickness'].fillna(data.SkinThickness.mean().round(0),inplace=True)
data['Insulin'].fillna(data.Insulin.mean().round(0),inplace=True)
data['BMI'].fillna(data.BMI.mean().round(1),inplace=True)

In [25]:

data.head()##查看填充成功

Out[25]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148.0	72.0	35.0	156.0	33.6	0.627	50	1
1	1	85.0	66.0	29.0	156.0	26.6	0.351	31	0
2	8	183.0	64.0	29.0	156.0	23.3	0.672	32	1
3	1	89.0	66.0	23.0	94.0	28.1	0.167	21	0
4	0	137.0	40.0	35.0	168.0	43.1	2.288	33	1

二、异常值处理——中位数填充

由上述的描述信息可以看出Pregnancies、BloodPressure、Age这些值在实际生活中是正常的，那么现在需要进行对Glucose、SkinThickness、Insulin、BMI、DiabetesPedigreeFunction进行异常排查

第一步：画出需要分析列的箱线图，即画出糖尿病数据集中经过缺失值填充后Glucose、SkinThickness、Insulin、BMI、DiabetesPedigreeFunction列的箱线图

第二步：利用z-score的方法找出异常值所在的行

第三步：采用中位数对异常进行填充

In [26]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns# 删除指定的列
df = data.drop(['Pregnancies','BloodPressure','Age','Outcome'], axis=1)# 查看转换后的DataFrame的数据类型
# print(df.dtypes)# 生成箱型图
plt.figure(figsize=(15, 8))
sns.boxplot(data=df,orient= 'vertica')
plt.title('Box Plot of All Features')
plt.xlabel('Features')
plt.ylabel('Values')
#保存图片
plt.savefig('糖尿病数据集缺失值处理后的箱线图.png') 
plt.show()

①对Glucose列

In [17]:

##对异常值进行足一排查
import pandas as pd
# 选择要分析的列，Glucose——葡萄糖
column_to_analyze = 'Glucose'
# 计算该列的平均值和标准差
mean = data[column_to_analyze].mean()
std = data[column_to_analyze].std()
# 计算每个样本的Z-score
data['z_score'] = (data[column_to_analyze] - mean) / std
# 设定一个阈值，通常选择3作为标准，表示3个标准差之外的值为异常值
threshold = 3
# 识别异常值，即Z-score的绝对值大于阈值的样本
data['is_outlier'] = abs(data['z_score']) > threshold
# 打印出异常值的行
print("Glucose异常值所在行:")
print(data[data['is_outlier']])

Glucose异常值所在行:
Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome, z_score, is_outlier]
Index: []

可以看出Glucose无异常值

②对SkinThickness列

In [18]:

##第一步：利用Z-Score进行异常值排查
import pandas as pd
import math 
# 选择要分析的列，SkinThickness——皮脂厚度
column_to_analyze = 'SkinThickness'
# 计算该列的平均值和标准差
mean = data[column_to_analyze].mean()
std = data[column_to_analyze].std()
# 计算每个样本的Z-score
data['z_score'] = (data[column_to_analyze] - mean) / std
# 设定一个阈值，通常选择3作为标准，表示3个标准差之外的值为异常值
threshold = 3
# 识别异常值，即Z-score的绝对值大于阈值的样本
data['is_outlier'] = abs(data['z_score']) > threshold
# 打印出异常值的行
print("SkinThickness异常值所在行:")
print(data[data['is_outlier']])
# 第二步：利用中位数填充异常值
## 使用中位数替换异常值
# 计算列的中位数
median_value = data[column_to_analyze].median()
# 使用中位数替换异常值
data.loc[data['is_outlier'], column_to_analyze] = median_value

SkinThickness异常值所在行:Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
57             0    100.0           88.0           60.0    110.0  46.8   
120            0    162.0           76.0           56.0    100.0  53.2   
445            0    180.0           78.0           63.0     14.0  59.4   
579            2    197.0           70.0           99.0    156.0  34.7   DiabetesPedigreeFunction  Age  Outcome   z_score  is_outlier  
57                      0.962   31        0  3.513952        True  
120                     0.759   25        1  3.058952        True  
445                     2.420   25        1  3.855201        True  
579                     0.575   62        1  7.950196        True

③对Insulin列

In [19]:

import pandas as pd
# 选择要分析的列，BloodPressure——血压
column_to_analyze = 'Insulin'
# 计算该列的平均值和标准差
mean = data[column_to_analyze].mean()
std = data[column_to_analyze].std()
# # 使用math.floor()将均值向下取整为最接近的整数
# mean_value_int = math.floor(mean)
# 计算每个样本的Z-score
data['z_score'] = (data[column_to_analyze] - mean) / std
# 设定一个阈值，通常选择3作为标准，表示3个标准差之外的值为异常值
threshold = 3
# 识别异常值，即Z-score的绝对值大于阈值的样本
data['is_outlier'] = abs(data['z_score']) > threshold# 打印出异常值的行
print("Insulin异常值所在行:")
print(data[data['is_outlier']])
# 计算列的中位数
median_value = data[column_to_analyze].median()
# 使用中位数替换异常值
data.loc[data['is_outlier'], column_to_analyze] = median_value

Insulin异常值所在行:Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
8              2    197.0           70.0           45.0    543.0  30.5   
13             1    189.0           60.0           23.0    846.0  30.1   
111            8    155.0           62.0           26.0    495.0  34.0   
153            1    153.0           82.0           42.0    485.0  40.6   
186            8    181.0           68.0           36.0    495.0  30.1   
220            0    177.0           60.0           29.0    478.0  34.6   
228            4    197.0           70.0           39.0    744.0  36.7   
247            0    165.0           90.0           33.0    680.0  52.3   
286            5    155.0           84.0           44.0    545.0  38.7   
370            3    173.0           82.0           48.0    465.0  38.4   
392            1    131.0           64.0           14.0    415.0  23.7   
409            1    172.0           68.0           49.0    579.0  42.4   
415            3    173.0           84.0           33.0    474.0  35.7   
486            1    139.0           62.0           41.0    480.0  40.7   
584            8    124.0           76.0           24.0    600.0  28.7   
645            2    157.0           74.0           35.0    440.0  39.4   
655            2    155.0           52.0           27.0    540.0  38.7   
695            7    142.0           90.0           24.0    480.0  30.4   
753            0    181.0           88.0           44.0    510.0  43.3   DiabetesPedigreeFunction  Age  Outcome   z_score  is_outlier  
8                       0.158   53        1  4.554521        True  
13                      0.398   59        1  8.118329        True  
111                     0.543   46        1  3.989957        True  
153                     0.687   23        0  3.872340        True  
186                     0.615   60        1  3.989957        True  
220                     1.072   21        1  3.790007        True  
228                     2.329   31        0  6.918631        True  
247                     0.427   23        0  6.165880        True  
286                     0.619   34        0  4.578044        True  
370                     2.137   25        1  3.637105        True  
392                     0.389   21        0  3.049018        True  
409                     0.702   28        1  4.977944        True  
415                     0.258   22        1  3.742960        True  
486                     0.536   21        0  3.813531        True  
584                     0.687   52        1  5.224940        True  
645                     0.134   30        0  3.343061        True  
655                     0.240   25        1  4.519236        True  
695                     0.128   43        1  3.813531        True  
753                     0.222   26        1  4.166383        True

④对BMI列

In [20]:

import pandas as pd
import math
# 选择要分析的列
column_to_analyze = 'BMI'
# 计算该列的平均值和标准差
mean = data[column_to_analyze].mean()
std = data[column_to_analyze].std()
# # 使用math.floor()将均值向下取整为最接近的整数
# mean_value_int = math.floor(mean)
# 计算每个样本的Z-score
data['z_score'] = (data[column_to_analyze] - mean) / std
# 设定一个阈值，通常选择3作为标准，表示3个标准差之外的值为异常值
threshold = 3
# 识别异常值，即Z-score的绝对值大于阈值的样本
data['is_outlier'] = abs(data['z_score']) > threshold
# 打印出异常值的行
print("BMI异常值所在行:")
print(data[data['is_outlier']])
# 计算列的中位数
median_value = data[column_to_analyze].median()
# 使用中位数替换异常值
data.loc[data['is_outlier'], column_to_analyze] = median_value

BMI异常值所在行:Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
120            0    162.0           76.0           29.0    100.0  53.2   
125            1     88.0           30.0           42.0     99.0  55.0   
177            0    129.0          110.0           46.0    130.0  67.1   
445            0    180.0           78.0           29.0     14.0  59.4   
673            3    123.0          100.0           35.0    240.0  57.3   DiabetesPedigreeFunction  Age  Outcome   z_score  is_outlier  
120                     0.759   25        1  3.016940        True  
125                     0.496   26        1  3.278753        True  
177                     0.319   26        1  5.038713        True  
445                     2.420   25        1  3.918738        True  
673                     0.880   22        0  3.613291        True

⑤对DiabetesPedigreeFunction列

In [21]:

import pandas as pd
# 选择要分析的列，DiabetesPedigreeFunction——糖尿病遗传函数
column_to_analyze = 'DiabetesPedigreeFunction'
# 计算该列的平均值和标准差
mean = data[column_to_analyze].mean()
std = data[column_to_analyze].std()
# # 使用math.floor()将均值向下取整为最接近的整数
# mean_value_int = math.floor(mean)
# 计算每个样本的Z-score
data['z_score'] = (data[column_to_analyze] - mean) / std
# 设定一个阈值，通常选择3作为标准，表示3个标准差之外的值为异常值
threshold = 3
# 识别异常值，即Z-score的绝对值大于阈值的样本
data['is_outlier'] = abs(data['z_score']) > threshold
# 打印出异常值的行
print("DiabetesPedigreeFunction异常值所在行:")
print(data[data['is_outlier']])
# 计算列的中位数
median_value = data[column_to_analyze].median()
# 使用中位数替换异常值
data.loc[data['is_outlier'], column_to_analyze] = median_value

DiabetesPedigreeFunction异常值所在行:Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
4              0    137.0           40.0           35.0    168.0  43.1   
45             0    180.0           66.0           39.0    156.0  42.0   
58             0    146.0           82.0           29.0    156.0  40.5   
228            4    197.0           70.0           39.0    156.0  36.7   
330            8    118.0           72.0           19.0    156.0  23.1   
370            3    173.0           82.0           48.0    156.0  38.4   
371            0    118.0           64.0           23.0     89.0  32.5   
395            2    127.0           58.0           24.0    275.0  27.7   
445            0    180.0           78.0           29.0     14.0  32.4   
593            2     82.0           52.0           22.0    115.0  28.5   
621            2     92.0           76.0           20.0    156.0  24.2   DiabetesPedigreeFunction  Age  Outcome   z_score  is_outlier  
4                       2.288   33        1  5.481337        True  
45                      1.893   25        1  4.289167        True  
58                      1.781   44        0  3.951134        True  
228                     2.329   31        0  5.605081        True  
330                     1.476   46        0  3.030598        True  
370                     2.137   25        1  5.025596        True  
371                     1.731   21        0  3.800226        True  
395                     1.600   25        0  3.404849        True  
445                     2.420   25        1  5.879733        True  
593                     1.699   25        0  3.703646        True  
621                     1.698   28        0  3.700627        True

数据预处理之后的描述信息

In [34]:

data.drop(columns=['z_score']).describe().round(2)

Out[34]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
count	768.00	768.00	768.00	768.0	768.00	768.00	768.00	768.00	768.00
mean	3.85	121.69	72.39	28.9	146.22	32.29	0.45	33.24	0.35
std	3.37	30.44	12.10	8.2	56.27	6.53	0.28	11.76	0.48
min	0.00	44.00	24.00	7.0	14.00	18.20	0.08	21.00	0.00
25%	1.00	99.75	64.00	25.0	121.50	27.50	0.24	24.00	0.00
50%	3.00	117.00	72.00	29.0	156.00	32.40	0.37	29.00	0.00
75%	6.00	140.25	80.00	32.0	156.00	36.42	0.60	41.00	1.00
max	17.00	199.00	122.00	54.0	402.00	52.90	1.46	81.00	1.00

In [35]:

data.head(10)

Out[35]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome	z_score	is_outlier
0	6	148.0	72.0	35.0	156.0	33.6	0.6270	50	1	0.468187	False
1	1	85.0	66.0	29.0	156.0	26.6	0.3510	31	0	-0.364823	False
2	8	183.0	64.0	29.0	156.0	23.3	0.6720	32	1	0.604004	False
3	1	89.0	66.0	23.0	94.0	28.1	0.1670	21	0	-0.920163	False
4	0	137.0	40.0	35.0	168.0	43.1	0.3725	33	1	5.481337	True
5	5	116.0	74.0	29.0	156.0	25.6	0.2010	30	0	-0.817546	False
6	3	78.0	50.0	32.0	88.0	31.0	0.2480	26	1	-0.675693	False
7	10	115.0	72.0	29.0	156.0	35.3	0.1340	29	0	-1.019762	False
8	2	197.0	70.0	45.0	156.0	30.5	0.1580	53	1	-0.947326	False
9	8	125.0	96.0	29.0	156.0	32.5	0.2320	54	1	-0.723983	False

三、确定糖尿病数据集中的目标值与特征变量

确定实验二的目标变量与特征变量

In [22]:

X=data.drop(columns=['Outcome','z_score','is_outlier'])##特征变量（删除目标变量，其余的数据为特征变量）
y=data['Outcome']##目标变量 ----0为正常人，1为患有糖尿病

In [23]:

X##特征变量

Out[23]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age
0	6	148.0	72.0	35.0	156.0	33.6	0.6270	50
1	1	85.0	66.0	29.0	156.0	26.6	0.3510	31
2	8	183.0	64.0	29.0	156.0	23.3	0.6720	32
3	1	89.0	66.0	23.0	94.0	28.1	0.1670	21
4	0	137.0	40.0	35.0	168.0	43.1	0.3725	33
...	...	...	...	...	...	...	...	...
763	10	101.0	76.0	48.0	180.0	32.9	0.1710	63
764	2	122.0	70.0	27.0	156.0	36.8	0.3400	27
765	5	121.0	72.0	23.0	112.0	26.2	0.2450	30
766	1	126.0	60.0	29.0	156.0	30.1	0.3490	47
767	1	93.0	70.0	31.0	156.0	30.4	0.3150	23

768 rows × 8 columns

确定实验一的目标变量与特征变量

In [24]:

##写一个类方法做一个数据转换操作，将1转换成糖尿病患者，0转换成正常人
data1=data
def tn_ftn(Outcome):if Outcome==1:return '糖尿病患者'else:return '正常人'
data1['result']=data1['Outcome'].apply(tn_ftn)##目标变量
y1=data1['result']

In [25]:

X#特征变量

Out[25]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age
0	6	148.0	72.0	35.0	156.0	33.6	0.6270	50
1	1	85.0	66.0	29.0	156.0	26.6	0.3510	31
2	8	183.0	64.0	29.0	156.0	23.3	0.6720	32
3	1	89.0	66.0	23.0	94.0	28.1	0.1670	21
4	0	137.0	40.0	35.0	168.0	43.1	0.3725	33
...	...	...	...	...	...	...	...	...
763	10	101.0	76.0	48.0	180.0	32.9	0.1710	63
764	2	122.0	70.0	27.0	156.0	36.8	0.3400	27
765	5	121.0	72.0	23.0	112.0	26.2	0.2450	30
766	1	126.0	60.0	29.0	156.0	30.1	0.3490	47
767	1	93.0	70.0	31.0	156.0	30.4	0.3150	23

768 rows × 8 columns

4、糖尿病数据预测模型

实验一：

测试数据

In [40]:

##测试数据
data1.iloc[20:40,:].drop(columns=['Outcome','z_score','is_outlier'])

Out[40]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	result
20	3	126.0	88.0	41.0	235.0	39.3	0.704	27	正常人
21	8	99.0	84.0	29.0	156.0	35.4	0.388	50	正常人
22	7	196.0	90.0	29.0	156.0	39.8	0.451	41	糖尿病患者
23	9	119.0	80.0	35.0	156.0	29.0	0.263	29	糖尿病患者
24	11	143.0	94.0	33.0	146.0	36.6	0.254	51	糖尿病患者
25	10	125.0	70.0	26.0	115.0	31.1	0.205	41	糖尿病患者
26	7	147.0	76.0	29.0	156.0	39.4	0.257	43	糖尿病患者
27	1	97.0	66.0	15.0	140.0	23.2	0.487	22	正常人
28	13	145.0	82.0	19.0	110.0	22.2	0.245	57	正常人
29	5	117.0	92.0	29.0	156.0	34.1	0.337	38	正常人
30	5	109.0	75.0	26.0	156.0	36.0	0.546	60	正常人
31	3	158.0	76.0	36.0	245.0	31.6	0.851	28	糖尿病患者
32	3	88.0	58.0	11.0	54.0	24.8	0.267	22	正常人
33	6	92.0	92.0	29.0	156.0	19.9	0.188	28	正常人
34	10	122.0	78.0	31.0	156.0	27.6	0.512	45	正常人
35	4	103.0	60.0	33.0	192.0	24.0	0.966	33	正常人
36	11	138.0	76.0	29.0	156.0	33.2	0.420	35	正常人
37	9	102.0	76.0	37.0	156.0	32.9	0.665	46	糖尿病患者
38	2	90.0	68.0	42.0	156.0	38.2	0.503	27	糖尿病患者
39	4	111.0	72.0	47.0	207.0	37.1	1.390	56	糖尿病患者

预测诊断结果

In [15]:

import pandas as pd
##忽略警告
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression      
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import numpy as npdef lg_hgui():X_train,X_test,y_train,y_test=train_test_split(X,y1,test_size=0.3,random_state=25)lg=LogisticRegression(penalty='l2',max_iter=5)lg.fit(X_train,y_train)X_test1=data.iloc[20:40,:8]print("逻辑回归预测结果：",lg.predict(X_test1))def jue_cs():X_train,X_test,y_train,y_test=train_test_split(X,y1,test_size=0.3,random_state=25)jcs=DecisionTreeClassifier(criterion='gini',max_depth=3,splitter='best')jcs.fit(X_train,y_train)X_test1=data.iloc[20:40,:8]print("决策树预测结果：",jcs.predict(X_test1))def sj_sl():X_train,X_test,y_train,y_test=train_test_split(X,y1,test_size=0.3,random_state=25)sj=RandomForestClassifier(n_estimators=19,max_leaf_nodes=7,max_depth=4)sj.fit(X_train,y_train)X_test1=data.iloc[20:40,:8]print("随机森林预测结果：",sj.predict(X_test1))def in_out():print("预测结果结束！")print("真实数据：",data.iloc[20:40,9:]['result'].values)   
print("\n")
while True:model=input("请输入选择的模型!- - - - - - - - - - - - - - - - - - -")if model == '逻辑回归':lg_hgui()print("\n")elif model == '决策树':jue_cs()print("\n")elif model=='随机森林':sj_sl()else:print("\n")in_out()break

真实数据： ['正常人' '正常人' '糖尿病患者' '糖尿病患者' '糖尿病患者' '糖尿病患者' '糖尿病患者' '正常人' '正常人' '正常人''正常人' '糖尿病患者' '正常人' '正常人' '正常人' '正常人' '正常人' '糖尿病患者' '糖尿病患者' '糖尿病患者']

逻辑回归预测结果： ['正常人' '正常人' '糖尿病患者' '正常人' '正常人' '正常人' '糖尿病患者' '正常人' '糖尿病患者' '正常人' '正常人''糖尿病患者' '正常人' '正常人' '正常人' '正常人' '正常人' '正常人' '正常人' '正常人']

决策树预测结果： ['糖尿病患者' '正常人' '糖尿病患者' '正常人' '糖尿病患者' '糖尿病患者' '糖尿病患者' '正常人' '正常人' '正常人''正常人' '糖尿病患者' '正常人' '正常人' '正常人' '正常人' '糖尿病患者' '正常人' '正常人' '正常人']

随机森林预测结果： ['正常人' '正常人' '糖尿病患者' '正常人' '糖尿病患者' '正常人' '糖尿病患者' '正常人' '正常人' '正常人' '正常人''糖尿病患者' '正常人' '正常人' '正常人' '正常人' '糖尿病患者' '糖尿病患者' '正常人' '正常人']

预测结果结束！

实验二：

混淆矩阵、模型评估报告、准确率

基于逻辑回归模型糖尿病的预测模型

In [1288]:

%%time
import pandas as pd
from sklearn import metrics
##忽略警告
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression      
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np
def lg_re():X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=25)sc = StandardScaler()X_train = sc.fit_transform(X_train)X_test = sc.transform(X_test)lg=LogisticRegression(penalty='l2',max_iter=5)lg.fit(X_train,y_train)y_predict=lg.predict(X_test)print('逻辑回归混淆矩阵:')confusion_matrix=metrics.confusion_matrix(y_test,y_predict)plt.figure(figsize=(3, 3))# 设置x轴和y轴的刻度标签heatmap = plt.imshow(confusion_matrix, cmap=plt.cm.Reds)# # 去掉网格线plt.grid(False)for i in range(confusion_matrix.shape[0]):for j in range(confusion_matrix.shape[1]):plt.text(j, i, format(confusion_matrix[i, j], 'd'), ha="center", va="center")plt.colorbar(heatmap)plt.xticks([0,1])plt.yticks([1,0])plt.xlabel('Predicted labels')plt.ylabel('True labels')plt.show()print("\n")print("逻辑回归模型评估报告:")print(classification_report(y_test,y_predict))#模型评估报告print("\n")# print("逻辑回归准确率:")print("逻辑回归准确率:",accuracy_score(y_test,y_predict).round(2))#准确率score_tr=lg.score(X_train,y_train)score_te=lg.score(X_test,y_test)print("逻辑回归模型训练集准确率：",score_tr.round(2))print("逻辑回归模型测试集准确率：",score_te.round(2))score_tc= cross_val_score(lg,X,y,cv=10,scoring = 'accuracy')#使用交叉验证print("逻辑回归十次交叉验证准确率:",score_tc.round(2))
lg_re()##逻辑回归模型的准确率约为0.82

逻辑回归混淆矩阵:

逻辑回归模型评估报告:precision    recall  f1-score   support0       0.86      0.88      0.87       1601       0.72      0.68      0.70        71accuracy                           0.82       231macro avg       0.79      0.78      0.78       231
weighted avg       0.82      0.82      0.82       231逻辑回归准确率: 0.82
逻辑回归模型训练集准确率： 0.76
逻辑回归模型测试集准确率： 0.82
逻辑回归十次交叉验证准确率: [0.69 0.69 0.68 0.62 0.69 0.77 0.7  0.73 0.71 0.66]
CPU times: total: 734 ms
Wall time: 720 ms

基于决策树模型糖尿病的预测模型

In [818]:

%%time
from sklearn.tree import DecisionTreeClassifier
sc = StandardScaler()
X= sc.fit_transform(X)
def j_cs():X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=30)sc = StandardScaler()X_train = sc.fit_transform(X_train)X_test = sc.transform(X_test)clf=DecisionTreeClassifier(criterion='gini',max_depth=3,splitter='best')clf.fit(X_train,y_train)y_predict=clf.predict(X_test)print('决策树混淆矩阵：')confusion_matrix=metrics.confusion_matrix(y_test,y_predict)plt.figure(figsize=(3, 3))# 设置x轴和y轴的刻度标签heatmap = plt.imshow(confusion_matrix, cmap=plt.cm.Reds)for i in range(confusion_matrix.shape[0]):for j in range(confusion_matrix.shape[1]):plt.text(j, i, format(confusion_matrix[i, j], 'd'), ha="center", va="center")plt.colorbar(heatmap)# # 去掉网格线plt.grid(False)plt.xticks([0,1])plt.yticks([1,0])plt.xlabel('Predicted labels')plt.ylabel('True labels')plt.show()print("\n")print('决策树模型评估报告：')print(classification_report(y_test,y_predict))print('\n')print('决策树准确率：',accuracy_score(y_test,y_predict).round(2))print("决策树模型训练集准确率：",clf.score(X_train,y_train).round(2))print("决策树模型测试集准确率：",clf.score(X_test,y_test).round(2))score_tc= cross_val_score(clf,X,y,cv=10,scoring = 'accuracy')#使用交叉验证print("决策树十次交叉验证准确率:",score_tc.round(2))
j_cs()##决策树模型的准确率约为0.78

决策树混淆矩阵：

决策树模型评估报告：precision    recall  f1-score   support0       0.82      0.89      0.85       1591       0.69      0.56      0.62        72accuracy                           0.78       231macro avg       0.75      0.72      0.73       231
weighted avg       0.78      0.78      0.78       231决策树准确率： 0.78
决策树模型训练集准确率： 0.78
决策树模型测试集准确率： 0.78
决策树十次交叉验证准确率: [0.73 0.73 0.74 0.68 0.71 0.75 0.71 0.81 0.71 0.78]
CPU times: total: 844 ms
Wall time: 839 ms

基于随机森林模型糖尿病的预测模型

In [1280]:

%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score
def sj_sl():X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=25)sc = StandardScaler()X_train = sc.fit_transform(X_train)X_test = sc.transform(X_test)rfc=RandomForestClassifier(n_estimators=19,max_leaf_nodes=7,max_depth=4)rfc.fit(X_train,y_train)y_predict=rfc.predict(X_test)print('随机森林混淆矩阵：')confusion_matrix=metrics.confusion_matrix(y_test,y_predict)plt.figure(figsize=(3, 3))# 设置x轴和y轴的刻度标签heatmap = plt.imshow(confusion_matrix, cmap=plt.cm.Reds)for i in range(confusion_matrix.shape[0]):for j in range(confusion_matrix.shape[1]):plt.text(j, i, format(confusion_matrix[i, j], 'd'), ha="center", va="center")# # 去掉网格线plt.grid(False)plt.colorbar(heatmap)plt.xticks([0,1])plt.yticks([1,0])plt.xlabel('Predicted labels')plt.ylabel('True labels')plt.show()print('\n')print('随机森林模型评估报告：')print(classification_report(y_test,y_predict))print('\n')print('随机森林准确率：',accuracy_score(y_test,y_predict).round(2))print("随机森林模型训练集准确率：",rfc.score(X_train,y_train).round(2))print("随机森林模型测试集准确率：",rfc.score(X_test,y_test).round(2))score_tc= cross_val_score(rfc,X,y,cv=10,scoring = 'accuracy')#使用交叉验证print("随机森林十次交叉验证准确率:",score_tc.round(2))
sj_sl()##随机森林模型的准确率约为0.84

随机森林混淆矩阵：

随机森林模型评估报告：precision    recall  f1-score   support0       0.87      0.90      0.88       1601       0.75      0.69      0.72        71accuracy                           0.84       231macro avg       0.81      0.80      0.80       231
weighted avg       0.83      0.84      0.83       231随机森林准确率： 0.84
随机森林模型训练集准确率： 0.79
随机森林模型测试集准确率： 0.84
随机森林十次交叉验证准确率: [0.73 0.73 0.75 0.64 0.73 0.78 0.78 0.78 0.7  0.82]
CPU times: total: 1.89 s
Wall time: 1.87 s

逻辑回归、决策树、随机森林十次验证准确率

In [191]:

##导包
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.family'] = ['SimHei']   #设置字体为黑体
plt.rcParams['axes.unicode_minus'] = False #解决保存图像时负号“-”显示为方块的问题
#由上述分别得到逻辑回归、决策树、随机森林的十次交叉验证准确率
##逻辑回归十次交叉验证准确率0.69 0.69 0.68 0.62 0.69 0.77 0.7  0.73 0.71 0.66
y1_Logistic=np.array([0.69,0.69,0.68,0.62,0.69,0.77,0.7,0.73,0.71,0.66]).tolist()
##决策树十次交叉验证准确率0.73 0.73 0.74 0.68 0.71 0.75 0.71 0.81 0.71 0.78
y2_Decision=np.array([0.73,0.73,0.74,0.68,0.71,0.75,0.71,0.81,0.71,0.78]).tolist()
##随机森林十次交叉验证准确率0.73,0.73,0.75,0.64,0.73,0.78,0.78,0.78,0.7,0.82
y3_Random=np.array([0.73,0.73,0.75,0.64,0.73,0.78,0.78,0.78,0.7,0.82]).tolist()
##因为是十次所以现在设置x轴时，要确定x轴的范围是1~10
x_data=[1,2,3,4,5,6,7,8,9,10]
plt.plot(x_data,y1_Logistic,color="red" ,label="逻辑回归")
plt.plot(x_data,y2_Decision,color="skyblue" ,label="决策树")
plt.plot(x_data,y3_Random,color="blue" ,label="随机森林")
plt.xticks(range(1,11))
plt.yticks([0.10,0.20,0.30,0.40,0.50,0.60,0.70,0.80,0.90,1.00])
plt.legend()
plt.xlabel("十次交叉验证")
plt.ylabel("十次交叉验证准确率")
plt.show()

逻辑回归准确率、决策树准确率、随机森林准确率柱形图

In [196]:

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']='SimHei'# 用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False  # 用来正常显示负号
import pandas as pd# 假设我们有一些数据
data = {'Model': ['逻辑回归', '决策树', '随机森林'],'Value': [0.82, 0.78, 0.84]
}# 将数据转换为Pandas DataFrame
df = pd.DataFrame(data)# 使用Seaborn的 barplot函数绘制柱形图
# 在这里，我们不需要hue参数，因为我们只有一个分类变量
plt.figure(figsize=(8, 8))
sns.barplot(x='Model', y='Value', data=df)
# # 去掉网格线
plt.grid(False)
# 添加标题和轴标签
plt.title('三种算法模型的准确率比较',fontsize=20,color="blue")
plt.xlabel('模型',fontsize=15,color="purple")
plt.ylabel('准确率',fontsize=15,color="purple")# 在每个柱子上方添加准确率数值
for i, v in enumerate(df['Value']):plt.text(i, v + 0.01, f"{v:.2f}", ha='center', va='bottom',bbox=dict(facecolor='skyblue', alpha=0.5))# 显示图表
plt.show()

In [194]:

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif']='SimHei'# 用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False  # 用来正常显示负号
import pandas as pd# 假设我们有一些数据
data = {'Model': ['逻辑回归', '决策树', '随机森林'],'Value': [0.0996, 0.1385, 0.0952]
}# 将数据转换为Pandas DataFrame
df = pd.DataFrame(data)# 使用Seaborn的 barplot函数绘制柱形图
# 在这里，我们不需要hue参数，因为我们只有一个分类变量
plt.figure(figsize=(8, 8))
sns.barplot(x='Model', y='Value', data=df)
# # 去掉网格线
plt.grid(False)
# 添加标题和轴标签
plt.title('混淆矩阵的假阴率比较',fontsize=20,color="blue")
plt.xlabel('模型',fontsize=15,color="purple")
# 在每个柱子上方添加准确率数值（百分比形式）
for i, v in enumerate(df['Value']):plt.text(i, v + 0.001, f"{v*100:.2f}%", ha='center', va='bottom',bbox=dict(facecolor='skyblue', alpha=0.5))  # 将浮点数转换为百分比并保留一位小数
ax=plt.gca()
frame=plt.gca()
# y 轴不可见
frame.axes.get_yaxis().set_visible(False)
##去除x轴横线
for spine in ax.spines.values():spine.set_visible(False)
plt.show()