当前位置: 首页 > news >正文

k均值聚类+成分分析降维+自编码器降维

代码

# 3-1k均值聚类
import pandas
import numpy as np

# 参数设置
k = 4  # 群组数量
# 读入轮椅数据集
df = pandas.read_csv('wheelchair_dataset 1.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1] - 1
classes = np.amax(data[:, d])
# 特征缩放
data = data.astype(float)
mean = np.mean(data[:, 0:d], axis=0)
std = np.std(data[:, 0:d], axis=0, ddof=1)
data[:, 0:d] = (data[:, 0:d] - mean) / std
#  分别保存输入特征与标注
x = data[:, 0:d].T  # d*m维
y = data[:, d]  # 1*m维
# 构造随机种子指定值的随机数生成器
rng=np.random.default_rng(1)
# 初始化
C=np.zeros((d, k))  # k个形心向量
cluster_index=np.zeros(m_all)  # 样本的群组序号
x_expansion=np.expand_dims(x.T, axis=2)  # 扩大后的输入特征数组
# 随机选择k个样本的输入特征作为k个形心的初始值
C[:, :]=x[:, rng.integers(0, m_all, k)]
# 主循环
converged=False  # 是否收敛标志
iterations=1  # 迭代次数
while not converged:
    # 第一步:将所有样本都划到k个群组之一
    prev_cluster_index=np.copy(cluster_index)  # 保存样本的群组序号
    diff=x_expansion-C  # 样本输入特征向量与形心向量之差(使用广播操作)
    dist_squared=np.sum(diff*diff, axis=1)  # 计算距离的平方
    cluster_index=np.argmin(dist_squared, axis=1)  # 将样本划分到距离的平方最小的群组
    cluster_indicator = np.zeros(dist_squared.shape)  # 清零one-hot向量数组
    cluster_indicator[np.arange(cluster_index.size), cluster_index]=1  # 赋值one-hot向量数组
    # 比较群组号序号数组,判断是否收敛
    if np.array_equal(cluster_index, prev_cluster_index):
        converged = True
        # 第二步,重新计算形心
    C[:, :] = np.dot(x, cluster_indicator) / np.sum(cluster_indicator, axis=0).reshape((1, -1))
    # 迭代次数加1
    iterations=iterations+1
# 打印聚类结果
print('number of iterations:', iterations)
print('assigned cluster:\n', cluster_index+1)
print('actual classes:\n', y.astype(int))
# 若k=4,以百分比的形式统计混淆矩阵并打印
if k==classes:
    conf_mat=np.zeros((k, k))  # 行为预测值,列为标注
    for i in range(m_all):
        conf_mat[cluster_index[i], y[i].astype(int)-1]=conf_mat[cluster_index[i], y[i].astype(int)-1]+1
    print('confusion matrix in percentage:\n', np.round(conf_mat/np.sum(conf_mat, axis=0), 3))

#  3-3成分分析降维

import pandas
import numpy as np
import matplotlib.pyplot as plt
# 参数设置
d_tilde = 1  # 降维后的输入特征
# 读入气温数据集
df = pandas.read_csv('temperature_dataset.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1]  # 输入特征的维度为五维
# 零均值化输入特征
mean = np.mean(data, axis=0)
data = data - mean
X = data.T
# 计算协方差矩阵
C = np.dot(X, X.T) / (m_all - 1)
# 特征分解
lam, q = np.linalg.eig(C)
# 组成 Q_tilde矩阵
order = np.argsort(lam)
Q_tilde = np.zeros((d, d_tilde))  # d * d_tilde维
for j in range(d_tilde):
    Q_tilde[:, j] = q[:, order[d-1-j]]
# 对输入特征降维
X_tilde2 = np.dot(Q_tilde.T, X)
# 画出特征值及降维结果
plt.figure()
plt.plot([i+1 for i in range(d)], lam[order[::-1]], 'o--r', markersize=8, linewidth=2)
plt.xlabel('Indices')
plt.ylabel('Eigenvalues')
plt.title('Sorted eigenvalues')
plt.show()
if d_tilde == 2:
    plt.figure()
    plt.plot(X_tilde2[0, :], X_tilde2[1, :], 'xm', markersize=8)
    plt.xlabel('NewFeature[0]')
    plt.ylabel('NewFeature[1]')
    plt.title('Dimension-reduced input features')
    plt.show()

#  3-6自编码器降维
import numpy as np
import matplotlib.pyplot as plt
# 螺旋线数据集参数设置
feature_offset = 0  # 输入特征的偏移
m_train = 200  # 训练样本的数量
d = 3  # 输入特征的维数
# 生成螺旋线数据集
theta = np.linspace(0, 4 * np.pi, m_train)
x1 = np.sin(theta) + feature_offset  
x2 = np.cos(theta) + feature_offset
x3 = np.linspace(feature_offset, feature_offset + 2, m_train)  
X_train = np.zeros((d, m_train))
X_train[0, :] = x1
X_train[1, :] = x2
X_train[2, :] = x3
# 画三维螺旋线
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot(x1, x2, x3, 'b', linewidth=2, label='Helix curve')
ax.legend()
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('x3')
plt.show()
# 自编码器参数设置
d_tilde = 3  # 降维后输入特征的维数
iterations_ae = 100000  # 自编码器训练过程中的迭代次数
learning_rate_ae = 0.01  # 自编码器的学习率
# 构造随机种子为指定值的随机数生成器
rng = np.random.default_rng(1)
# 初始化自编码器
W_1_ae = rng.random((d, d_tilde))  
b_1_ae = rng.random((d_tilde, 1))  
W_2_ae = rng.random((d_tilde, d))  
b_2_ae = rng.random((d, 1))
v = np.ones((1, m_train))  
costs_ae_saved = []
# 迭代循环自编码器
for i in range(iterations_ae):
    # 正向传播
    Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
    X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
    Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
    X_train_hat = Z_2_ae * (Z_2_ae > 0)
    # 更新权重与偏差
    e_X = X_train_hat - X_train
    e_X_Z_2 = e_X * (Z_2_ae > 0)
    W_2_e_X_Z_2_Z_1 = np.dot(W_2_ae, e_X_Z_2) * (Z_1_ae > 0)
    W_1_ae = W_1_ae - 2 * learning_rate_ae * np.dot(X_train, W_2_e_X_Z_2_Z_1.T) / (m_train * d)
    b_1_ae = b_1_ae - 2 * learning_rate_ae * np.dot(W_2_e_X_Z_2_Z_1, v.T) / (m_train * d)
    W_2_ae = W_2_ae - 2 * learning_rate_ae * np.dot(X_train_tilde2, e_X_Z_2.T) / (m_train * d)
    b_2_ae = b_2_ae - 2 * learning_rate_ae * np.dot(e_X_Z_2, v.T) / (m_train * d)
    # 保存代价函数的值
    costs = np.trace(np.dot(e_X, e_X.T)) / (m_train * d)
    costs_ae_saved.append(costs.item(0))
# 打印最新的权重与偏差
print('W1 of AE =', np.array2string(W_1_ae, precision=3))
print('b1 of AE =', np.array2string(np.squeeze(b_1_ae, axis=1), precision=3))
print('W2 of AE =', np.array2string(W_2_ae, precision=3))
print('b2 of AE =', np.array2string(np.squeeze(b_2_ae, axis=1), precision=3))
# 画代价函数的值
plt.plot(costs_ae_saved, 'r-o', linewidth=2, markersize=5)
plt.ylabel('Costs')
plt.xlabel('Iterations')
plt.title('Learning rate (autoencoder) = ' + str(learning_rate_ae))
plt.show()
# 计算降维后的输入特征,以及输入特征的预测值
Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
X_train_hat = Z_2_ae * (Z_2_ae > 0)
# 画降维后的输入特征
if d_tilde == 1:
    plt.figure()
    plt.plot(np.arange(1, m_train + 1), X_train_tilde2.T, 'r', linewidth=2, label='x1 (reduced)')
    plt.xlabel('Index')
    plt.ylabel('Amplitude')
    plt.legend()
elif d_tilde == 2:
    plt.figure()
    plt.plot(X_train_tilde2[0], X_train_tilde2[1], 'r', linewidth=2)
    plt.xlabel('x1 (reduced)')
    plt.ylabel('x2 (reduced)')
elif d_tilde == 3:
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot(X_train_tilde2[0, :], X_train_tilde2[1, :], X_train_tilde2[2, :], 'r', linewidth=2)
    ax.set_xlabel('x1 (reduced)')
    ax.set_ylabel('x2 (reduced)')
    ax.set_zlabel('x3 (reduced)')
plt.show()
# 画各维输入特征及其预测值
for j in range(d):
    plt.figure()
    plt.plot(np.arange(0, m_train), X_train[j, :], '--b', linewidth=2, label='x' + str(j + 1) + ' (original)')
    plt.plot(np.arange(0, m_train), X_train_hat[j, :], 'r', linewidth=2, label='x' + str(j + 1) + ' (reproduced)')
    plt.xlabel('Index')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.show()
# 画输入特征及其预测值
for j in range(d):
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot(X_train[j % d, :], X_train[(j + 1) % d, :], X_train[(j + 2) % d, :], '--b', linewidth=2,
            label='Original helix curve')
    ax.plot(X_train_hat[j % d, :], X_train_hat[(j + 1) % d, :], X_train_hat[(j + 2) % d, :], 'r', linewidth=2,
            label='Reproduced curve')
    ax.legend()
    ax.set_xlabel('x' + str(j % d + 1))
    ax.set_ylabel('x' + str((j + 1) % d + 1))
    ax.set_zlabel('x' + str((j + 2) % d + 1))
    plt.show()

结果图

相关文章:

  • Spring三层架构
  • VS和VS Code 对比和区别
  • Springboot整合ollama运行本地AI大模型
  • 利用Enigma Virtual Box将QT生成的软件打包成一个exe可执行文件
  • C++ map代码练习 1、2、priority_queue基础概念、对象创建、数据插入、获取堆顶、出队操作、大小操作,自定义结构、代码练习 1 2
  • Linux -- Ext系列文件系统介绍
  • 游戏引擎学习路径与技术栈指南
  • python+uniapp微信小程序的共享雨伞租赁系统
  • Day.34
  • JVM: 内存、类与垃圾
  • API 管理系统实践指南:监控、安全、性能全覆盖
  • MCP基本概念
  • synchronized 做了哪些优化?
  • 【Algorithm】图论入门
  • 软件体系结构-论述、设计、问答
  • 每天一个前端小知识 Day 4 - TypeScript 核心类型系统与实践
  • 跨境卖家警报。抽绳背包版权案立案,TRO在即速排查
  • 二维数组 结构体01 day15,16
  • 【大模型:知识库管理】--MinerU本地部署
  • SpringBoot Starter设计:依赖管理的革命
  • wordpress整站搬家/网络推广怎么找客户资源
  • 网站建设怎么学习/企业网络搭建
  • 回老家做PHP网站/线上推广外包公司
  • 电脑网站模版/最新新闻热点事件2022
  • 手机网站设计制作公司/优化是什么意思
  • 如何做网站frontpage/信息流广告公司排名