当前位置：首页 > news >正文

k均值聚类+成分分析降维+自编码器降维

news 2025/8/6 18:13:18

代码

# 3-1k均值聚类
import pandas
import numpy as np

# 参数设置
k = 4 # 群组数量
# 读入轮椅数据集
df = pandas.read_csv('wheelchair_dataset 1.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1] - 1
classes = np.amax(data[:, d])
# 特征缩放
data = data.astype(float)
mean = np.mean(data[:, 0:d], axis=0)
std = np.std(data[:, 0:d], axis=0, ddof=1)
data[:, 0:d] = (data[:, 0:d] - mean) / std
# 分别保存输入特征与标注
x = data[:, 0:d].T # d*m维
y = data[:, d] # 1*m维
# 构造随机种子指定值的随机数生成器
rng=np.random.default_rng(1)
# 初始化
C=np.zeros((d, k)) # k个形心向量
cluster_index=np.zeros(m_all) # 样本的群组序号
x_expansion=np.expand_dims(x.T, axis=2) # 扩大后的输入特征数组
# 随机选择k个样本的输入特征作为k个形心的初始值
C[:, :]=x[:, rng.integers(0, m_all, k)]
# 主循环
converged=False # 是否收敛标志
iterations=1 # 迭代次数
while not converged:
    # 第一步：将所有样本都划到k个群组之一
    prev_cluster_index=np.copy(cluster_index) # 保存样本的群组序号
    diff=x_expansion-C # 样本输入特征向量与形心向量之差（使用广播操作）
    dist_squared=np.sum(diff*diff, axis=1) # 计算距离的平方
    cluster_index=np.argmin(dist_squared, axis=1) # 将样本划分到距离的平方最小的群组
    cluster_indicator = np.zeros(dist_squared.shape) # 清零one-hot向量数组
    cluster_indicator[np.arange(cluster_index.size), cluster_index]=1 # 赋值one-hot向量数组
    # 比较群组号序号数组，判断是否收敛
    if np.array_equal(cluster_index, prev_cluster_index):
        converged = True
        # 第二步，重新计算形心
    C[:, :] = np.dot(x, cluster_indicator) / np.sum(cluster_indicator, axis=0).reshape((1, -1))
    # 迭代次数加1
    iterations=iterations+1
# 打印聚类结果
print('number of iterations:', iterations)
print('assigned cluster:\n', cluster_index+1)
print('actual classes:\n', y.astype(int))
# 若k=4，以百分比的形式统计混淆矩阵并打印
if k==classes:
    conf_mat=np.zeros((k, k)) # 行为预测值，列为标注
    for i in range(m_all):
        conf_mat[cluster_index[i], y[i].astype(int)-1]=conf_mat[cluster_index[i], y[i].astype(int)-1]+1
    print('confusion matrix in percentage:\n', np.round(conf_mat/np.sum(conf_mat, axis=0), 3))

# 3-3成分分析降维

import pandas
import numpy as np
import matplotlib.pyplot as plt
# 参数设置
d_tilde = 1 # 降维后的输入特征
# 读入气温数据集
df = pandas.read_csv('temperature_dataset.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1] # 输入特征的维度为五维
# 零均值化输入特征
mean = np.mean(data, axis=0)
data = data - mean
X = data.T
# 计算协方差矩阵
C = np.dot(X, X.T) / (m_all - 1)
# 特征分解
lam, q = np.linalg.eig(C)
# 组成 Q_tilde矩阵
order = np.argsort(lam)
Q_tilde = np.zeros((d, d_tilde)) # d * d_tilde维
for j in range(d_tilde):
    Q_tilde[:, j] = q[:, order[d-1-j]]
# 对输入特征降维
X_tilde2 = np.dot(Q_tilde.T, X)
# 画出特征值及降维结果
plt.figure()
plt.plot([i+1 for i in range(d)], lam[order[::-1]], 'o--r', markersize=8, linewidth=2)
plt.xlabel('Indices')
plt.ylabel('Eigenvalues')
plt.title('Sorted eigenvalues')
plt.show()
if d_tilde == 2:
    plt.figure()
    plt.plot(X_tilde2[0, :], X_tilde2[1, :], 'xm', markersize=8)
    plt.xlabel('NewFeature[0]')
    plt.ylabel('NewFeature[1]')
    plt.title('Dimension-reduced input features')
    plt.show()

# 3-6自编码器降维
import numpy as np
import matplotlib.pyplot as plt
# 螺旋线数据集参数设置
feature_offset = 0 # 输入特征的偏移
m_train = 200 # 训练样本的数量
d = 3 # 输入特征的维数
# 生成螺旋线数据集
theta = np.linspace(0, 4 * np.pi, m_train)
x1 = np.sin(theta) + feature_offset
x2 = np.cos(theta) + feature_offset
x3 = np.linspace(feature_offset, feature_offset + 2, m_train)
X_train = np.zeros((d, m_train))
X_train[0, :] = x1
X_train[1, :] = x2
X_train[2, :] = x3
# 画三维螺旋线
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot(x1, x2, x3, 'b', linewidth=2, label='Helix curve')
ax.legend()
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('x3')
plt.show()
# 自编码器参数设置
d_tilde = 3 # 降维后输入特征的维数
iterations_ae = 100000 # 自编码器训练过程中的迭代次数
learning_rate_ae = 0.01 # 自编码器的学习率
# 构造随机种子为指定值的随机数生成器
rng = np.random.default_rng(1)
# 初始化自编码器
W_1_ae = rng.random((d, d_tilde))
b_1_ae = rng.random((d_tilde, 1))
W_2_ae = rng.random((d_tilde, d))
b_2_ae = rng.random((d, 1))
v = np.ones((1, m_train))
costs_ae_saved = []
# 迭代循环自编码器
for i in range(iterations_ae):
    # 正向传播
    Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
    X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
    Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
    X_train_hat = Z_2_ae * (Z_2_ae > 0)
    # 更新权重与偏差
    e_X = X_train_hat - X_train
    e_X_Z_2 = e_X * (Z_2_ae > 0)
    W_2_e_X_Z_2_Z_1 = np.dot(W_2_ae, e_X_Z_2) * (Z_1_ae > 0)
    W_1_ae = W_1_ae - 2 * learning_rate_ae * np.dot(X_train, W_2_e_X_Z_2_Z_1.T) / (m_train * d)
    b_1_ae = b_1_ae - 2 * learning_rate_ae * np.dot(W_2_e_X_Z_2_Z_1, v.T) / (m_train * d)
    W_2_ae = W_2_ae - 2 * learning_rate_ae * np.dot(X_train_tilde2, e_X_Z_2.T) / (m_train * d)
    b_2_ae = b_2_ae - 2 * learning_rate_ae * np.dot(e_X_Z_2, v.T) / (m_train * d)
    # 保存代价函数的值
    costs = np.trace(np.dot(e_X, e_X.T)) / (m_train * d)
    costs_ae_saved.append(costs.item(0))
# 打印最新的权重与偏差
print('W1 of AE =', np.array2string(W_1_ae, precision=3))
print('b1 of AE =', np.array2string(np.squeeze(b_1_ae, axis=1), precision=3))
print('W2 of AE =', np.array2string(W_2_ae, precision=3))
print('b2 of AE =', np.array2string(np.squeeze(b_2_ae, axis=1), precision=3))
# 画代价函数的值
plt.plot(costs_ae_saved, 'r-o', linewidth=2, markersize=5)
plt.ylabel('Costs')
plt.xlabel('Iterations')
plt.title('Learning rate (autoencoder) = ' + str(learning_rate_ae))
plt.show()
# 计算降维后的输入特征，以及输入特征的预测值
Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
X_train_hat = Z_2_ae * (Z_2_ae > 0)
# 画降维后的输入特征
if d_tilde == 1:
    plt.figure()
    plt.plot(np.arange(1, m_train + 1), X_train_tilde2.T, 'r', linewidth=2, label='x1 (reduced)')
    plt.xlabel('Index')
    plt.ylabel('Amplitude')
    plt.legend()
elif d_tilde == 2:
    plt.figure()
    plt.plot(X_train_tilde2[0], X_train_tilde2[1], 'r', linewidth=2)
    plt.xlabel('x1 (reduced)')
    plt.ylabel('x2 (reduced)')
elif d_tilde == 3:
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot(X_train_tilde2[0, :], X_train_tilde2[1, :], X_train_tilde2[2, :], 'r', linewidth=2)
    ax.set_xlabel('x1 (reduced)')
    ax.set_ylabel('x2 (reduced)')
    ax.set_zlabel('x3 (reduced)')
plt.show()
# 画各维输入特征及其预测值
for j in range(d):
    plt.figure()
    plt.plot(np.arange(0, m_train), X_train[j, :], '--b', linewidth=2, label='x' + str(j + 1) + ' (original)')
    plt.plot(np.arange(0, m_train), X_train_hat[j, :], 'r', linewidth=2, label='x' + str(j + 1) + ' (reproduced)')
    plt.xlabel('Index')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.show()
# 画输入特征及其预测值
for j in range(d):
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.plot(X_train[j % d, :], X_train[(j + 1) % d, :], X_train[(j + 2) % d, :], '--b', linewidth=2,
            label='Original helix curve')
    ax.plot(X_train_hat[j % d, :], X_train_hat[(j + 1) % d, :], X_train_hat[(j + 2) % d, :], 'r', linewidth=2,
            label='Reproduced curve')
    ax.legend()
    ax.set_xlabel('x' + str(j % d + 1))
    ax.set_ylabel('x' + str((j + 1) % d + 1))
    ax.set_zlabel('x' + str((j + 2) % d + 1))
    plt.show()