k均值聚类+成分分析降维+自编码器降维
代码
# 3-1k均值聚类
import pandas
import numpy as np
# 参数设置
k = 4 # 群组数量
# 读入轮椅数据集
df = pandas.read_csv('wheelchair_dataset 1.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1] - 1
classes = np.amax(data[:, d])
# 特征缩放
data = data.astype(float)
mean = np.mean(data[:, 0:d], axis=0)
std = np.std(data[:, 0:d], axis=0, ddof=1)
data[:, 0:d] = (data[:, 0:d] - mean) / std
# 分别保存输入特征与标注
x = data[:, 0:d].T # d*m维
y = data[:, d] # 1*m维
# 构造随机种子指定值的随机数生成器
rng=np.random.default_rng(1)
# 初始化
C=np.zeros((d, k)) # k个形心向量
cluster_index=np.zeros(m_all) # 样本的群组序号
x_expansion=np.expand_dims(x.T, axis=2) # 扩大后的输入特征数组
# 随机选择k个样本的输入特征作为k个形心的初始值
C[:, :]=x[:, rng.integers(0, m_all, k)]
# 主循环
converged=False # 是否收敛标志
iterations=1 # 迭代次数
while not converged:
# 第一步:将所有样本都划到k个群组之一
prev_cluster_index=np.copy(cluster_index) # 保存样本的群组序号
diff=x_expansion-C # 样本输入特征向量与形心向量之差(使用广播操作)
dist_squared=np.sum(diff*diff, axis=1) # 计算距离的平方
cluster_index=np.argmin(dist_squared, axis=1) # 将样本划分到距离的平方最小的群组
cluster_indicator = np.zeros(dist_squared.shape) # 清零one-hot向量数组
cluster_indicator[np.arange(cluster_index.size), cluster_index]=1 # 赋值one-hot向量数组
# 比较群组号序号数组,判断是否收敛
if np.array_equal(cluster_index, prev_cluster_index):
converged = True
# 第二步,重新计算形心
C[:, :] = np.dot(x, cluster_indicator) / np.sum(cluster_indicator, axis=0).reshape((1, -1))
# 迭代次数加1
iterations=iterations+1
# 打印聚类结果
print('number of iterations:', iterations)
print('assigned cluster:\n', cluster_index+1)
print('actual classes:\n', y.astype(int))
# 若k=4,以百分比的形式统计混淆矩阵并打印
if k==classes:
conf_mat=np.zeros((k, k)) # 行为预测值,列为标注
for i in range(m_all):
conf_mat[cluster_index[i], y[i].astype(int)-1]=conf_mat[cluster_index[i], y[i].astype(int)-1]+1
print('confusion matrix in percentage:\n', np.round(conf_mat/np.sum(conf_mat, axis=0), 3))
# 3-3成分分析降维
import pandas
import numpy as np
import matplotlib.pyplot as plt
# 参数设置
d_tilde = 1 # 降维后的输入特征
# 读入气温数据集
df = pandas.read_csv('temperature_dataset.csv')
data = np.array(df)
m_all = np.shape(data)[0]
d = np.shape(data)[1] # 输入特征的维度为五维
# 零均值化输入特征
mean = np.mean(data, axis=0)
data = data - mean
X = data.T
# 计算协方差矩阵
C = np.dot(X, X.T) / (m_all - 1)
# 特征分解
lam, q = np.linalg.eig(C)
# 组成 Q_tilde矩阵
order = np.argsort(lam)
Q_tilde = np.zeros((d, d_tilde)) # d * d_tilde维
for j in range(d_tilde):
Q_tilde[:, j] = q[:, order[d-1-j]]
# 对输入特征降维
X_tilde2 = np.dot(Q_tilde.T, X)
# 画出特征值及降维结果
plt.figure()
plt.plot([i+1 for i in range(d)], lam[order[::-1]], 'o--r', markersize=8, linewidth=2)
plt.xlabel('Indices')
plt.ylabel('Eigenvalues')
plt.title('Sorted eigenvalues')
plt.show()
if d_tilde == 2:
plt.figure()
plt.plot(X_tilde2[0, :], X_tilde2[1, :], 'xm', markersize=8)
plt.xlabel('NewFeature[0]')
plt.ylabel('NewFeature[1]')
plt.title('Dimension-reduced input features')
plt.show()
# 3-6自编码器降维
import numpy as np
import matplotlib.pyplot as plt
# 螺旋线数据集参数设置
feature_offset = 0 # 输入特征的偏移
m_train = 200 # 训练样本的数量
d = 3 # 输入特征的维数
# 生成螺旋线数据集
theta = np.linspace(0, 4 * np.pi, m_train)
x1 = np.sin(theta) + feature_offset
x2 = np.cos(theta) + feature_offset
x3 = np.linspace(feature_offset, feature_offset + 2, m_train)
X_train = np.zeros((d, m_train))
X_train[0, :] = x1
X_train[1, :] = x2
X_train[2, :] = x3
# 画三维螺旋线
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot(x1, x2, x3, 'b', linewidth=2, label='Helix curve')
ax.legend()
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('x3')
plt.show()
# 自编码器参数设置
d_tilde = 3 # 降维后输入特征的维数
iterations_ae = 100000 # 自编码器训练过程中的迭代次数
learning_rate_ae = 0.01 # 自编码器的学习率
# 构造随机种子为指定值的随机数生成器
rng = np.random.default_rng(1)
# 初始化自编码器
W_1_ae = rng.random((d, d_tilde))
b_1_ae = rng.random((d_tilde, 1))
W_2_ae = rng.random((d_tilde, d))
b_2_ae = rng.random((d, 1))
v = np.ones((1, m_train))
costs_ae_saved = []
# 迭代循环自编码器
for i in range(iterations_ae):
# 正向传播
Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
X_train_hat = Z_2_ae * (Z_2_ae > 0)
# 更新权重与偏差
e_X = X_train_hat - X_train
e_X_Z_2 = e_X * (Z_2_ae > 0)
W_2_e_X_Z_2_Z_1 = np.dot(W_2_ae, e_X_Z_2) * (Z_1_ae > 0)
W_1_ae = W_1_ae - 2 * learning_rate_ae * np.dot(X_train, W_2_e_X_Z_2_Z_1.T) / (m_train * d)
b_1_ae = b_1_ae - 2 * learning_rate_ae * np.dot(W_2_e_X_Z_2_Z_1, v.T) / (m_train * d)
W_2_ae = W_2_ae - 2 * learning_rate_ae * np.dot(X_train_tilde2, e_X_Z_2.T) / (m_train * d)
b_2_ae = b_2_ae - 2 * learning_rate_ae * np.dot(e_X_Z_2, v.T) / (m_train * d)
# 保存代价函数的值
costs = np.trace(np.dot(e_X, e_X.T)) / (m_train * d)
costs_ae_saved.append(costs.item(0))
# 打印最新的权重与偏差
print('W1 of AE =', np.array2string(W_1_ae, precision=3))
print('b1 of AE =', np.array2string(np.squeeze(b_1_ae, axis=1), precision=3))
print('W2 of AE =', np.array2string(W_2_ae, precision=3))
print('b2 of AE =', np.array2string(np.squeeze(b_2_ae, axis=1), precision=3))
# 画代价函数的值
plt.plot(costs_ae_saved, 'r-o', linewidth=2, markersize=5)
plt.ylabel('Costs')
plt.xlabel('Iterations')
plt.title('Learning rate (autoencoder) = ' + str(learning_rate_ae))
plt.show()
# 计算降维后的输入特征,以及输入特征的预测值
Z_1_ae = np.dot(W_1_ae.T, X_train) + np.dot(b_1_ae, v)
X_train_tilde2 = Z_1_ae * (Z_1_ae > 0)
Z_2_ae = np.dot(W_2_ae.T, X_train_tilde2) + np.dot(b_2_ae, v)
X_train_hat = Z_2_ae * (Z_2_ae > 0)
# 画降维后的输入特征
if d_tilde == 1:
plt.figure()
plt.plot(np.arange(1, m_train + 1), X_train_tilde2.T, 'r', linewidth=2, label='x1 (reduced)')
plt.xlabel('Index')
plt.ylabel('Amplitude')
plt.legend()
elif d_tilde == 2:
plt.figure()
plt.plot(X_train_tilde2[0], X_train_tilde2[1], 'r', linewidth=2)
plt.xlabel('x1 (reduced)')
plt.ylabel('x2 (reduced)')
elif d_tilde == 3:
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot(X_train_tilde2[0, :], X_train_tilde2[1, :], X_train_tilde2[2, :], 'r', linewidth=2)
ax.set_xlabel('x1 (reduced)')
ax.set_ylabel('x2 (reduced)')
ax.set_zlabel('x3 (reduced)')
plt.show()
# 画各维输入特征及其预测值
for j in range(d):
plt.figure()
plt.plot(np.arange(0, m_train), X_train[j, :], '--b', linewidth=2, label='x' + str(j + 1) + ' (original)')
plt.plot(np.arange(0, m_train), X_train_hat[j, :], 'r', linewidth=2, label='x' + str(j + 1) + ' (reproduced)')
plt.xlabel('Index')
plt.ylabel('Amplitude')
plt.legend()
plt.show()
# 画输入特征及其预测值
for j in range(d):
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot(X_train[j % d, :], X_train[(j + 1) % d, :], X_train[(j + 2) % d, :], '--b', linewidth=2,
label='Original helix curve')
ax.plot(X_train_hat[j % d, :], X_train_hat[(j + 1) % d, :], X_train_hat[(j + 2) % d, :], 'r', linewidth=2,
label='Reproduced curve')
ax.legend()
ax.set_xlabel('x' + str(j % d + 1))
ax.set_ylabel('x' + str((j + 1) % d + 1))
ax.set_zlabel('x' + str((j + 2) % d + 1))
plt.show()