Dlib机器学习算法C++实现示例
C++调用Dlib C++库中主成分分析PCA算法、K-means聚类算法、Fuzzy C-Means(FCM)聚类算法、判别分析(LDA/QDA)算法、KNN分类和交叉验证算法、混淆矩阵(confusion_matrix)、准确率、精确率、召回率、R²回归评分等计算模块示例。
完整C++代码实现
#include <iostream>
#include <vector>
#include <random>
#include <cmath>

#include <dlib/matrix.h>
#include <dlib/statistics.h>
#include <dlib/clustering.h>
#include <dlib/svm.h>
#include <dlib/global_optimization.h>

using namespace dlib;
using namespace std;
// Generates a (samples x features) matrix of i.i.d. standard-normal values.
// Used as synthetic input for the algorithm demos below.
matrix<double> generate_sample_data(int samples, int features) {
    matrix<double> data(samples, features);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::normal_distribution<> d(0, 1);
    for (int i = 0; i < samples; ++i) {
        for (int j = 0; j < features; ++j) {
            data(i, j) = d(gen);
        }
    }
    return data;
}
void pca_example() {cout << "\n=== PCA算法示例 ===" << endl;// 生成样本数据matrix<double> data = generate_sample_data(100, 5);// 计算PCAmatrix<double> cov = cov(data);matrix<double> eigenvals, eigenvects;eigen_symmetric(cov, eigenvals, eigenvects);cout << "原始数据维度: " << data.nr() << " x " << data.nc() << endl;cout << "特征值: " << trans(eigenvals) << endl;// 选择前2个主成分matrix<double> pca_projection = data * subm(eigenvects, 0, 0, eigenvects.nr(), 2);cout << "降维后数据: " << pca_projection.nr() << " x " << pca_projection.nc() << endl;
}// K-means聚类示例
void kmeans_example() {cout << "\n=== K-means聚类算法示例 ===" << endl;// 生成聚类数据std::vector<matrix<double,0,1>> samples;std::random_device rd;std::mt19937 gen(rd());// 生成三个簇的数据for (int i = 0; i < 50; ++i) {matrix<double,0,1> m(2);m(0) = -2 + gen()%100/50.0;m(1) = -2 + gen()%100/50.0;samples.push_back(m);}for (int i = 0; i < 50; ++i) {matrix<double,0,1> m(2);m(0) = 1 + gen()%100/50.0;m(1) = 1 + gen()%100/50.0;samples.push_back(m);}for (int i = 0; i < 50; ++i) {matrix<double,0,1> m(2);m(0) = 3 + gen()%100/50.0;m(1) = -1 + gen()%100/50.0;samples.push_back(m);}// 执行K-means聚类std::vector<matrix<double,0,1>> initial_centers;pick_initial_centers(3, initial_centers, samples, linear_kernel<matrix<double,0,1>>());find_clusters_using_kmeans(samples, initial_centers);cout << "聚类中心数量: " << initial_centers.size() << endl;for (size_t i = 0; i < initial_centers.size(); ++i) {cout << "中心 " << i << ": " << trans(initial_centers[i]) << endl;}
}// 手动实现Fuzzy C-Means聚类
std::vector<matrix<double>> fuzzy_c_means(const std::vector<matrix<double>>& data, int clusters, double m = 2.0, double epsilon = 1e-5) {int n = data.size();int dim = data[0].size();// 初始化隶属度矩阵matrix<double> U(n, clusters);std::random_device rd;std::mt19937 gen(rd());std::uniform_real_distribution<> dis(0, 1);for (int i = 0; i < n; ++i) {double sum = 0;for (int j = 0; j < clusters; ++j) {U(i, j) = dis(gen);sum += U(i, j);}for (int j = 0; j < clusters; ++j) {U(i, j) /= sum;}}std::vector<matrix<double>> centers(clusters, matrix<double>(dim));matrix<double> U_old;do {U_old = U;// 更新聚类中心for (int j = 0; j < clusters; ++j) {matrix<double> numerator(dim);numerator = 0;double denominator = 0;for (int i = 0; i < n; ++i) {double u_pow = std::pow(U(i, j), m);numerator += u_pow * data[i];denominator += u_pow;}centers[j] = numerator / denominator;}// 更新隶属度for (int i = 0; i < n; ++i) {for (int j = 0; j < clusters; ++j) {double sum = 0;double dist_ij = length(data[i] - centers[j]);for (int k = 0; k < clusters; ++k) {double dist_ik = length(data[i] - centers[k]);if (dist_ik == 0) {sum = (j == k) ? 1 : 0;break;}sum += std::pow(dist_ij / dist_ik, 2.0 / (m - 1));}if (sum == 0) {U(i, j) = (j == 0) ? 1 : 0;} else {U(i, j) = 1.0 / sum;}}}} while (length(U - U_old) > epsilon);return centers;
}void fcm_example() {cout << "\n=== Fuzzy C-Means聚类算法示例 ===" << endl;// 生成测试数据std::vector<matrix<double>> samples;std::random_device rd;std::mt19937 gen(rd());for (int i = 0; i < 30; ++i) {matrix<double> m(2);m(0) = 1 + gen()%100/100.0;m(1) = 1 + gen()%100/100.0;samples.push_back(m);}for (int i = 0; i < 30; ++i) {matrix<double> m(2);m(0) = 3 + gen()%100/100.0;m(1) = 3 + gen()%100/100.0;samples.push_back(m);}// 执行FCM聚类auto centers = fuzzy_c_means(samples, 2);cout << "FCM聚类中心:" << endl;for (size_t i = 0; i < centers.size(); ++i) {cout << "中心 " << i << ": " << trans(centers[i]) << endl;}
}// KNN分类和交叉验证示例
void knn_cross_validation_example() {cout << "\n=== KNN分类和交叉验证示例 ===" << endl;// 生成分类数据std::vector<matrix<double,0,1>> samples;std::vector<double> labels;std::random_device rd;std::mt19937 gen(rd());// 两类数据for (int i = 0; i < 50; ++i) {matrix<double,0,1> m(2);m(0) = 1 + gen()%100/100.0;m(1) = 1 + gen()%100/100.0;samples.push_back(m);labels.push_back(1);}for (int i = 0; i < 50; ++i) {matrix<double,0,1> m(2);m(0) = 3 + gen()%100/100.0;m(1) = 3 + gen()%100/100.0;samples.push_back(m);labels.push_back(2);}// 使用径向基核函数的KNNtypedef radial_basis_kernel<matrix<double,0,1>> kernel_type;// 设置KNN参数krr_trainer<kernel_type> trainer;trainer.set_kernel(kernel_type(0.1));// 交叉验证randomize_samples(samples, labels);cout << "执行交叉验证..." << endl;// 计算准确率指标matrix<double> confusion_matrix = cross_validate_multiclass_trainer(trainer, samples, labels, 5);cout << "混淆矩阵:\n" << confusion_matrix << endl;// 计算各项指标double accuracy = sum(diag(confusion_matrix)) / sum(confusion_matrix);cout << "准确率: " << accuracy << endl;// 精确率和召回率for (long i = 0; i < confusion_matrix.nr(); ++i) {double precision = confusion_matrix(i,i) / sum(colm(confusion_matrix, i));double recall = confusion_matrix(i,i) / sum(rowm(confusion_matrix, i));cout << "类别 " << i+1 << " - 精确率: " << precision << ", 召回率: " << recall << endl;}
}// 混淆矩阵和评估指标计算
void evaluation_metrics_example() {cout << "\n=== 混淆矩阵和评估指标示例 ===" << endl;// 模拟预测结果和真实标签std::vector<double> true_labels = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};std::vector<double> pred_labels = {0, 1, 0, 0, 0, 1, 1, 1, 0, 1};// 计算混淆矩阵matrix<double> conf_mat(2, 2);conf_mat = 0;for (size_t i = 0; i < true_labels.size(); ++i) {int true_idx = static_cast<int>(true_labels[i]);int pred_idx = static_cast<int>(pred_labels[i]);conf_mat(true_idx, pred_idx) += 1;}cout << "混淆矩阵:\n" << conf_mat << endl;// 计算评估指标double TP = conf_mat(1, 1); // True Positivedouble TN = conf_mat(0, 0); // True Negativedouble FP = conf_mat(0, 1); // False Positivedouble FN = conf_mat(1, 0); // False Negativedouble accuracy = (TP + TN) / (TP + TN + FP + FN);double precision = TP / (TP + FP);double recall = TP / (TP + FN);double f1_score = 2 * (precision * recall) / (precision + recall);cout << "准确率: " << accuracy << endl;cout << "精确率: " << precision << endl;cout << "召回率: " << recall << endl;cout << "F1分数: " << f1_score << endl;
}// R²回归评分示例
void r2_score_example() {cout << "\n=== R²回归评分示例 ===" << endl;// 生成回归数据std::vector<matrix<double,0,1>> samples;std::vector<double> targets;std::random_device rd;std::mt19937 gen(rd());std::normal_distribution<> noise(0, 0.1);for (int i = 0; i < 100; ++i) {matrix<double,0,1> m(1);double x = i / 10.0;m(0) = x;samples.push_back(m);targets.push_back(2*x + 1 + noise(gen)); // y = 2x + 1 + noise}// 使用线性回归krr_trainer<linear_kernel<matrix<double,0,1>>> trainer;decision_function<linear_kernel<matrix<double,0,1>>> df = trainer.train(samples, targets);// 计算预测值std::vector<double> predictions;for (const auto& sample : samples) {predictions.push_back(df(sample));}// 计算R²分数double ss_res = 0, ss_tot = 0;double mean_target = 0;for (double target : targets) {mean_target += target;}mean_target /= targets.size();for (size_t i = 0; i < targets.size(); ++i) {ss_res += std::pow(targets[i] - predictions[i], 2);ss_tot += std::pow(targets[i] - mean_target, 2);}double r2 = 1 - (ss_res / ss_tot);cout << "R²分数: " << r2 << endl;// 显示前几个预测值和真实值cout << "前5个样本的预测结果:" << endl;for (int i = 0; i < 5; ++i) {cout << "真实值: " << targets[i] << ", 预测值: " << predictions[i] << ", 误差: " << std::abs(targets[i] - predictions[i]) << endl;}
}int main() {try {cout << "Dlib机器学习算法示例程序" << endl;cout << "=========================" << endl;// 执行各个算法示例pca_example();kmeans_example();fcm_example();knn_cross_validation_example();evaluation_metrics_example();r2_score_example();cout << "\n所有算法示例执行完成!" << endl;} catch (std::exception& e) {cout << "错误: " << e.what() << endl;}return 0;
}
CMakeLists.txt 配置文件
cmake_minimum_required(VERSION 3.10)
project(DlibMachineLearning)

# dlib 19.x requires at least C++14 (the original asked for 11).
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find the installed dlib package
find_package(dlib REQUIRED)

add_executable(machine_learning_demo main.cpp)

# Link against dlib
target_link_libraries(machine_learning_demo dlib::dlib)
技术论述
1. PCA(主成分分析)
算法原理:
PCA通过线性变换将原始数据投影到新的坐标系中,使得投影后的数据在第一主成分上方差最大,后续主成分依次递减。Dlib中使用特征值分解实现PCA。
Dlib实现特点:
- 使用 cov() 计算协方差矩阵
- 通过 eigen_symmetric() 进行特征分解
- 可灵活选择主成分数量
2. K-means聚类
算法原理:
K-means通过迭代优化,将数据划分为K个簇,使得每个数据点到其所属簇中心的距离平方和最小。
Dlib实现优势:
- pick_initial_centers() 提供智能初始中心选择
- find_clusters_using_kmeans() 实现高效聚类
- 支持任意维度的数据
3. Fuzzy C-Means聚类
算法特点:
与K-means的硬划分不同,FCM允许数据点以不同的隶属度属于多个簇,更适用于边界模糊的数据集。
实现关键:
- 隶属度矩阵的初始化与更新
- 模糊参数m的控制
- 收敛条件的判断
4. KNN分类与交叉验证
KNN算法:
基于实例的学习方法,通过计算测试样本与训练样本的距离进行分类。
交叉验证:
- 将数据集分为k个子集
- 轮流使用k-1个子集训练,1个子集测试
- 减少模型评估的方差
5. 评估指标系统
混淆矩阵:
预测正例 预测负例
真实正例 TP FN
真实负例 FP TN
关键指标:
- 准确率:(TP+TN)/(TP+TN+FP+FN) - 整体分类正确率
- 精确率:TP/(TP+FP) - 预测为正例中的真正正例比例
- 召回率:TP/(TP+FN) - 真实正例中被正确预测的比例
- F1分数:精确率和召回率的调和平均数
6. R²回归评分
计算公式:
R² = 1 - SS_res / SS_tot
意义:
- 衡量回归模型对数据变动的解释程度
- 取值范围(-∞, 1],越接近1说明模型拟合越好
- 考虑了基准模型(均值模型)的对比
编译和运行
# 编译
mkdir build && cd build
cmake ..
make

# 运行
./machine_learning_demo
实际应用建议
- 数据预处理:在实际应用中应先进行数据标准化
- 参数调优:使用交叉验证选择最优参数
- 模型选择:根据问题特点选择合适的算法
- 评估验证:使用多种评估指标全面评价模型性能