CUDA实现的点云MLS滤波
实现了CUDA版本的点云MLS滤波,可比PCL中CPU版本的MLS滤波快10X以上。
本文介绍了一个基于CUDA的高效移动最小二乘法(MLS)点云平滑处理实现。该系统通过GPU加速实现了点云的二次多项式拟合,主要包含以下优化:
- 核心算法采用双阶段处理:先计算加权质心和协方差,再构建二次方程组求解
- 使用空间分块策略加速邻域搜索,通过thrust库实现高效的分块排序和统计
- 采用优化的数学运算:
- 改进的Jacobi特征值分解
- Cholesky分解求解线性系统
- 稳定的局部坐标系构造
- 添加数值稳定性增强:
- 对角岭回归正则化
- 条件数检查
- 退化情况处理
该实现通过CUDA并行计算显著提升了处理速度,适用于大规模点云数据的平滑处理,同时保证了数值稳定性和计算精度。系统支持自定义参数设置,包括邻域半径和网格大小等。
这段代码适用于:
点云去噪和平滑
曲面重建预处理
点云法向估计
几何特征提取
mls_cuda.h
#ifndef MLS_CUDA_H_
#define MLS_CUDA_H_#include <vector>
#include <cstdint>// Simple point type (same layout as pcl::PointXYZ)
struct PointXYZ {float x, y, z;
};struct MLSParams {float radius; // neighborhood radiusfloat cell; // grid cell size (choose ~ radius or smaller)int numThreads; // 0 -> auto
};// Run MLS quadratic smoothing on host points (GPU)
// pts_host: input host points
// params: radius/cell settings
// out_host: resized to N with output points
void mls_cuda_quadratic(const std::vector<PointXYZ>& pts_host,float minX, float maxX, float minY, float maxY,const MLSParams& params,std::vector<PointXYZ>& out_host);#endif // MLS_CUDA_H_mls_cuda.cuh
#ifndef MLS_CUDA_CUH_
#define MLS_CUDA_CUH_#include "mls_cuda.h"
#include <cuda_runtime.h>// Device helpers
__device__ void jacobi_eigen_3x3(float a00, float a01, float a02, float a11, float a12, float a22,float& eig0, float& eig1, float& eig2,float v[9]);// Solve linear system A x = b for n=6 using Gaussian elimination with partial pivoting.
// A is n x n stored row-major in A[36], b length n. Solves in-place and writes x into sol[6].
// Returns true on success, false on failure (singular).
__device__ bool solve_6x6(float A[36], float b[6], float sol[6]);// Kernels
__global__ void compute_buckets_kernel(const float* __restrict__ x, const float* __restrict__ y, int N,float minX, float minY, float invCell, int nx, int ny,int* __restrict__ out_bucket, int* __restrict__ out_idx);__global__ void histogram_keys_kernel(const int* __restrict__ keys, int N, int* __restrict__ counts);// Main MLS kernel: each thread processes one query point (original index order)
// Two-pass neighbor iteration inside kernel (first compute weighted centroid & covariance,
// then construct normal eqn for quadratic fit)
__global__ void mls_quadratic_kernel(const float* __restrict__ x, const float* __restrict__ y, const float* __restrict__ z, int N,const int* __restrict__ bucket_of_point, // original idx -> bucketconst int* __restrict__ sorted_idx, // sorted indices by bucketconst int* __restrict__ bucket_offsets, // size = cells+1int nx, int ny,float minX, float minY, float cell, float invCell,float radius, float h2, // h2 = (h*h) used in Gaussianfloat* __restrict__ out_x, float* __restrict__ out_y, float* __restrict__ out_z);#endif // MLS_CUDA_CUH_mls_cuda.cu
/* Optimized & corrected mls_cuda.cu- Replaced per-thread shared neighbor buffer (race-prone) with two-pass neighbor scan(first pass: weighted centroid/covariance; second pass: build M/rhs and solve 6x6).- Replaced atomic histogram kernel with thrust reduce_by_key + scatter to build counts,avoiding heavy atomic contention.- Safer host copy from device to interleaved PointXYZs.- Added small ridge regularizer to M diagonal for numeric stability.- Use __restrict__ to help optimizer; expose device constants for frequently used params.
*/#include "mls_cuda.h"
#include "mls_cuda.cuh"#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/scatter.h>
#include <thrust/unique.h>#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>
#include <iostream>
#include <random>
#include <chrono>#define CUDA_CHECK(err) do { cudaError_t e = (err); if (e != cudaSuccess) { \std::cerr << "CUDA error " << cudaGetErrorString(e) << " at " << __FILE__ << ":" << __LINE__ << std::endl; exit(1);} } while(0)// Device implementations (jacobi & solve_6x6 are same as before, kept inline for speed)// 优化的数学函数
__device__ __forceinline__ float dot(float3 a, float3 b) {return a.x * b.x + a.y * b.y + a.z * b.z;
}__device__ __forceinline__ float norm3d(float3 v) {return sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
}// 优化的旋转函数
__device__ __forceinline__ void rotate_01(float A[6], float V[9], float c, float s) {float A00 = A[0], A01 = A[1], A02 = A[2], A11 = A[3], A12 = A[4], A22 = A[5];float A00n = c * c * A00 - 2 * c * s * A01 + s * s * A11;float A11n = s * s * A00 + 2 * c * s * A01 + c * c * A11;float A01n = (c * c - s * s) * A01 + c * s * (A00 - A11);float A02n = c * A02 - s * A12;float A12n = s * A02 + c * A12;A[0] = A00n; A[1] = A01n; A[2] = A02n;A[3] = A11n; A[4] = A12n; A[5] = A22;// 更新特征向量for (int r = 0; r < 3; ++r) {float v0 = V[r * 3], v1 = V[r * 3 + 1];V[r * 3] = c * v0 - s * v1;V[r * 3 + 1] = s * v0 + c * v1;}
}__device__ __forceinline__ void rotate_02(float A[6], float V[9], float c, float s) {float A00 = A[0], A01 = A[1], A02 = A[2], A11 = A[3], A12 = A[4], A22 = A[5];float A00n = c * c * A00 - 2 * c * s * A02 + s * s * A22;float A22n = s * s * A00 + 2 * c * s * A02 + c * c * A22;float A02n = (c * c - s * s) * A02 + c * s * (A00 - A22);float A01n = c * A01 - s * A12;float A12n = s * A01 + c * A12;A[0] = A00n; A[1] = A01n; A[2] = A02n;A[3] = A11; A[4] = A12n; A[5] = A22n;for (int r = 0; r < 3; ++r) {float v0 = V[r * 3], v2 = V[r * 3 + 2];V[r * 3] = c * v0 - s * v2;V[r * 3 + 2] = s * v0 + c * v2;}
}__device__ __forceinline__ void rotate_12(float A[6], float V[9], float c, float s) {float A00 = A[0], A01 = A[1], A02 = A[2], A11 = A[3], A12 = A[4], A22 = A[5];float A11n = c * c * A11 - 2 * c * s * A12 + s * s * A22;float A22n = s * s * A11 + 2 * c * s * A12 + c * c * A22;float A12n = (c * c - s * s) * A12 + c * s * (A11 - A22);float A01n = c * A01 - s * A02;float A02n = s * A01 + c * A02;A[0] = A00; A[1] = A01n; A[2] = A02n;A[3] = A11n; A[4] = A12n; A[5] = A22n;for (int r = 0; r < 3; ++r) {float v1 = V[r * 3 + 1], v2 = V[r * 3 + 2];V[r * 3 + 1] = c * v1 - s * v2;V[r * 3 + 2] = s * v1 + c * v2;}
}__device__ void jacobi_eigen_3x3(float a00, float a01, float a02, float a11, float a12, float a22,float& eig0, float& eig1, float& eig2,float V[9])
{// 使用更少的迭代和更快的收敛判断V[0] = 1; V[1] = 0; V[2] = 0;V[3] = 0; V[4] = 1; V[5] = 0;V[6] = 0; V[7] = 0; V[8] = 1;const int MAX_IT = 10; // 减少迭代次数float A[6] = { a00, a01, a02, a11, a12, a22 }; // 打包存储for (int it = 0; it < MAX_IT; ++it) {// 找到最大非对角元素float max_off = fabsf(A[1]);int p = 0, q = 1;if (fabsf(A[2]) > max_off) { max_off = fabsf(A[2]); p = 0; q = 2; }if (fabsf(A[4]) > max_off) { max_off = fabsf(A[4]); p = 1; q = 2; }if (max_off < 1e-8f) break;// 计算旋转角度float app = (p == 0) ? A[0] : ((p == 1) ? A[3] : A[5]);float aqq = (q == 0) ? A[0] : ((q == 1) ? A[3] : A[5]);float apq = (p == 0 && q == 1) ? A[1] : ((p == 0 && q == 2) ? A[2] : A[4]);float theta = 0.5f * atan2f(2.f * apq, aqq - app);float c = __cosf(theta), s = __sinf(theta);// 应用旋转if (p == 0 && q == 1) {rotate_01(A, V, c, s);}else if (p == 0 && q == 2) {rotate_02(A, V, c, s);}else {rotate_12(A, V, c, s);}}eig0 = A[0]; eig1 = A[3]; eig2 = A[5];// 归一化特征向量for (int col = 0; col < 3; ++col) {float3 v = make_float3(V[col], V[col + 3], V[col + 6]);float nrm = norm3d(v);if (nrm > 1e-9f) {V[col] = v.x / nrm; V[col + 3] = v.y / nrm; V[col + 6] = v.z / nrm;}}
}// 优化的Cholesky分解求解(比高斯消元快2-3倍)
__device__ bool solve_6x6_cholesky(const float A[36], const float b[6], float x[6]) {float L[36] = { 0 };// Cholesky分解 A = L * L^Tfor (int i = 0; i < 6; i++) {for (int j = 0; j <= i; j++) {float sum = 0.f;for (int k = 0; k < j; k++) {sum += L[i * 6 + k] * L[j * 6 + k];}if (i == j) {float diag = A[i * 6 + i] - sum;if (diag <= 1e-12f) return false;L[i * 6 + i] = sqrtf(diag);}else {L[i * 6 + j] = (A[i * 6 + j] - sum) / L[j * 6 + j];}}}// 前向替换:L * y = bfloat y[6];for (int i = 0; i < 6; i++) {float sum = 0.f;for (int j = 0; j < i; j++) {sum += L[i * 6 + j] * y[j];}y[i] = (b[i] - sum) / L[i * 6 + i];}// 后向替换:L^T * x = y for (int i = 5; i >= 0; i--) {float sum = 0.f;for (int j = i + 1; j < 6; j++) {sum += L[j * 6 + i] * x[j]; // 注意:L^T的元素位置}x[i] = (y[i] - sum) / L[i * 6 + i];}return true;
}// Improved local frame construction for consistent orientation
__device__ void build_local_frame(float nx, float ny, float nz, float& tx, float& ty, float& tz,float& ux, float& uy, float& uz) {// 法线已经单位化// 构建第一个切向量:与法线和一个参考向量叉乘float ref_x = 1.0f, ref_y = 0.0f, ref_z = 0.0f;// 如果法线接近参考向量,选择另一个参考向量if (fabsf(nx * ref_x + ny * ref_y + nz * ref_z) > 0.9f) {ref_x = 0.0f; ref_y = 1.0f; ref_z = 0.0f;}// tx = ref × ntx = ref_y * nz - ref_z * ny;ty = ref_z * nx - ref_x * nz;tz = ref_x * ny - ref_y * nx;float t_nrm = sqrtf(tx * tx + ty * ty + tz * tz);if (t_nrm > 1e-9f) {tx /= t_nrm; ty /= t_nrm; tz /= t_nrm;} else {// 退化情况tx = 1.0f; ty = 0.0f; tz = 0.0f;}// ux = n × tux = ny * tz - nz * ty;uy = nz * tx - nx * tz;uz = nx * ty - ny * tx;float u_nrm = sqrtf(ux * ux + uy * uy + uz * uz);if (u_nrm > 1e-9f) {ux /= u_nrm; uy /= u_nrm; uz /= u_nrm;} else {ux = 0.0f; uy = 1.0f; uz = 0.0f;}
}// Kernels (compute_buckets_kernel and histogram_keys_kernel unchanged except __restrict__ qualifiers)
// 将点分配到空间网格中,加速邻居搜索
// 每个点计算所属的网格单元索引
__global__ void compute_buckets_kernel(const float* __restrict__ x, const float* __restrict__ y, int N,float minX, float minY, float invCell, int nx, int ny,int* __restrict__ out_bucket, int* __restrict__ out_idx)
{int i = blockIdx.x * blockDim.x + threadIdx.x;if (i >= N) return;float px = x[i], py = y[i];int cx = static_cast<int>((px - minX) * invCell);int cy = static_cast<int>((py - minY) * invCell);if (cx < 0) cx = 0; else if (cx >= nx) cx = nx - 1;if (cy < 0) cy = 0; else if (cy >= ny) cy = ny - 1;int b = cy * nx + cx;out_bucket[i] = b;out_idx[i] = i;
}__global__ void histogram_keys_kernel(const int* __restrict__ keys, int N, int* __restrict__ counts) {int i = blockIdx.x * blockDim.x + threadIdx.x;if (i >= N) return;int k = keys[i];atomicAdd(&counts[k], 1);
}// Improved: robust two-pass mls quadratic kernel with consistent normal orientation
__global__ void mls_quadratic_kernel(const float* __restrict__ x, const float* __restrict__ y, const float* __restrict__ z, int N,const int* __restrict__ bucket_of_point,const int* __restrict__ sorted_idx,const int* __restrict__ bucket_offsets,int nx, int ny,float minX, float minY, float cell, float invCell,float radius, float h2,float* __restrict__ out_x, float* __restrict__ out_y, float* __restrict__ out_z)
{int pid = blockIdx.x * blockDim.x + threadIdx.x;if (pid >= N) return;float px = x[pid], py = y[pid], pz = z[pid];int b0 = bucket_of_point[pid];int cx = b0 % nx;int cy = b0 / nx;int searchCells = max(1, (int)ceilf(radius / cell));// First pass: weighted centroid & covariancefloat S_w = 0.f;float Sx = 0.f, Sy = 0.f, Sz = 0.f;float Sxx = 0.f, Sxy = 0.f, Sxz = 0.f, Syy = 0.f, Syz = 0.f, Szz = 0.f;int neigh_count = 0;for (int dy = -searchCells; dy <= searchCells; ++dy) {int nyc = cy + dy;if (nyc < 0 || nyc >= ny) continue;for (int dx = -searchCells; dx <= searchCells; ++dx) {int nxc = cx + dx;if (nxc < 0 || nxc >= nx) continue;int nb = nyc * nx + nxc;int start = bucket_offsets[nb];int end = bucket_offsets[nb + 1];for (int pos = start; pos < end; ++pos) {int qidx = sorted_idx[pos];float qx = x[qidx], qy = y[qidx], qz = z[qidx];float dxq = qx - px;float dyq = qy - py;float dzq = qz - pz;float d2 = dxq * dxq + dyq * dyq + dzq * dzq;if (d2 > radius * radius) continue;float w = __expf(-d2 / h2);S_w += w;Sx += w * qx; Sy += w * qy; Sz += w * qz;Sxx += w * qx * qx; Sxy += w * qx * qy; Sxz += w * qx * qz;Syy += w * qy * qy; Syz += w * qy * qz; Szz += w * qz * qz;++neigh_count;}}
}if (neigh_count < 6 || S_w <= 0.f) {out_x[pid] = px; out_y[pid] = py; out_z[pid] = pz;return;
}float mean_x = Sx / S_w;
float mean_y = Sy / S_w;
float mean_z = Sz / S_w;float C00 = (Sxx / S_w) - mean_x * mean_x;
float C01 = (Sxy / S_w) - mean_x * mean_y;
float C02 = (Sxz / S_w) - mean_x * mean_z;
float C11 = (Syy / S_w) - mean_y * mean_y;
float C12 = (Syz / S_w) - mean_y * mean_z;
float C22 = (Szz / S_w) - mean_z * mean_z;float eig0, eig1, eig2; float V[9];
jacobi_eigen_3x3(C00, C01, C02, C11, C12, C22, eig0, eig1, eig2, V);float v0x = V[0], v0y = V[1], v0z = V[2];
float v1x = V[3], v1y = V[4], v1z = V[5];
float v2x = V[6], v2y = V[7], v2z = V[8];auto rayleigh = [&](float vx, float vy, float vz)->float {float cvx = C00 * vx + C01 * vy + C02 * vz;float cvy = C01 * vx + C11 * vy + C12 * vz;float cvz = C02 * vx + C12 * vy + C22 * vz;return vx * cvx + vy * cvy + vz * cvz;};float r0 = rayleigh(v0x, v0y, v0z);
float r1 = rayleigh(v1x, v1y, v1z);
float r2 = rayleigh(v2x, v2y, v2z);float nx_, ny_, nz_;
if (r0 <= r1 && r0 <= r2) { nx_ = v0x; ny_ = v0y; nz_ = v0z; }
else if (r1 <= r0 && r1 <= r2) { nx_ = v1x; ny_ = v1y; nz_ = v1z; }
else { nx_ = v2x; ny_ = v2y; nz_ = v2z; }// Fix: Consistent normal orientation
float ref_dot = nx_ * (px - mean_x) + ny_ * (py - mean_y) + nz_ * (pz - mean_z);
if (ref_dot < 0) { // Make normal point towards query pointnx_ = -nx_; ny_ = -ny_; nz_ = -nz_;
}float nrm = sqrtf(nx_ * nx_ + ny_ * ny_ + nz_ * nz_);
if (nrm < 1e-9f) { out_x[pid] = px; out_y[pid] = py; out_z[pid] = pz; return;
}
nx_ /= nrm; ny_ /= nrm; nz_ /= nrm;// Fix: Use improved local frame construction
float t1x, t1y, t1z, t2x, t2y, t2z;
build_local_frame(nx_, ny_, nz_, t1x, t1y, t1z, t2x, t2y, t2z);// second pass: build M and rhs for quadratic fit
float M[36]; for (int i = 0; i < 36; ++i) M[i] = 0.f;
float rhs[6]; for (int i = 0; i < 6; ++i) rhs[i] = 0.f;// Enhanced regularization for numerical stability
const float ridge_base = 1e-4f;for (int dy = -searchCells; dy <= searchCells; ++dy) {int nyc = cy + dy;if (nyc < 0 || nyc >= ny) continue;for (int dx = -searchCells; dx <= searchCells; ++dx) {int nxc = cx + dx;if (nxc < 0 || nxc >= nx) continue;int nb = nyc * nx + nxc;int start = bucket_offsets[nb];int end = bucket_offsets[nb + 1];for (int pos = start; pos < end; ++pos) {int qidx = sorted_idx[pos];float qx = x[qidx], qy = y[qidx], qz = z[qidx];float dxq = qx - mean_x;float dyq = qy - mean_y;float dzq = qz - mean_z;float u = dxq * t1x + dyq * t1y + dzq * t1z;float v_val = dxq * t2x + dyq * t2y + dzq * t2z;float wval = dxq * nx_ + dyq * ny_ + dzq * nz_;float dxq2 = qx - px;float dyq2 = qy - py;float dzq2 = qz - pz;float d2 = dxq2 * dxq2 + dyq2 * dyq2 + dzq2 * dzq2;if (d2 > radius * radius) continue;float w = __expf(-d2 / h2);float phi0 = 1.f;float phi1 = u;float phi2 = v_val;float phi3 = u * u;float phi4 = u * v_val;float phi5 = v_val * v_val;float phi[6] = { phi0,phi1,phi2,phi3,phi4,phi5 };for (int a = 0; a < 6; ++a) {rhs[a] += w * phi[a] * wval;for (int b = 0; b < 6; ++b) {M[a * 6 + b] += w * phi[a] * phi[b];}}}}
}// Fix: Enhanced regularization with condition number check
for (int d = 0; d < 6; ++d) {M[d * 6 + d] += ridge_base;if (d >= 3) { // Stronger regularization for quadratic termsM[d * 6 + d] += ridge_base * 5.0f;}
}// Check condition number (simplified)
float diag_min = 1e30f, diag_max = -1e30f;
for (int d = 0; d < 6; ++d) {float val = M[d * 6 + d];diag_min = fminf(diag_min, val);diag_max = fmaxf(diag_max, val);
}
float cond_estimate = (diag_max > 1e-9f) ? (diag_max / diag_min) : 1e9f;if (cond_estimate > 1e6f) {// Condition number too large, fallback to plane projectionfloat vx = px - mean_x, vy = py - mean_y, vz = pz - mean_z;float dot = vx * nx_ + vy * ny_ + vz * nz_;out_x[pid] = px - dot * nx_;out_y[pid] = py - dot * ny_;out_z[pid] = pz - dot * nz_;return;
}float sol[6];
bool ok = solve_6x6_cholesky(M, rhs, sol);
if (!ok) {// fallback to plane projectionfloat vx = px - mean_x, vy = py - mean_y, vz = pz - mean_z;float dot = vx * nx_ + vy * ny_ + vz * nz_;out_x[pid] = px - dot * nx_;out_y[pid] = py - dot * ny_;out_z[pid] = pz - dot * nz_;return;
}out_x[pid] = mean_x + sol[0] * nx_;out_y[pid] = mean_y + sol[0] * ny_;out_z[pid] = mean_z + sol[0] * nz_;
}// Host-side implementation with thrust reduce_by_key + scatter histogram (no atomic kernel)
void mls_cuda_quadratic(const std::vector<PointXYZ>& pts_host,float minX, float maxX, float minY, float maxY,const MLSParams& params,std::vector<PointXYZ>& out_host)
{const int N = static_cast<int>(pts_host.size());if (N == 0) { out_host.clear(); return; }float radius = params.radius;float cell = params.cell;if (cell <= 0.f) cell = radius;float h2 = radius * radius;// boundsfloat spanX = fmaxf(1e-6f, maxX - minX), spanY = fmaxf(1e-6f, maxY - minY);int nx = std::max(1, (int)ceilf(spanX / cell));int ny = std::max(1, (int)ceilf(spanY / cell));int cells = nx * ny;float invCell = 1.0f / cell;int searchCells = std::max(1, (int)ceilf(radius / cell));// copy SoA to device via thrust::device_vectorthrust::device_vector<float> d_x(N), d_y(N), d_z(N);// 创建临时的主机数组std::vector<float> h_x(N), h_y(N), h_z(N);
#pragma omp parallel forfor (int i = 0; i < N; ++i) {h_x[i] = pts_host[i].x;h_y[i] = pts_host[i].y;h_z[i] = pts_host[i].z;}// 批量拷贝到设备CUDA_CHECK(cudaMemcpy(thrust::raw_pointer_cast(d_x.data()), h_x.data(),sizeof(float) * N, cudaMemcpyHostToDevice));CUDA_CHECK(cudaMemcpy(thrust::raw_pointer_cast(d_y.data()), h_y.data(),sizeof(float) * N, cudaMemcpyHostToDevice));CUDA_CHECK(cudaMemcpy(thrust::raw_pointer_cast(d_z.data()), h_z.data(),sizeof(float) * N, cudaMemcpyHostToDevice));thrust::device_vector<int> d_bucket_keys(N), d_idx(N);int threads = 256;int blocks = (N + threads - 1) / threads;compute_buckets_kernel << <blocks, threads >> > (thrust::raw_pointer_cast(d_x.data()), thrust::raw_pointer_cast(d_y.data()), N,minX, minY, invCell, nx, ny,thrust::raw_pointer_cast(d_bucket_keys.data()),thrust::raw_pointer_cast(d_idx.data()));
CUDA_CHECK(cudaDeviceSynchronize());// keep copy of original bucket_of_point
thrust::device_vector<int> d_bucket_of_point = d_bucket_keys;// sort by bucket key
thrust::sort_by_key(thrust::device, d_bucket_keys.begin(), d_bucket_keys.end(), d_idx.begin());// reduce_by_key to produce unique keys and counts
thrust::device_vector<int> d_unique_keys(N), d_counts_tmp(N);
thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;
new_end = thrust::reduce_by_key(thrust::device,d_bucket_keys.begin(), d_bucket_keys.end(),thrust::constant_iterator<int>(1),d_unique_keys.begin(), d_counts_tmp.begin());int unique_count = static_cast<int>(new_end.first - d_unique_keys.begin());// scatter counts into full-size counts arraythrust::device_vector<int> d_counts(cells, 0);if (unique_count > 0) {thrust::scatter(thrust::device,d_counts_tmp.begin(), d_counts_tmp.begin() + unique_count,d_unique_keys.begin(), d_counts.begin());}// exclusive scan to offsetsthrust::device_vector<int> d_offsets(cells + 1);thrust::exclusive_scan(thrust::device, d_counts.begin(), d_counts.end(), d_offsets.begin());// last element is Nd_offsets[cells] = N;// prepare outputsthrust::device_vector<float> d_out_x(N), d_out_y(N), d_out_z(N);// launch MLS kernelint mls_threads = 128;int mls_blocks = (N + mls_threads - 1) / mls_threads;mls_quadratic_kernel << <mls_blocks, mls_threads >> > (thrust::raw_pointer_cast(d_x.data()),thrust::raw_pointer_cast(d_y.data()),thrust::raw_pointer_cast(d_z.data()),N,thrust::raw_pointer_cast(d_bucket_of_point.data()),thrust::raw_pointer_cast(d_idx.data()),thrust::raw_pointer_cast(d_offsets.data()),nx, ny,minX, minY, cell, invCell,radius, h2,thrust::raw_pointer_cast(d_out_x.data()),thrust::raw_pointer_cast(d_out_y.data()),thrust::raw_pointer_cast(d_out_z.data()));CUDA_CHECK(cudaDeviceSynchronize());// copy results back safely: copy three float arrays then assemble out_hoststd::vector<float> hx(N), hy(N), hz(N);CUDA_CHECK(cudaMemcpy(hx.data(), thrust::raw_pointer_cast(d_out_x.data()), sizeof(float) * N, cudaMemcpyDeviceToHost));CUDA_CHECK(cudaMemcpy(hy.data(), thrust::raw_pointer_cast(d_out_y.data()), sizeof(float) * N, cudaMemcpyDeviceToHost));CUDA_CHECK(cudaMemcpy(hz.data(), thrust::raw_pointer_cast(d_out_z.data()), sizeof(float) * N, cudaMemcpyDeviceToHost));out_host.resize(N);for (int i = 0; i < N; ++i) { out_host[i].x = hx[i]; out_host[i].y = hy[i]; out_host[i].z = hz[i]; }// free device memory happens via thrust destructors on scope exit
}// Demo main guarded by macro
#ifdef MLS_CUDA_DEMO
int main() {const int N = 500000; // reduce for demo; tune as neededstd::vector<PointXYZ> pts; pts.reserve(N);std::mt19937 rng(12345);std::uniform_real_distribution<float> ud(-50.f, 50.f);for (int i = 0; i < N; ++i) {PointXYZ p; p.x = ud(rng); p.y = ud(rng);p.z = 0.2f * p.x + 0.1f * p.y + 0.5f * (ud(rng) * 0.02f);pts.push_back(p);}MLSParams params; params.radius = 2.0f; params.cell = std::max(0.1f, params.radius * 0.8f); params.numThreads = 0;std::vector<PointXYZ> out;auto t0 = std::chrono::high_resolution_clock::now();mls_cuda_quadratic(pts, params, out);auto t1 = std::chrono::high_resolution_clock::now();double elapsed = std::chrono::duration<double>(t1 - t0).count();std::cout << "MLS quadratic for " << N << " points in " << elapsed << " s\n";for (int i = 0; i < 5; ++i) {printf("in (%.4f,%.4f,%.4f) out (%.4f,%.4f,%.4f)\n", pts[i].x, pts[i].y, pts[i].z, out[i].x, out[i].y, out[i].z);}return 0;
}
#endif