-- Mean squared error (MSE): the average of the squared residuals, where a
-- residual is the observed y minus the value returned by predict_sales().
SELECT
    AVG(POWER(residual, 2))
FROM (
    SELECT
        y - predict_sales(x1, x2, x3) AS residual
    FROM regression_data
) AS residuals
R²
-- Coefficient of determination (R²): 1 - SS_res / SS_tot.
-- Reads regression_data and calls predict_sales(x1, x2, x3) for the fitted value.
WITH predictions AS (
    SELECT
        y,
        predict_sales(x1, x2, x3) AS y_hat,
        -- Window average puts the overall mean of y on every row, so the outer
        -- query needs no scalar subquery.
        AVG(y) OVER () AS y_mean
    FROM regression_data
)
SELECT
    -- NULLIF yields NULL instead of a division-by-zero error when y is constant
    -- (total sum of squares = 0), where R² is undefined.
    1 - SUM(POWER(y - y_hat, 2)) / NULLIF(SUM(POWER(y - y_mean, 2)), 0) AS r_squared
FROM predictions
均方误差(MSE)公式:$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$
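R² 公式:$R^2 = 1 - \dfrac{\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}{\sum_{i=1}^{n}\left(y_i - \bar{y}\right)^2}$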
2. 逻辑回归混淆矩阵
-- Build the confusion matrix for the logistic-regression churn model.
-- churn_status is used as a boolean (TRUE = the customer actually churned).
WITH standardized_churn AS (
    -- Z-score the two continuous features over the whole table; service_score
    -- is passed through unscaled. NULLIF turns a zero standard deviation
    -- (constant feature) into NULL instead of raising division by zero.
    SELECT
        user_id,
        (registration_days - AVG(registration_days) OVER ())
            / NULLIF(STDDEV(registration_days) OVER (), 0) AS reg_days_std,
        (monthly_usage - AVG(monthly_usage) OVER ())
            / NULLIF(STDDEV(monthly_usage) OVER (), 0) AS usage_std,
        service_score,
        churn_status
    FROM churn_data
),
churn_scores AS (
    -- Logistic function applied to the linear predictor
    -- beta0 + beta1 * reg_days_std + beta2 * usage_std + beta3 * service_score.
    -- Each model_parameters join picks one named coefficient; if any of the
    -- four is missing, the inner joins leave no rows to score.
    SELECT
        sc.user_id,
        sc.churn_status,
        1 / (1 + EXP(-(
            mp1.param_value
            + mp2.param_value * sc.reg_days_std
            + mp3.param_value * sc.usage_std
            + mp4.param_value * sc.service_score
        ))) AS churn_probability
    FROM standardized_churn AS sc
    INNER JOIN model_parameters AS mp1
        ON mp1.model_name = 'logistic_regression'
        AND mp1.param_name = 'beta0'
    INNER JOIN model_parameters AS mp2
        ON mp2.model_name = 'logistic_regression'
        AND mp2.param_name = 'beta1'
    INNER JOIN model_parameters AS mp3
        ON mp3.model_name = 'logistic_regression'
        AND mp3.param_name = 'beta2'
    INNER JOIN model_parameters AS mp4
        ON mp4.model_name = 'logistic_regression'
        AND mp4.param_name = 'beta3'
),
predictions AS (
    -- Classify at the 0.5 threshold. The CASE maps a NULL probability to
    -- FALSE, so such rows count as "predicted not to churn".
    SELECT
        churn_status AS actual,
        CASE WHEN churn_probability >= 0.5 THEN TRUE ELSE FALSE END AS predicted
    FROM churn_scores
)
SELECT
    -- COUNT(*) FILTER returns 0 (not NULL) when no row matches.
    COUNT(*) FILTER (WHERE actual AND predicted)         AS true_positive,
    COUNT(*) FILTER (WHERE actual AND NOT predicted)     AS false_negative,
    COUNT(*) FILTER (WHERE NOT actual AND predicted)     AS false_positive,
    COUNT(*) FILTER (WHERE NOT actual AND NOT predicted) AS true_negative
FROM predictions;
6.2.6 案例实战:客户流失预测
1. 数据概况
样本量:5000条客户记录
特征分布:
| 特征 | 均值 | 标准差 | 最小值 | 最大值 |
| --- | --- | --- | --- | --- |
| 注册时长 | 125天 | 45天 | 10 | 365 |
| 月均使用时长 | 18.5小时 | 5.2小时 | 2 | 40 |
| 服务评分 | 3.2分 | 1.1分 | 1 | 5 |
流失率:18%
2. 模型训练结果
| 参数 | 估计值 | 标准误差 | z值 | p值 |
| --- | --- | --- | --- | --- |
| 截距(β0) | -1.23 | 0.15 | -8.2 | <0.001 |
| 注册时长(β1) | -0.85 | 0.08 | -10.6 | <0.001 |
| 使用时长(β2) | 1.52 | 0.12 | 12.7 | <0.001 |
| 服务评分(β3) | 0.98 | 0.09 | 10.9 | <0.001 |
3. 可视化评估(ROC曲线)
6.2.7 最佳实践与性能优化
1. 数据划分策略
-- Split churn_data into training and test sets at roughly 7:3.
-- RANDOM() is drawn per row, so the ratio is approximate and the assignment
-- changes between runs; call SELECT setseed(<value>) earlier in the same
-- session if a repeatable split is needed.

-- Temporary table that tags every row with its dataset assignment.
CREATE TEMPORARY TABLE train_test_split_temp AS
SELECT
    *,
    CASE WHEN RANDOM() < 0.7 THEN 'train' ELSE 'test' END AS dataset
FROM churn_data;

-- Training set. CREATE TABLE AS is used instead of SELECT ... INTO because
-- SELECT INTO means "assign to variables" inside PL/pgSQL.
-- SELECT * is deliberate: every churn_data column plus the dataset tag is copied.
CREATE TABLE train_data AS
SELECT *
FROM train_test_split_temp
WHERE dataset = 'train';

-- Test set.
CREATE TABLE test_data AS
SELECT *
FROM train_test_split_temp
WHERE dataset = 'test';

-- Drop the temporary table.
DROP TABLE train_test_split_temp;
2. 存储过程优化技巧
批量处理:使用 SET 调整会话级参数(如 work_mem、synchronous_commit),并将多条语句放入同一事务中执行,以提高事务处理效率
索引优化:对特征列创建索引加速数据访问
并行计算:利用PostgreSQL 12+的并行聚合功能提升计算速度
3. 与外部工具集成
-- Export the logistic-regression parameters to CSV for visualization tools.
-- COPY ... TO '<file>' writes on the database server and needs the matching
-- server-side file privilege; from psql, \copy writes on the client instead.
COPY (
    SELECT
        param_name,
        param_value
    FROM model_parameters
    WHERE model_name = 'logistic_regression'
    -- Fixed row order so repeated exports produce identical files.
    ORDER BY param_name
) TO '/tmp/model_params.csv' WITH (FORMAT csv, HEADER);