机器学习例题——预测Facebook签到位置(K近邻算法)和葡萄酒质量预测(线性回归)
一、预测Facebook签到位置
代码展示:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score# 1.获取数据集
# Read the raw check-in table and report its size.
df = pd.read_csv("./data/train.csv", encoding="utf-8")
print(df.shape)

# 2. Basic data processing
# 2.1 Shrink the data range: keep only rows with 2 <= x <= 3 and 2 <= y <= 3
#     (Series.between is inclusive on both ends by default).
in_region = df['x'].between(2, 3) & df['y'].between(2, 3)
df_data = df.loc[in_region].copy()
print(df_data.shape)

# 2.2 Time feature: interpret `time` as seconds and keep the hour component
#     as a new column.
timestamps = pd.to_datetime(df_data['time'], unit='s')
df_data['hour'] = timestamps.dt.hour
print(df_data['hour'])

# 2.3 Drop places with fewer than 10 check-ins.
visit_counts = df_data['place_id'].value_counts()
rare_places = visit_counts.loc[visit_counts < 10].index
is_rare = df_data['place_id'].isin(rare_places)
df_data = df_data.loc[~is_rare]
print(df_data.shape)

# 2.4 Features are x, y and hour; the target is place_id.
feature_columns = ['x', 'y', 'hour']
X = df_data[feature_columns]
y = df_data['place_id']

# 2.5 Hold out 20% of the rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Feature engineering -- standardization.
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 4. Machine learning -- KNN with K = 5, scored by 5-fold cross-validation
#    on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(knn, X_train, y_train, cv=5)
print("交叉验证得分:", cv_scores)
print("平均交叉验证得分:", cv_scores.mean())

# Fit the final model on the full training split.
knn.fit(X_train, y_train)

# 5. Model evaluation on the held-out test split.
y_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("测试集上的准确率:", test_accuracy)
结果展示:
交叉验证得分: [0.38221957 0.38024076 0.38277611 0.37953993 0.38213712]
平均交叉验证得分: 0.3813826936554397
测试集上的准确率: 0.38908035552330855
二、葡萄酒质量预测
from sklearn.datasets import load_wine
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import joblib# 1.获取数据集
# Load scikit-learn's bundled wine dataset (features in .data, labels in
# .target).
df = load_wine()

# 2. Basic data processing: hold out 20% of the samples as a test set.
x_train, x_test, y_train, y_test = train_test_split(
    df.data, df.target, test_size=0.2, random_state=20
)

# 3. Feature engineering: standardize, fitting the scaler on the training
#    split only.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4. Machine learning - normal equation (ordinary least squares).
lr = LinearRegression()
lr.fit(x_train, y_train)

# Persist the fitted model and load it back to demonstrate saving/restoring.
# NOTE: joblib.load unpickles the file; only load files you trust.
joblib.dump(lr, "./test.pkl")
lr = joblib.load("./test.pkl")

# 5. Prediction and evaluation - normal equation.
lr_predict = lr.predict(x_test)
lr_mse = mean_squared_error(y_test, lr_predict)
print("正规方程的均方误差:", lr_mse)

# 4. Machine learning - stochastic gradient descent.
# No random_state is passed, so the fitted weights (and the MSE below) may
# differ slightly from run to run.
sgd = SGDRegressor()
sgd.fit(x_train, y_train)

# 5. Prediction and evaluation - gradient descent.
sgd_predict = sgd.predict(x_test)
sgd_mse = mean_squared_error(y_test, sgd_predict)
print("梯度下降法的均方误差:", sgd_mse)
结果展示:
正规方程的均方误差: 0.06709703764885735
梯度下降法的均方误差: 0.06637844373293354