协同过滤实现电影推荐
以下是一个基于用户和物品的协同过滤算法实现电影推荐系统的Python示例。该示例包含数据处理、相似度计算和推荐生成等核心功能。
这个实现包含以下核心功能:
- 数据处理:构建用户-物品评分矩阵,未评分项以 0 填充(当前使用稠密 NumPy 数组存储,尚未采用稀疏矩阵格式)
- 相似度计算:
  - 基于用户的协同过滤:计算用户间的余弦相似度
  - 基于物品的协同过滤:计算物品间的余弦相似度
- 推荐生成:
  - 基于用户的推荐:找到相似用户并加权汇总他们的评分
  - 基于物品的推荐:根据用户历史评分物品找到相似物品
- 结果展示:支持关联电影元数据(名称、类型等)
使用时只需创建 `MovieRecommender` 对象,调用 `fit()` 方法训练模型,然后使用 `recommend()` 方法为指定用户生成推荐。代码中包含了示例数据和使用方式,方便理解和测试。
你可以根据实际需求扩展此代码,例如添加降维技术(SVD)处理大规模数据,或使用更复杂的相似度计算方法。
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict


class MovieRecommender:
    """Memory-based collaborative-filtering recommender for movies.

    The model is a dense user x item rating matrix plus a cosine-similarity
    matrix computed either between users (``similarity_method='user'``) or
    between items (any other value). Unrated cells are stored as ``0`` and a
    rating counts as "present" only when it is ``> 0``.

    Attributes set by ``fit``:
        user_item_matrix: 2-D float array, rows are users, columns are items.
        similarity_matrix: square cosine-similarity matrix with a zeroed
            diagonal (user x user or item x item depending on the method).
        user_id_mapping: external user id -> row index.
        item_id_mapping: external item id -> column index.
        item_id_inverse_mapping: column index -> external item id.
        movies_df: optional metadata frame passed to ``fit``.
    """

    def __init__(self, similarity_method='user'):
        """Create an unfitted recommender.

        :param similarity_method: ``'user'`` for user-based filtering; any
            other value selects item-based filtering.
        """
        self.similarity_method = similarity_method
        self.user_item_matrix = None
        self.similarity_matrix = None
        self.user_id_mapping = None
        self.item_id_mapping = None
        self.item_id_inverse_mapping = None
        self.movies_df = None

    def fit(self, ratings_df, movies_df=None):
        """Build the rating matrix and the similarity matrix.

        :param ratings_df: DataFrame with ``user_id``, ``item_id`` and
            ``rating`` columns.
        :param movies_df: optional DataFrame with an ``item_id`` column plus
            descriptive columns (e.g. ``title``); stored as-is and only used
            by ``get_recommendation_details``.
        """
        self._build_user_item_matrix(ratings_df)

        if self.similarity_method == 'user':
            self._compute_user_similarity()
        else:
            self._compute_item_similarity()

        self.movies_df = movies_df

    def _build_user_item_matrix(self, ratings_df):
        """Build the id mappings and the dense user x item rating matrix.

        Row and column indices follow the order of first appearance of each
        id in ``ratings_df``. If a (user, item) pair occurs more than once,
        the last rating wins.
        """
        unique_users = ratings_df['user_id'].unique()
        unique_items = ratings_df['item_id'].unique()

        self.user_id_mapping = {user_id: idx for idx, user_id in enumerate(unique_users)}
        self.item_id_mapping = {item_id: idx for idx, item_id in enumerate(unique_items)}
        self.item_id_inverse_mapping = {idx: item_id for idx, item_id in enumerate(unique_items)}

        self.user_item_matrix = np.zeros((len(unique_users), len(unique_items)))

        # Drop earlier duplicates explicitly so that "last rating wins" does
        # not depend on how NumPy resolves repeated indices in an assignment.
        latest = ratings_df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')

        # Vectorised id -> index lookup; replaces a per-row iterrows() loop.
        row_idx = pd.Index(unique_users).get_indexer(latest['user_id'])
        col_idx = pd.Index(unique_items).get_indexer(latest['item_id'])
        self.user_item_matrix[row_idx, col_idx] = latest['rating'].to_numpy()

    def _compute_user_similarity(self):
        """Compute the user x user cosine-similarity matrix."""
        # Replace any NaN ratings with 0 before computing similarities.
        matrix_for_sim = np.nan_to_num(self.user_item_matrix)
        self.similarity_matrix = cosine_similarity(matrix_for_sim)
        # Zero the diagonal so a user never counts as their own neighbour.
        np.fill_diagonal(self.similarity_matrix, 0)

    def _compute_item_similarity(self):
        """Compute the item x item cosine-similarity matrix."""
        # Items become rows so that similarities are taken between items.
        matrix_for_sim = np.nan_to_num(self.user_item_matrix.T)
        self.similarity_matrix = cosine_similarity(matrix_for_sim)
        # Zero the diagonal so an item never counts as similar to itself.
        np.fill_diagonal(self.similarity_matrix, 0)

    def recommend(self, user_id, n_recommendations=10, exclude_rated=True):
        """Return the top recommendations for ``user_id``.

        :param user_id: external user id as it appeared in the ratings data.
        :param n_recommendations: maximum number of items to return.
        :param exclude_rated: when true, items the user already rated are
            never returned, so the result may be shorter than
            ``n_recommendations``.
        :return: list of ``{'item_id': ..., 'score': ...}`` dicts, best first.
        :raises RuntimeError: if ``fit`` has not been called yet.
        :raises ValueError: if ``user_id`` was not present in the ratings.
        """
        if self.user_id_mapping is None:
            raise RuntimeError("模型尚未训练,请先调用 fit()")
        if user_id not in self.user_id_mapping:
            raise ValueError(f"用户ID {user_id} 不存在")

        user_idx = self.user_id_mapping[user_id]

        if self.similarity_method == 'user':
            return self._user_based_recommendation(user_idx, n_recommendations, exclude_rated)
        return self._item_based_recommendation(user_idx, n_recommendations, exclude_rated)

    def _rank_items(self, scores, n_recommendations):
        """Turn a per-item score vector into a ranked recommendation list.

        Items whose score is ``-inf`` (the marker used for excluded items)
        are dropped before truncating to ``n_recommendations``.
        """
        order = np.argsort(scores)[::-1]
        order = order[~np.isneginf(scores[order])][:n_recommendations]
        return [
            {'item_id': self.item_id_inverse_mapping[idx], 'score': scores[idx]}
            for idx in order
        ]

    def _user_based_recommendation(self, user_idx, n_recommendations, exclude_rated):
        """Score items by the similarity-weighted ratings of other users.

        The score of an item is the sum, over users with strictly positive
        similarity to the target user, of ``similarity * rating``. Scores are
        not normalised by the total similarity.
        """
        similarities = self.similarity_matrix[user_idx]
        user_ratings = self.user_item_matrix[user_idx]

        # Only users with positive similarity contribute; selecting their
        # rows keeps everyone else's ratings out of the sum entirely.
        neighbours = similarities > 0
        weighted_ratings = similarities[neighbours] @ self.user_item_matrix[neighbours]

        if exclude_rated:
            weighted_ratings[user_ratings > 0] = -np.inf

        return self._rank_items(weighted_ratings, n_recommendations)

    def _item_based_recommendation(self, user_idx, n_recommendations, exclude_rated):
        """Score items by their similarity to the items the user has rated.

        The score of an item is the sum, over the user's rated items, of
        ``rating * similarity(rated item, item)``.
        """
        user_ratings = self.user_item_matrix[user_idx]
        rated_item_indices = np.flatnonzero(user_ratings > 0)

        recommendation_scores = (
            user_ratings[rated_item_indices] @ self.similarity_matrix[rated_item_indices]
        )

        if exclude_rated:
            recommendation_scores[rated_item_indices] = -np.inf

        return self._rank_items(recommendation_scores, n_recommendations)

    def get_recommendation_details(self, recommendations):
        """Attach movie metadata to each recommendation when available.

        :param recommendations: list of dicts as returned by ``recommend``.
        :return: the input list unchanged when no metadata was supplied to
            ``fit``; otherwise a new list in which each entry is merged with
            the first metadata row whose ``item_id`` matches. Entries with
            no matching row are passed through as-is.
        """
        if self.movies_df is None:
            return recommendations

        details = []
        for rec in recommendations:
            matches = self.movies_df[self.movies_df['item_id'] == rec['item_id']].to_dict('records')
            # Metadata columns take precedence over same-named keys in rec.
            details.append({**rec, **matches[0]} if matches else rec)
        return details


# Example usage
if __name__ == "__main__":
    # Toy data set: five users, five movies, two ratings per user.
    ratings_df = pd.DataFrame({
        'user_id': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
        'item_id': [101, 102, 101, 103, 102, 104, 103, 104, 101, 105],
        'rating': [5, 4, 4, 5, 3, 5, 5, 4, 4, 5],
    })
    movies_df = pd.DataFrame({
        'item_id': [101, 102, 103, 104, 105],
        'title': ['肖申克的救赎', '泰坦尼克号', '阿甘正传', '盗梦空间', '星际穿越'],
        'genres': ['剧情', '爱情', '励志', '科幻', '科幻'],
    })

    # Run the same demo for both strategies: first user-based, then
    # item-based, each producing the top three recommendations for user 1.
    demo_runs = (
        ('user', "基于用户的推荐结果:"),
        ('item', "\n基于物品的推荐结果:"),
    )
    for method, heading in demo_runs:
        recommender = MovieRecommender(similarity_method=method)
        recommender.fit(ratings_df, movies_df)
        top_picks = recommender.recommend(1, n_recommendations=3)

        print(heading)
        for rec in recommender.get_recommendation_details(top_picks):
            print(f"电影: {rec['title']}, 类型: {rec['genres']}, 推荐得分: {rec['score']:.2f}")