针对文本聚类优化
优化TF-IDF特征工程
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF feature extractor used for the cluster-count search.
#   ngram_range=(1, 2)  -> unigrams and bigrams
#   max_features=5000   -> cap the vocabulary size
#   min_df=2            -> drop terms that occur in fewer than 2 documents
#   max_df=0.8          -> drop terms that occur in more than 80% of documents
#   token_pattern       -> also keep single-character tokens (the sklearn
#                          default pattern requires at least two characters)
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=2,
    max_df=0.8,
    token_pattern=r"\b\w+\b",
)
动态选择最佳簇数 n_clusters
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Vectorize the comments once so every candidate k is scored on the same matrix.
# NOTE(review): `comments_to_cluster` is not defined in this section -- it is
# assumed to be an iterable of comment strings provided earlier in the file.
X = tfidf.fit_transform(comments_to_cluster)

# Try k = 5..8 and keep the cluster count with the highest silhouette score.
best_k = 0
best_silhouette = -1
for k in range(5, 9):
    # n_init is set explicitly so the search uses the same KMeans settings as
    # the final model and does not depend on the version-specific default.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    if score > best_silhouette:
        best_silhouette = score
        best_k = k
改进聚类算法
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
# Full clustering pipeline: TF-IDF features -> L2 row normalization -> KMeans.
# NOTE(review): `jieba` is not imported in this section -- it is assumed to be
# imported earlier in the file.
kmeans_predictor = make_pipeline(
    TfidfVectorizer(
        tokenizer=jieba.lcut,
        # A custom tokenizer overrides token_pattern; passing None makes that
        # explicit and avoids sklearn's "token_pattern will not be used" warning.
        token_pattern=None,
        ngram_range=(1, 2),
        max_features=5000,
        min_df=2,
        max_df=0.8,
    ),
    Normalizer(norm="l2"),
    KMeans(n_clusters=best_k, random_state=42, n_init=10),
)
# Rows whose sentiment_category is 1 or 3 are the ones that get clustered.
# The mask is computed once and reused so that the rows we write back to are
# exactly the rows we predicted on.
cluster_row_mask = comments_data["sentiment_category"].isin([1, 3])
comments_data_clean = comments_data[cluster_row_mask]

# Fit the pipeline on the selected comments and assign each one to a cluster.
kmeans_predictor.fit(comments_data_clean["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data_clean["comment_text"])

# Pull the fitted steps out of the pipeline to describe each cluster by its
# highest-weighted vocabulary terms.
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_

# NOTE(review): `top_n_words` is not defined in this section -- it is assumed
# to be set earlier in the file.
for i in range(kmeans_model.n_clusters):
    # Feature indices sorted by descending weight in this cluster's centroid.
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join(
        [feature_names[idx] for idx in top_feature_indices[:top_n_words]]
    )
    kmeans_top_word.append(top_word)

# Write each row's cluster theme back only after all themes are built, since
# every label is used as an index into kmeans_top_word.
comments_data.loc[cluster_row_mask, "positive_cluster_theme"] = [
    kmeans_top_word[x] for x in kmeans_cluster_label
]
提交得分
