1. Data loading
import pandas as pd
import jieba
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans

# Read the video metadata and the raw comments
video_data = pd.read_csv("origin_videos_data.csv")
comments_data = pd.read_csv("origin_comments_data.csv")
2. Merging the text fields
# Concatenate the video description and tags, treating missing values as empty strings
video_data["text"] = video_data["video_desc"].fillna("") + " " + video_data["video_tags"].fillna("")
3. Predicting the product name
# Build a prediction pipeline that vectorizes and then classifies. jieba.lcut tokenizes the Chinese text;
# SGDClassifier is a linear classifier trained with stochastic gradient descent, well suited to large sparse data.
product_name_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut, max_features=50), SGDClassifier()
)
# Train on the rows whose product_name is not null
product_name_predictor.fit(
    video_data[~video_data["product_name"].isnull()]["text"],
    video_data[~video_data["product_name"].isnull()]["product_name"],
)
# Predict a product name for every row; note this fills the missing values but also overwrites the existing ones
video_data["product_name"] = product_name_predictor.predict(video_data["text"])
4. Sentiment analysis
Each comment text (comment_text) is used to predict several label fields (sentiment category, user scenario, user question, user suggestion), training four independent models, one per label.
# Process each label column in turn
for col in ['sentiment_category', 'user_scenario', 'user_question', 'user_suggestion']:
    # Build a fresh predictor on every iteration
    predictor = make_pipeline(
        TfidfVectorizer(tokenizer=jieba.lcut), SGDClassifier()
    )
    # Train on the rows where this label is not null
    predictor.fit(
        comments_data[~comments_data[col].isnull()]["comment_text"],
        comments_data[~comments_data[col].isnull()][col],
    )
    # Predict the label for every comment (again, this overwrites existing values)
    comments_data[col] = predictor.predict(comments_data["comment_text"])
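
Before trusting these predicted labels, it is worth holding out part of the annotated data and checking accuracy. A minimal sketch using scikit-learn's standard utilities, shown for one label column (my addition, not part of the original):

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

labeled = comments_data[~comments_data["sentiment_category"].isnull()]
X_train, X_test, y_train, y_test = train_test_split(
    labeled["comment_text"], labeled["sentiment_category"],
    test_size=0.2, random_state=42
)
eval_predictor = make_pipeline(TfidfVectorizer(tokenizer=jieba.lcut), SGDClassifier())
eval_predictor.fit(X_train, y_train)
print(classification_report(y_test, eval_predictor.predict(X_test)))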
5. Comment clustering
# Build a clustering pipeline that splits the comments into two clusters
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut),
    KMeans(n_clusters=2)
)
# Fit and predict on the comments whose sentiment_category is 1 or 3 (used below for the positive themes)
kmeans_predictor.fit(
    comments_data[comments_data["sentiment_category"].isin([1, 3])]["comment_text"]
)
kmeans_cluster_label = kmeans_predictor.predict(
    comments_data[comments_data["sentiment_category"].isin([1, 3])]["comment_text"]
)
# Extract the topic keywords for each cluster
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
top_n_words = 10  # keywords kept per cluster; the original uses this variable without defining it
# For each centroid, sort the TF-IDF features by weight (highest first) and join the top ones into a string that serves as that cluster's topic keywords
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)
# Tag each of these comments with its cluster's topic keywords
comments_data.loc[
    comments_data["sentiment_category"].isin([1, 3]),
    "positive_cluster_theme"
] = [kmeans_top_word[x] for x in kmeans_cluster_label]
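
n_clusters=2 is fixed by hand here. If you want to sanity-check that choice, silhouette scores over a small range of k are a common heuristic; a sketch of that check (my addition, not part of the original pipeline):

from sklearn.metrics import silhouette_score

texts = comments_data[comments_data["sentiment_category"].isin([1, 3])]["comment_text"]
X = TfidfVectorizer(tokenizer=jieba.lcut).fit_transform(texts)
for k in range(2, 6):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
    print(k, silhouette_score(X, labels))  # higher is better-separated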
Repeat the same procedure with a different row filter for each remaining label: user scenario, negative sentiment, user question, and user suggestion each get their own clustering run and theme column (a refactored version that removes the repetition is sketched after these four blocks).
# Scenario themes: comments flagged with user_scenario == 1
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
)
kmeans_predictor.fit(comments_data[comments_data["user_scenario"].isin([1])]["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data[comments_data["user_scenario"].isin([1])]["comment_text"])
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)
comments_data.loc[comments_data["user_scenario"].isin([1]), "scenario_cluster_theme"] = [kmeans_top_word[x] for x in kmeans_cluster_label]
# Negative themes: comments whose sentiment_category is 2 or 3
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
)
kmeans_predictor.fit(comments_data[comments_data["sentiment_category"].isin([2, 3])]["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data[comments_data["sentiment_category"].isin([2, 3])]["comment_text"])
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)
comments_data.loc[comments_data["sentiment_category"].isin([2, 3]), "negative_cluster_theme"] = [kmeans_top_word[x] for x in kmeans_cluster_label]
# Question themes: comments flagged with user_question == 1
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
)
kmeans_predictor.fit(comments_data[comments_data["user_question"].isin([1])]["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data[comments_data["user_question"].isin([1])]["comment_text"])
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)
comments_data.loc[comments_data["user_question"].isin([1]), "question_cluster_theme"] = [kmeans_top_word[x] for x in kmeans_cluster_label]
# Suggestion themes: comments flagged with user_suggestion == 1
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
)
kmeans_predictor.fit(comments_data[comments_data["user_suggestion"].isin([1])]["comment_text"])
kmeans_cluster_label = kmeans_predictor.predict(comments_data[comments_data["user_suggestion"].isin([1])]["comment_text"])
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)
comments_data.loc[comments_data["user_suggestion"].isin([1]), "suggestion_cluster_theme"] = [kmeans_top_word[x] for x in kmeans_cluster_label]
The overall clustering flow:
comment text → TF-IDF → KMeans clustering → cluster label per comment
cluster centroids → representative keywords for each cluster
each comment → the topic keywords of its cluster → written to positive_cluster_theme (and likewise the other theme columns)
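
Finally, the enriched tables can be written back to disk; a minimal sketch (the output filenames are my assumption):

video_data.to_csv("submit_videos.csv", index=False)
comments_data.to_csv("submit_comments.csv", index=False)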