Datawhale AI夏令营 学习笔记 2.1Baseline理解

1、数据加载

import pandas as pd
# Load the raw video table and the raw comment table. Both paths are relative,
# so the CSV files are looked up in the current working directory.
videos_csv = "origin_videos_data.csv"
comments_csv = "origin_comments_data.csv"
video_data = pd.read_csv(videos_csv)
comments_data = pd.read_csv(comments_csv)

2、将文本内容进行合并

# Build a single free-text field per video from its description and its tags.
# Missing values are replaced by empty strings first so the concatenation
# never produces NaN.
description = video_data["video_desc"].fillna("")
tags = video_data["video_tags"].fillna("")
video_data["text"] = description + " " + tags

3、预测产品名称

# Prediction pipeline: TF-IDF vectorization followed by a linear classifier.
# jieba.lcut is passed as the tokenizer so Chinese text is segmented into
# words; SGDClassifier() is a linear model trained with stochastic gradient
# descent, which suits large sparse feature matrices.
# NOTE: make_pipeline, TfidfVectorizer, SGDClassifier and jieba must already be
# imported; these notes do not show those import lines.
product_name_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut, max_features=50), SGDClassifier()
)

# Compute the labelled / unlabelled split once and reuse it below.
product_name_missing = video_data["product_name"].isnull()

# Train only on the rows that already carry a product name.
product_name_predictor.fit(
    video_data.loc[~product_name_missing, "text"],
    video_data.loc[~product_name_missing, "product_name"],
)

# Fill in only the missing product names. Predicting for every row would
# replace the known labels with model guesses. The guard skips prediction when
# nothing is missing, because the classifier rejects an empty input.
if product_name_missing.any():
    video_data.loc[product_name_missing, "product_name"] = (
        product_name_predictor.predict(
            video_data.loc[product_name_missing, "text"]
        )
    )

4、情感分析

通过一段评论文本 comment_text 来预测多个标签字段(如情感、场景、提问等),分别训练四个独立的模型。

# Handle each label column in turn, training one independent model per column.
for col in ['sentiment_category', 'user_scenario', 'user_question', 'user_suggestion']:
    # A fresh pipeline on every iteration so no state is shared between columns.
    predictor = make_pipeline(
        TfidfVectorizer(tokenizer=jieba.lcut), SGDClassifier()
    )

    # Rows whose label for this column is missing; computed once and reused.
    unlabeled = comments_data[col].isnull()

    # Train on the comments that already have a label for this column.
    predictor.fit(
        comments_data.loc[~unlabeled, "comment_text"],
        comments_data.loc[~unlabeled, col],
    )

    # Predict only where the label is missing so known labels are preserved.
    # The guard skips prediction when every row is already labelled, because
    # the classifier rejects an empty input.
    if unlabeled.any():
        comments_data.loc[unlabeled, col] = predictor.predict(
            comments_data.loc[unlabeled, "comment_text"]
        )

5、评论聚类

# Number of highest-weighted terms used to describe each cluster. The code
# below reads this name but the notes never assign it; 10 is an assumed
# value, adjust it to match the baseline you are following.
top_n_words = 10

# Clustering pipeline: TF-IDF features fed into KMeans with two clusters.
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut),
    KMeans(n_clusters=2)
)

# Select the comments whose sentiment_category is 1 or 3. The same subset is
# used for fitting, for prediction and for writing the result back, so the
# mask is computed once.
positive_mask = comments_data["sentiment_category"].isin([1, 3])
positive_comments = comments_data.loc[positive_mask, "comment_text"]

kmeans_predictor.fit(positive_comments)
kmeans_cluster_label = kmeans_predictor.predict(positive_comments)

# Extract the theme keywords of every cluster.
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_

# For each cluster centre, sort the TF-IDF features by weight (descending),
# keep the first top_n_words terms and join them into one string that serves
# as that cluster's theme.
kmeans_top_word = []
for center in cluster_centers:
    top_feature_indices = center.argsort()[::-1]
    top_word = ' '.join(feature_names[idx] for idx in top_feature_indices[:top_n_words])
    kmeans_top_word.append(top_word)

# Tag every selected comment with the theme string of its cluster.
comments_data.loc[positive_mask, "positive_cluster_theme"] = [
    kmeans_top_word[x] for x in kmeans_cluster_label
]

重复上述操作,只更换筛选条件和输出列:依次对用户场景、负面情感(sentiment_category 为 2 或 3)、用户疑问、用户建议的评论进行聚类,并写入各自的主题列。

# Number of highest-weighted terms used to describe each cluster. The code
# below reads this name but the notes never assign it; 10 is an assumed
# value, adjust it to match the baseline you are following.
top_n_words = 10

# One entry per clustering pass: (column to filter on, accepted values,
# column that receives the cluster theme). The order matches the original
# sequence of copy-pasted passes.
cluster_jobs = [
    ("user_scenario", [1], "scenario_cluster_theme"),
    ("sentiment_category", [2, 3], "negative_cluster_theme"),
    ("user_question", [1], "question_cluster_theme"),
    ("user_suggestion", [1], "suggestion_cluster_theme"),
]

for filter_col, accepted_values, theme_col in cluster_jobs:
    # A fresh TF-IDF + KMeans (two clusters) pipeline for every pass.
    kmeans_predictor = make_pipeline(
        TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
    )

    # The same subset is used for fitting, prediction and write-back.
    selected = comments_data[filter_col].isin(accepted_values)
    selected_comments = comments_data.loc[selected, "comment_text"]

    kmeans_predictor.fit(selected_comments)
    kmeans_cluster_label = kmeans_predictor.predict(selected_comments)

    tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
    kmeans_model = kmeans_predictor.named_steps['kmeans']
    feature_names = tfidf_vectorizer.get_feature_names_out()
    cluster_centers = kmeans_model.cluster_centers_

    # For each cluster centre, take the top_n_words terms with the largest
    # weights and join them into a single theme string.
    kmeans_top_word = [
        ' '.join(
            feature_names[idx]
            for idx in center.argsort()[::-1][:top_n_words]
        )
        for center in cluster_centers
    ]

    # Tag every selected comment with the theme string of its cluster.
    comments_data.loc[selected, theme_col] = [
        kmeans_top_word[x] for x in kmeans_cluster_label
    ]

聚类的流程整体如下

评论文本 → TF-IDF → KMeans 聚类 → 聚类标签
                                ↓
               聚类中心 → 每类的代表性关键词
                                ↓
       每条评论 → 所属类的主题词 → 写入 positive_cluster_theme 列
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值