Datawhale AI夏令营学习笔记 2.1Baseline理解

最新推荐文章于 2025-10-25 20:25:37 发布

原创最新推荐文章于 2025-10-25 20:25:37 发布 · 119 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#人工智能 #学习 #笔记

1、数据加载

import pandas as pd
# Read the raw video table and the raw comment table from the CSV files
# in the current working directory.
video_data, comments_data = (
    pd.read_csv(csv_name)
    for csv_name in ("origin_videos_data.csv", "origin_comments_data.csv")
)

2、将文本内容进行合并

# Join the video description and the video tags into a single "text" column,
# separated by one space. Missing values are replaced with empty strings
# first so the concatenation never produces NaN.
video_data["text"] = (
    video_data["video_desc"]
    .fillna("")
    .add(" ")
    .add(video_data["video_tags"].fillna(""))
)

3、预测产品名称

# Build the prediction pipeline: TF-IDF vectorisation followed by a linear
# classifier. jieba.lcut is passed as the tokenizer so Chinese text is split
# into words; max_features=50 caps the TF-IDF vocabulary at 50 terms;
# SGDClassifier is a linear classifier trained with stochastic gradient descent.
product_name_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut, max_features=50), SGDClassifier()
)

# Compute the "label is missing" mask once and reuse it for both the
# training split and the rows that need a prediction.
missing_product_name = video_data["product_name"].isnull()

# Train only on the rows whose product name is known.
product_name_predictor.fit(
    video_data.loc[~missing_product_name, "text"],
    video_data.loc[~missing_product_name, "product_name"],
)

# Fill in only the missing product names so the known labels are kept
# instead of being overwritten by model output. The guard skips prediction
# when nothing is missing, because scikit-learn rejects empty input.
if missing_product_name.any():
    video_data.loc[missing_product_name, "product_name"] = (
        product_name_predictor.predict(video_data.loc[missing_product_name, "text"])
    )

4、情感分析

通过一段评论文本 comment_text 来预测多个标签字段（如情感、场景、提问等），分别训练四个独立的模型。

# Handle the four label columns one after another; each gets its own model.
for col in ['sentiment_category', 'user_scenario', 'user_question', 'user_suggestion']:
    # Create a fresh pipeline on every iteration so that no fitted state is
    # shared between the label columns.
    predictor = make_pipeline(
        TfidfVectorizer(tokenizer=jieba.lcut), SGDClassifier()
    )

    # Train on the comments whose value for this column is present.
    labeled_rows = ~comments_data[col].isnull()
    predictor.fit(
        comments_data.loc[labeled_rows, "comment_text"],
        comments_data.loc[labeled_rows, col],
    )

    # Predict for every comment. NOTE: this replaces the whole column,
    # including the values that were used for training.
    comments_data[col] = predictor.predict(comments_data["comment_text"])

5、评论聚类

# Number of highest-weighted terms used to describe each cluster. The keyword
# extraction below reads this name, but the snippet never defined it, which
# raises NameError. NOTE(review): 10 is a starting value -- tune as needed.
top_n_words = 10

# Build the clustering pipeline: TF-IDF features fed into KMeans with two clusters.
kmeans_predictor = make_pipeline(
    TfidfVectorizer(tokenizer=jieba.lcut),
    KMeans(n_clusters=2)
)

# Select the comments whose sentiment_category is 1 or 3. The mask and the
# text subset are computed once and reused for fitting, predicting and
# writing the result back.
positive_rows = comments_data["sentiment_category"].isin([1, 3])
positive_text = comments_data.loc[positive_rows, "comment_text"]

kmeans_predictor.fit(positive_text)
kmeans_cluster_label = kmeans_predictor.predict(positive_text)

# Extract the theme keywords of every cluster.
kmeans_top_word = []
tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
kmeans_model = kmeans_predictor.named_steps['kmeans']
feature_names = tfidf_vectorizer.get_feature_names_out()
cluster_centers = kmeans_model.cluster_centers_
# For each cluster centre, sort the TF-IDF features by weight (descending),
# keep the first top_n_words terms and join them with spaces; that string
# is used as the cluster's theme.
for i in range(kmeans_model.n_clusters):
    top_feature_indices = cluster_centers[i].argsort()[::-1]
    top_word = ' '.join([feature_names[idx] for idx in top_feature_indices[:top_n_words]])
    kmeans_top_word.append(top_word)

# Tag every selected comment with the theme string of its cluster.
comments_data.loc[positive_rows, "positive_cluster_theme"] = [
    kmeans_top_word[x] for x in kmeans_cluster_label
]

重复上述操作，只需更换筛选条件和写入的列名，依次对用户场景、负面评论、用户疑问、用户建议这四类评论进行聚类

# Number of highest-weighted terms used to describe each cluster. The keyword
# extraction below reads this name, but the snippet never defined it, which
# raises NameError. NOTE(review): 10 is a starting value -- tune as needed.
top_n_words = 10

# One entry per clustering run: (column to filter on, values to keep,
# column that receives the cluster theme). The order matches the original
# sequence: scenario, negative sentiment, question, suggestion.
cluster_jobs = [
    ("user_scenario", [1], "scenario_cluster_theme"),
    ("sentiment_category", [2, 3], "negative_cluster_theme"),
    ("user_question", [1], "question_cluster_theme"),
    ("user_suggestion", [1], "suggestion_cluster_theme"),
]

for filter_col, keep_values, theme_col in cluster_jobs:
    # A fresh TF-IDF + KMeans pipeline for every run.
    kmeans_predictor = make_pipeline(
        TfidfVectorizer(tokenizer=jieba.lcut), KMeans(n_clusters=2)
    )

    # Fit and predict on the same subset of comments; the mask is reused
    # below to write the themes back to exactly those rows.
    selected_rows = comments_data[filter_col].isin(keep_values)
    selected_text = comments_data.loc[selected_rows, "comment_text"]
    kmeans_predictor.fit(selected_text)
    kmeans_cluster_label = kmeans_predictor.predict(selected_text)

    tfidf_vectorizer = kmeans_predictor.named_steps['tfidfvectorizer']
    kmeans_model = kmeans_predictor.named_steps['kmeans']
    feature_names = tfidf_vectorizer.get_feature_names_out()
    cluster_centers = kmeans_model.cluster_centers_

    # For each cluster centre, take the top_n_words features with the
    # highest weight and join them with spaces to form the theme string.
    kmeans_top_word = [
        ' '.join(feature_names[idx] for idx in center.argsort()[::-1][:top_n_words])
        for center in cluster_centers
    ]

    # Tag every selected comment with the theme string of its cluster.
    comments_data.loc[selected_rows, theme_col] = [
        kmeans_top_word[x] for x in kmeans_cluster_label
    ]

聚类的流程整体如下

评论文本 → TF-IDF → KMeans 聚类 → 聚类标签
↓
聚类中心 → 每类的代表性关键词
↓
每条评论 → 所属类的主题词 → 写入对应的 *_cluster_theme 列（如 positive_cluster_theme）