# Second-level heading
# Input data: only two columns, the review text ("评论") and its label, i.e. the score ("评分").
import jieba
import pandas as pd
# Load the review dataset. The path is a raw string so the backslashes are
# kept literally: in a normal string "\3", "\1" and "\202" would be parsed as
# octal escapes and silently corrupt the Windows path.
df = pd.read_csv(r"D:\3\1\2022020687.csv", encoding="gbk")
df.head()  # notebook-style preview; shows nothing when run as a plain script

# Tokenize every review with jieba and keep the result in a new "cut" column.
df["cut"] = df["评论"].apply(jieba.lcut)
df["cut"].head()
# Split into training and test sets (30% of the rows are held out for testing).
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split is reproducible across runs.
# Columns are selected with df[...] rather than attribute access so the lookup
# can never be shadowed by a DataFrame attribute of the same name.
x_train, x_test, y_train, y_test = train_test_split(
    df["cut"], df["评分"], test_size=0.3, random_state=37
)
x_train.head(2)  # notebook-style preview of the first two training samples
# Initialize the word2vec model and its vocabulary
from gensim.models.word2vec import Word2Vec

# Dimensionality of the word vectors; 300-500 works well for large samples.
n_dim = 300

# Create an untrained model first; words seen fewer than 10 times are dropped.
w2vmodel = Word2Vec(min_count=10, vector_size=n_dim)

# Build the vocabulary from the training sentences only.
w2vmodel.build_vocab(x_train)

# Train for 10 passes over the same training sentences.
w2vmodel.train(
    x_train,
    epochs=10,
    total_examples=w2vmodel.corpus_count,
)
# Similarity lookup: words whose vectors are closest to "不错".
w2vmodel.wv.most_similar("不错")

# Build the matrix of word vectors for every token of one whole sentence
# (the first review); tokens missing from the model vocabulary are skipped.
pd.DataFrame(
    [w2vmodel.wv[w] for w in df["cut"][0] if w in w2vmodel.wv]
).head()
# Build the vector for a whole sentence by directly averaging its word vectors
def m_avgvec(words, w2vmodel):
    """Return the mean word vector of one tokenized sentence.

    Args:
        words: Iterable of tokens making up the sentence.
        w2vmodel: Object exposing a ``wv`` lookup that supports
            ``token in wv`` and ``wv[token]`` (e.g. a trained gensim
            ``Word2Vec`` model).

    Returns:
        A ``pandas.Series`` with the element-wise mean of the vectors of all
        tokens found in ``w2vmodel.wv``. Tokens that are not in the
        vocabulary are skipped; if no token is known the Series is empty.
    """
    known_vectors = [w2vmodel.wv[w] for w in words if w in w2vmodel.wv]
    return pd.DataFrame(known_vectors).agg("mean")
# Build the feature matrix used for modelling (this takes a while)
# One row per training sentence: the averaged word vector of that sentence.
train_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_train])
train_vecs.head()

# If the data contains missing values, fill them with the column mean.
# Print the TOTAL number of missing cells first: `.isnull().any().sum()` would
# only count the affected columns of the matrix and collapses to 0/1 for the
# label Series, which does not match the intent of "how many nulls are there".
print(train_vecs.isnull().sum().sum())
print(y_train.isnull().sum())  # total number of missing labels
train_vecs = train_vecs.fillna(train_vecs.mean())
print(train_vecs.isnull().sum().sum())  # missing cells left after filling
# Fit an SVM (other models can be fitted the same way) on the converted matrix.
from sklearn.svm import SVC

clf2 = SVC(kernel="rbf", verbose=True)
clf2.fit(train_vecs, y_train)
# NOTE: scored on the same data the model was fitted on, i.e. training accuracy.
clf2.score(train_vecs, y_train)
# Author's note: lightgbm reached 0.917 in their own test.
# NOTE(review): the code below fits a RandomForestClassifier, not lightgbm.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

clf = RandomForestClassifier(class_weight="balanced", random_state=37)
clf.fit(train_vecs, y_train)
print(clf.score(train_vecs, y_train))  # accuracy on the training data
# Author's note: random forest reached 0.90.
# NOTE(review): this repeats the RandomForestClassifier fit that already
# appears earlier in this file, with identical settings.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

clf = RandomForestClassifier(class_weight="balanced", random_state=37)
clf.fit(train_vecs, y_train)
print(clf.score(train_vecs, y_train))  # accuracy on the training data