1.前言
前面的文章已经通过代码实战讲解了Sklearn的各个主要模块,本文将从整体的角度深入理解Sklearn。全文主要以代码形式讲解,代码中附有注释。话不多说,开车!!!(请坐稳)
数据链接
密码:a6vy
2.数据处理
class Sentiment:
    """String labels for the three sentiment classes a review can receive."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"
class Review:
    """One review: its raw text, its numeric score and the derived sentiment label."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is derived once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or lower is negative, a score of exactly 3 is neutral,
        and every other score is positive.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
class ReviewContainer:
    """Wrap a list of reviews (used for both the training and the test split)."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Balance the container in place, then shuffle it.

        Only NEGATIVE and POSITIVE reviews are kept (any other label is
        dropped), and both classes are trimmed to the size of the smaller
        one so the result always holds equally many of each.
        """
        # Imported here because the file has no module-level ``import random``.
        import random

        # filter(function, iterable) calls the function on each element and
        # lazily yields the elements for which it returns a truthy value;
        # list() materializes that iterator.
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))

        # Trim BOTH classes: slicing only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
接下来就是读取数据并利用上面的类处理数据:
import json
reviews = []
# Decode explicitly as UTF-8: without ``encoding`` open() uses the platform
# default, which can fail on non-ASCII review text.
with open("books_small_10000.json", encoding="utf-8") as f:
    # Each line is parsed on its own as a separate JSON document.
    for line in f:
        if not line.strip():
            continue  # json.loads would raise on a blank line
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

# Inspect one parsed review through its attributes.
print(reviews[5].text)
print(reviews[5].score)
print(reviews[5].sentiment)
再进行训练集测试集拆分,并分别拿到对应的特征和标签:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for testing; random_state pins the split.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Training split: balance the classes, then pull out texts and labels.
train_container = ReviewContainer(training)
train_container.evenly_distribute()
train_x, train_y = train_container.get_text(), train_container.get_sentiment()

# Test split: same treatment.
test_container = ReviewContainer(test)
test_container.evenly_distribute()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Show how many test labels fall in each of the two kept classes.
for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE):
    print(test_y.count(label))
# print(train_x_vectors[0])
# print(train_x_vectors[0].toarray())
最后用TfidfVectorizer把原始文本转化为tf-idf的特征矩阵:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit the vocabulary and IDF weights on the training texts only ...
train_x_vectors = vectorizer.fit_transform(train_x)
# ... and reuse that fitted state to transform the test texts.
test_x_vectors = vectorizer.transform(test_x)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. Fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out())
else:
    print(vectorizer.get_feature_names())
3.模型构建
3.1.支持向量机
from sklearn.svm import SVC
from sklearn.metrics import f1_score
# Linear-kernel support vector classifier trained on the TF-IDF features.
clf_svm = SVC(kernel="linear")
clf_svm.fit(train_x_vectors, train_y)

# Score on the held-out test vectors.
svm_score = clf_svm.score(test_x_vectors, test_y)
print(svm_score)

# Prediction for the first test sample only.
print(clf_svm.predict(test_x_vectors[0]))

# Per-class F1 (average=None), reported in the order POSITIVE, NEGATIVE.
svm_predictions = clf_svm.predict(test_x_vectors)
print(f1_score(test_y, svm_predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))
3.2.决策树
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings, trained on the same TF-IDF features.
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

# Score on the held-out test vectors.
dec_score = clf_dec.score(test_x_vectors, test_y)
print(dec_score)

# Prediction for the first test sample only.
print(clf_dec.predict(test_x_vectors[0]))

# Per-class F1 (average=None), reported in the order POSITIVE, NEGATIVE.
dec_predictions = clf_dec.predict(test_x_vectors)
print(f1_score(test_y, dec_predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))
3.3.逻辑回归
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, trained on the same TF-IDF features.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

# Score on the held-out test vectors.
log_score = clf_log.score(test_x_vectors, test_y)
print(log_score)

# Prediction for the first test sample only.
print(clf_log.predict(test_x_vectors[0]))

# Per-class F1 (average=None), reported in the order POSITIVE, NEGATIVE.
log_predictions = clf_log.predict(test_x_vectors)
print(f1_score(test_y, log_predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))
4.网格搜索寻找最优结果
from sklearn.model_selection import GridSearchCV
# Search grid: two kernels crossed with five values of C.
parameters = dict(kernel=("linear", "rbf"), C=(1, 4, 8, 16, 32))
svc = SVC()

# Grid search over the SVC parameters using 5-fold cross-validation.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

# Score the fitted search object on the held-out test vectors.
grid_score = clf.score(test_x_vectors, test_y)
print(grid_score)

# Per-class F1 (average=None), reported in the order POSITIVE, NEGATIVE.
grid_predictions = clf.predict(test_x_vectors)
print(f1_score(test_y, grid_predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))
5.保存模型+提取模型
保存模型:
import pickle
# Serialize the fitted grid-search object to "sklearn.pkl" (binary mode).
with open("sklearn.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)
提取模型:
# Read the pickled model back from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("sklearn.pkl", "rb") as model_file:
    loaded = pickle.load(model_file)
用提取出的模型预测:
# Show the raw text of the first test review ...
print(test_x[0])
# ... and the label the reloaded model assigns to its TF-IDF vector.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(loaded.predict(test_x_vectors[0]))