《少年的你》短评情感分析——机器学习之逻辑回归

原文网址:
https://segmentfault.com/a/1190000021947908


import pandas as pd
import jieba
import re
#逻辑回归建模需要的库
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from pandas import DataFrame

# Sample review records: df1 holds reviewer name and short review text,
# df2 holds the raw rating value.
# NOTE(review): assumes row i of both tables describes the same review;
# confirm against the real data source.
df1 = pd.DataFrame([{"name":"整儿钱小姐","short":"少年的你值得一看"}])
df2 = pd.DataFrame([{"rating":[('50','力荐')]}])
# pd.merge only accepts DataFrame/Series operands, and because the two tables
# share no column the join keys must be given explicitly: join on the row index.
data = pd.merge(df1, df2, how = 'outer', left_index = True, right_index = True)
print(data.shape)

def rating(e):
    """Map a raw rating value to an integer star level.

    The value is searched for the codes '50', '40', '30', '20' and '10',
    checked from highest to lowest; the first code found decides the result.

    Args:
        e: The raw rating. It is converted to text with ``str`` before the
            search, so both a plain string and a container such as
            ``[('50', '力荐')]`` are handled.

    Returns:
        An int from 5 down to 1, or ``None`` when no rating code is present.
    """
    # A substring test on a list of tuples never matches the bare code, so
    # search the textual form of the value instead.
    text = str(e)
    for code, stars in (('50', 5), ('40', 4), ('30', 3), ('20', 2), ('10', 1)):
        if code in text:
            return stars
    return None

# Convert every raw rating into its integer star level.
data['new_rating'] = data['rating'].map(rating)
print(data.head())

# Keep only reviews with a clear polarity: drop neutral (3-star) reviews and
# reviews whose rating could not be recognised. An unrated review would
# otherwise fall through to the "else" branch below and be labelled negative.
has_polarity = data['new_rating'].notna() & (data['new_rating'] != 3)
# .copy() gives new_data its own storage, so adding columns to it is not a
# chained assignment on a slice of `data`.
new_data = data[has_polarity].copy()
# Ratings above 3 are labelled +1, the remaining ones -1.
new_data['sentiment'] = new_data['new_rating'].apply(lambda x : +1 if x>3 else -1)

print(new_data['sentiment'].value_counts())

def cut_word(text):
    """Segment *text* into words and return them joined by single spaces.

    The input is converted with ``str`` first; segmentation is done by
    ``jieba.cut`` with ``cut_all=False``.
    """
    return " ".join(jieba.cut(str(text), cut_all = False))


# Store the space-separated segmentation of every review in 'new_short'.
new_data['new_short'] = new_data['short'].apply(cut_word)

def remove_num(new_short):
    """Return *new_short* with every run of digits removed."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', new_short)

def remove_word(new_short):
    """Return *new_short* with every run of ASCII letters removed.

    Both lowercase and uppercase letters are stripped, so tokens such as
    "IMAX" do not survive into the text passed on for feature extraction.
    """
    return re.sub(r'[A-Za-z]+','',new_short)

# Clean the segmented text: remove_num first, then remove_word.
new_data['new_short'] = new_data['new_short'].apply(remove_num).apply(remove_word)


# Logistic-regression analysis and modelling.
# Step 1: split the prepared reviews into a training set (80%) and a test set.
# `random_state` is the keyword train_test_split accepts; a fixed seed makes
# the split reproducible.
train_data, test_data = train_test_split(new_data, train_size = 0.8, random_state = 0)

# Step 2: text feature extraction. The vocabulary is fitted on the training
# text only and then reused to transform the test text.
transfer = CountVectorizer()
train_word = transfer.fit_transform(train_data['new_short'])
test_word = transfer.transform(test_data['new_short'])

# Dense view of the sparse training matrix (for inspection only).
print('new_data:\n', train_word.toarray())

# Names of the extracted features. Recent scikit-learn releases provide
# get_feature_names_out(); fall back to get_feature_names() on old releases
# that do not have it yet.
feature_names = (
    transfer.get_feature_names_out()
    if hasattr(transfer, 'get_feature_names_out')
    else transfer.get_feature_names()
)
print('feature_name:\n', feature_names)

# Step 3: fit a logistic-regression model on the training features and labels,
# then evaluate it on the test set.
# The labels are taken from the same frames the feature matrices were built
# from, so every feature row is guaranteed to be paired with its own label
# (a second, independent split would only line up if it happened to
# reproduce the first one exactly).
x_train = train_word
x_test = test_word
y_train = train_data['sentiment']
y_test = test_data['sentiment']
model = LogisticRegression()
model.fit(x_train,y_train)
y_predict = model.predict(x_test)
# Element-wise comparison of predicted and true labels.
print('布尔比对:\n',y_predict==y_test)
# Mean accuracy on the test set.
score = model.score(x_test,y_test)
print('模型准确率:\n',score)


# Show a few test reviews with their star level and sentiment label.
# A bare expression displays nothing when run as a script, so print it.
example = test_data[50:55]
print(example[['short','new_rating','sentiment']])

# Column 1 of predict_proba belongs to model.classes_[1]
# (presumably the +1 label when both labels occur in training; verify).
possibility = model.predict_proba(test_word)[:,1]
# assign() returns a new frame, so no column is written into a frame that
# may be a view of new_data.
test_data = test_data.assign(possibility=possibility)
print(test_data.head())
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值