Python 逻辑回归(Logistic Regression)算法实现文本情感分类

一、算法介绍
逻辑回归模型是一种判别概率模型,直接学习条件概率分布 P(Y | X)作为预测模型。

· 二元逻辑回归模型:
设 x = {x1, x2, ……, xn} 为输入, Y = {0, 1}为输出,w = {w1, w2, ……, wn}和b是参数。
对于给定的输入x,可以求得P(Y = 1 | X = x)和P(Y = 0 | X = x),比较这两个条件概率的大小,将实例x分到概率较大的类。
二元逻辑回归模型的条件概率分布为:
P(Y = 1 | X = x) = exp(w·x + b) / (1 + exp(w·x + b)) = σ(w·x + b)
P(Y = 0 | X = x) = 1 / (1 + exp(w·x + b))
其中 σ(z) = 1 / (1 + e^{-z}) 为 sigmoid 函数。
· 参数估计:
此部分参考博客https://blog.csdn.net/zouxy09/article/details/20319673的推导
假设我们有n个独立的训练样本{(x1, y1) ,(x2, y2),…, (xn, yn)},y={0, 1},则似然函数如下所示。
显然,我们要让模型最大化地满足训练集,即最大似然估计,因此要求使得L(θ)最大的θ。
记 π(x_i) = P(Y = 1 | X = x_i),则似然函数为
L(w, b) = ∏_{i=1}^{n} [π(x_i)]^{y_i} [1 − π(x_i)]^{1 − y_i}
取对数得对数似然
l(w, b) = Σ_{i=1}^{n} [ y_i · log π(x_i) + (1 − y_i) · log(1 − π(x_i)) ]
最大似然估计即求使 l(w, b) 最大的参数 (w, b)。
通过上述推导可知,对应的梯度上升算法如下:
w ← w + α (y − ŷ) x,  b ← b + α (y − ŷ)
其中 α 为学习率,ŷ = σ(w·x + b)。
这里我们使用随机梯度上升,即一次仅用一个样本点的回归误差来更新回归系数。
这样,每次梯度上升的值为 α ×(y - y.hat)× xi,其中α为更新速率,y是标签(0或1),y.hat是sigmoid计算出来的概率 P(Y = 1 | X = xi),y - y.hat即为损失,xi为该文本的特征向量。

二、代码实现
这里使用酒店评价中文数据集,分为积极、消极两类。训练集共有评价6000条左右,测试集共有评价202条。积极、消极评论各占一半。
· 数据预处理函数textprocess.py:

# coding=utf-8
import os
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

class Textprocess:
    """Corpus-preparation helper for the hotel-review sentiment task.

    Pipeline: merge raw per-review files into one-review-per-line corpora
    (``preprocess``), word-segment them with jieba (``segment``), load the
    segmented files with '0'/'1' labels (``data_set``) and build a TF-IDF
    matrix (``tfidf_set``).
    """

    def __init__(self):
        # Root of the raw corpus; expected to contain 'neg' and 'pos' subdirs.
        self.corpus_path = ''
        # Output files that accumulate one cleaned review per line.
        self.neg_path = ''
        self.pos_path = ''
        # Parallel lists: review text and its sentiment label ('0' / '1').
        self.data_list = []
        self.label_list = []

    def preprocess(self):
        """Flatten each raw review file onto a single line and append it to
        neg_path or pos_path according to its class directory."""
        for mydir in os.listdir(self.corpus_path):
            if mydir == 'neg':
                save_path = self.neg_path
            elif mydir == 'pos':
                save_path = self.pos_path
            else:
                # Fix: an unexpected directory previously reused a stale
                # save_path, or raised UnboundLocalError if it came first.
                continue
            class_path = self.corpus_path + '/' + mydir + '/'
            for file in os.listdir(class_path):
                file_path = class_path + file
                with open(file_path, 'r', encoding='gb18030', errors='ignore') as f:
                    file_content = f.read()
                # Collapse the multi-line review into one line (join is O(n),
                # unlike the original quadratic `+=` loop).
                corpus = ''.join(
                    line.strip() for line in file_content.splitlines()) + '\n'
                with open(save_path, 'a+', encoding='gb18030', errors='ignore') as f:
                    f.write(corpus)

    def segment(self, pospath, segpath):
        """Word-segment each line of *pospath* with jieba and append the
        space-joined tokens to *segpath*."""
        # Fix: use context managers (the source file was closed manually and
        # the output file was reopened once per line).
        with open(pospath, 'r', encoding='gb18030', errors='ignore') as src, \
             open(segpath, 'a+', encoding='gb18030', errors='ignore') as dst:
            for line in src:
                dst.write(' '.join(jieba.cut(line, cut_all=False)))

    # path1: segmented negative reviews, path2: segmented positive reviews
    def data_set(self, path1, path2):
        """Load segmented reviews into data_list/label_list.

        Negatives (path1) are labelled '0', positives (path2) '1'.
        """
        for path, label in ((path1, '0'), (path2, '1')):
            # Fix: the second file handle was never closed in the original.
            with open(path, 'r', encoding='gb18030', errors='ignore') as f:
                for line in f:
                    self.data_list.append(line)
                    self.label_list.append(label)

    def getstopword(self, stopword_path):
        """Read the stop-word file (one word per line, kept as raw bytes)
        into self.stopword_list."""
        with open(stopword_path, 'rb') as stop_file:
            self.stopword_list = stop_file.read().splitlines()

    def tfidf_set(self):
        """Fit a TF-IDF vectorizer on data_list and store the vectorizer,
        the sparse weight matrix and the learned vocabulary."""
        vectorizer = TfidfVectorizer(stop_words=self.stopword_list,
                                     sublinear_tf=True, max_features=10000)
        self.vectorizer = vectorizer
        # Sparse (n_docs, n_features) TF-IDF matrix.
        self.weight = vectorizer.fit_transform(self.data_list)
        # term -> column-index mapping learned from the corpus.
        self.myvocabulary = vectorizer.vocabulary_

· 逻辑回归函数logistic_regression.py:

import textprocess
import math
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
# --- Load and vectorize the training set --------------------------------
train = textprocess.Textprocess()
train_neg_seg = r'.\酒店评价\train_neg_seg'
train_pos_seg = r'.\酒店评价\train_pos_seg'
train.data_set(train_neg_seg, train_pos_seg)
train.getstopword('stopword.txt')
train.tfidf_set()

myvocabulary = train.myvocabulary
train_tfidf = pd.DataFrame(train.weight.toarray(),
                           columns=train.vectorizer.get_feature_names())
train_weight = train_tfidf.values     # dense (n_samples, n_features) array
train_label = train.label_list        # '0' / '1' strings

# --- Vectorize the test set with the TRAINING vocabulary ----------------
test = textprocess.Textprocess()
test_pos_seg = r'.\酒店评价\test_pos_seg'
test_neg_seg = r'.\酒店评价\test_neg_seg'
test.data_set(test_neg_seg, test_pos_seg)
vectorizer = TfidfVectorizer(vocabulary=myvocabulary)
test_tfidf = vectorizer.fit_transform(test.data_list)
test_weight = pd.DataFrame(test_tfidf.toarray(),
                           columns=vectorizer.get_feature_names())

test_weight = test_weight.values
test_label = test.label_list

# --- Model parameters ---------------------------------------------------
# Fix: derive the counts from the data instead of hard-coding 5798/10000,
# so the script survives a change of corpus or max_features.
num_samples, num_feature = train_weight.shape
weight = np.ones(num_feature)   # initial coefficients w
bias = 1.0                      # initial intercept b

max_iter = 500   # epochs over the training set
lr = 0.01        # learning rate (alpha in the derivation)

def sigmoid(x, w, b):
    """Return P(Y = 1 | x) = 1 / (1 + e^-(w·x + b)).

    Numerically stable: the original ``math.exp(-(w·x + b))`` raised
    OverflowError when w·x + b was a large negative number.  For z < 0 we
    use the equivalent form exp(z) / (1 + exp(z)), whose exponent can only
    underflow to 0.0, never overflow.
    """
    z = np.dot(x, w) + b
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    ez = math.exp(z)
    return ez / (1.0 + ez)

def train_model(w, b):
    """Train logistic-regression parameters by stochastic gradient ascent.

    For ``max_iter`` epochs, update (w, b) one sample at a time:
        w += lr * (y - y_hat) * x
        b += lr * (y - y_hat)
    where y_hat = sigmoid(w·x + b).  Returns the trained (w, b).
    """
    # Fix: derive the sample count from the data rather than relying on the
    # hard-coded module-level ``num_samples``.
    n = len(train_weight)
    for _ in range(max_iter):
        for j in range(n):
            y_hat = sigmoid(train_weight[j, :], w, b)
            error = int(train_label[j]) - y_hat   # (y - y_hat)
            w = w + lr * train_weight[j, :] * error
            b = b + lr * error
    return w, b

def predict(new_w, new_b):
    """Classify every test sample with the trained parameters and return
    the accuracy on the test set."""
    n_test = len(test_weight)
    correct = 0
    for idx in range(n_test):
        prob = sigmoid(test_weight[idx, :], new_w, new_b)
        # Threshold P(Y=1 | x) at 0.5 to obtain the predicted label.
        predicted = 1 if prob > 0.5 else 0
        if predicted == int(test_label[idx]):
            correct += 1
    return float(correct) / n_test

# Train on the full training set, then report accuracy on the test set.
new_w, new_b = train_model(weight, bias)
acc = predict(new_w, new_b)
print(acc)

最终正确率为0.8465346534653465。

三、多元逻辑回归(未完待续)

  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值