Python text classification summary: Naive Bayes, logistic regression, decision tree, random forest, SVM, word vectors, TF-IDF, neural networks, CNN, LSTM, GRU, bidirectional RNN, LDA

This article runs text classification experiments with a range of machine learning and deep learning methods, including TF-IDF, word vectors, CNN and GRU, and compares the results of the different models.


Preface

Results of the previous article's text classification experiments
First published: 2020-09-27
Detailed results and code for this article's experiments
Python version used here: 3.7.4
sklearn version: 0.21.3
Keras version: 2.3.1

Corpus

Statistics

Category        Count    Category    Count
science         2093     car         2066
finance         2052     sports      2017
military        2007     medicine    2000
entertainment   1906     politics    1865
education       1749     fashion     1712

Is the data to be classified known in advance?

  • In real work, the data to be classified may or may not be available in advance, and the supervised-learning workflow differs accordingly (a code sketch follows the diagram summary below)
  • In this experiment, the data to be classified is available in advance
(Workflow diagrams, summarized:)
  • Data to classify unknown: labeled data → train → supervised model → predict on new, unseen data
  • Data to classify known: unlabeled + labeled data → feature engineering → features → train a supervised model on the labeled features → predict on the unlabeled features
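
As a minimal sketch of the difference (hypothetical placeholder texts and a plain scikit-learn TfidfVectorizer; the experiments below loop over both strategies):

from sklearn.feature_extraction.text import TfidfVectorizer

train_texts, test_texts = ['训练 文本 ...'], ['待测 文本 ...']  # hypothetical placeholders

# Data to classify unknown: fit the vectorizer on the training texts only
vec = TfidfVectorizer()
x_train = vec.fit_transform(train_texts)
x_test = vec.transform(test_texts)  # test texts arrive later; vocabulary and IDF are already fixed

# Data to classify known: vocabulary and IDF can be fit on all texts (labels are still not used)
vec = TfidfVectorizer()
vec.fit(train_texts + test_texts)
x_train = vec.transform(train_texts)
x_test = vec.transform(test_texts)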

Code

TF-IDF + [Naive Bayes, logistic regression, decision tree, random forest, SVM]

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import jieba, warnings
from segment import tk
from data10 import load_xy
warnings.filterwarnings('ignore')  # suppress warnings
DISCARD_FLAG = {'c', 'd', 'e', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM'}  # POS tags to discard
N = 25000  # maximum vocabulary size


def cut1(text):
    for word in tk.cut(text):
        if tk.get_flag(word) not in DISCARD_FLAG:
            yield word


def cut2(text):
    # yield unigrams plus adjacent-word bigrams
    words = tk.lcut(text)
    if not words:
        return
    for i in range(len(words) - 1):
        yield words[i]
        yield words[i] + words[i + 1]
    yield words[-1]


def cut_flag(text):
    # yield each kept word followed by its POS tag
    for word in tk.cut(text):
        flag = tk.get_flag(word)
        if flag not in DISCARD_FLAG:
            yield word
            yield flag


def experiment(test_size=.5, random_state=7):
    x, (train, test, y_train, y_test) = load_xy(test_size, random_state)
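    # 10 vectorizers below: raw counts vs. TF-IDF weighting, each paired with one of
    # 5 tokenization schemes (plain tk.cut, POS-filtered cut1, unigram+bigram cut2,
    # word+POS-tag cut_flag, and jieba.cut)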
    vec_ls = (
        CountVectorizer(tokenizer=tk.cut, max_features=N),
        CountVectorizer(tokenizer=cut1, max_features=N),
        CountVectorizer(tokenizer=cut2, max_features=N),
        CountVectorizer(tokenizer=cut_flag, max_features=N),
        CountVectorizer(tokenizer=jieba.cut, max_features=N),
        TfidfVectorizer(tokenizer=tk.cut, max_features=N),
        TfidfVectorizer(tokenizer=cut1, max_features=N),
        TfidfVectorizer(tokenizer=cut2, max_features=N),
        TfidfVectorizer(tokenizer=cut_flag, max_features=N),
        TfidfVectorizer(tokenizer=jieba.cut, max_features=N),
    )
    clf_ls = (
        MultinomialNB(),
        LogisticRegression(solver='liblinear'),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        SVC(kernel='linear'),
    )
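    # First pass fits each vectorizer on the training texts only; the second pass
    # fits on all texts x, i.e. the setting where the data to classify is known in advance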
    for x_fit in (train, x):
        for vec in vec_ls:
            vec.fit(x_fit)
            x_train = vec.transform(train)
            x_test = vec.transform(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            tk.yellow(vec)


experiment()

Word vectors + TF-IDF + [logistic regression, decision tree, random forest, SVM]

from gensim.models import Word2Vec
from math import log10
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np, copy, warnings
from segment import tk, clean
from data10 import load_xy
warnings.filterwarnings('ignore')  # suppress warnings
DISCARD_FLAG = {'c', 'd', 'e', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM'}


def cut1(text):
    for word in tk.cut(text):
        if tk.get_flag(word) not in DISCARD_FLAG:
            yield word


class Word2Vector:
    def __init__(self, cut, size=75, window=7, min_count=3):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.cut = cut
        self.wv = None
        self.vectors = None
        self.w2i = None

    def unit_vector(self):
        self.vectors = self.vectors / np.linalg.norm(self.vectors, axis=1).reshape(-1, 1)

    def idf(self, texts):
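        # Weight each word vector by its inverse document frequency,
        # idf(w) = log10(N / (df(w) + 1)), so that frequent words contribute less
        # to the summed / averaged document vectors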
        texts = [set(self.cut(t)) for t in texts]
        lent = len(texts)
        self.vectors = self.vectors * np.array(
            [[log10(lent / (sum((w in t) for t in texts) + 1))] for w in self.wv.index2word])

    def fit(self, texts):
        sentences = [list(self.cut(s)) for t in texts for s in clean.text2clause(t)]  # split texts into clauses and tokenize
        wv = Word2Vec(sentences, size=self.size, window=self.window, min_count=self.min_count).wv  # train word vectors
        self.w2i = {w: i for i, w in enumerate(wv.index2word)}
        self.vectors = wv.vectors
        self.wv = wv

    def text2vector(self, texts):
        return [[self.vectors[self.w2i[w]] for w in self.cut(t) if w in self.w2i] for t in texts]

    def vector_sum(self, texts):
        return [np.sum(v, axis=0) if v else np.zeros(self.size) for v in self.text2vector(texts)]

    def vector_mean(self, texts):
        return [np.mean(v, axis=0) if v else np.zeros(self.size) for v in self.text2vector(texts)]


def experiment(test_size=.5, random_state=7):
    x, (train, test, y_train, y_test) = load_xy(test_size, random_state)
    vec_ls = (
        Word2Vector(tk.cut),
        Word2Vector(cut1),
        Word2Vector(tk.cut, 150),
        Word2Vector(cut1, 150),
    )
    clf_ls = (
        LogisticRegression(solver='liblinear'),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        SVC(kernel='linear', max_iter=500),
        SVC(max_iter=1000),
    )
    for x_fit in (train, x):
        for vec in vec_ls:
            vec.fit(x_fit)
            # sum
            tk.yellow('*****sum')
            x_train, x_test = vec.vector_sum(train), vec.vector_sum(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            # mean
            tk.yellow('*****mean')
            x_train, x_test = vec.vector_mean(train), vec.vector_mean(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            # sum unit_vector
            tk.yellow('*****sum unit_vector')
            vec1 = copy.deepcopy(vec)
            vec1.unit_vector()
            x_train, x_test = vec1.vector_sum(train), vec1.vector_sum(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            # mean unit_vector
            tk.yellow('*****mean unit_vector')
            x_train, x_test = vec1.vector_mean(train), vec1.vector_mean(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            del vec1
            # sum idf
            tk.yellow('*****sum idf')
            vec.idf(x_fit)
            x_train, x_test = vec.vector_sum(train), vec.vector_sum(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))
            # mean idf
            tk.yellow('*****mean idf')
            x_train, x_test = vec.vector_mean(train), vec.vector_mean(test)
            for clf in clf_ls:
                t0 = tk.second
                clf.fit(x_train, y_train)
                print(clf.__class__.__name__, tk.second - t0, clf.score(x_test, y_test))


experiment()

CNN, GRU, LSTM, bidirectional GRU, bidirectional LSTM

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Bidirectional, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import collections, warnings
from segment import tk, clean
from data10 import load_xy
warnings.filterwarnings('ignore')  # suppress warnings

"""Configuration"""
N = 25000  # maximum vocabulary size
input_dim = N + 1  # vocabulary size (index 0 is reserved for padding)
output_dim = 150  # word embedding dimension
kernel_size = 7  # convolution kernel size
units = filters = 64  # number of RNN units / convolution filters
maxlen = 200  # sequence length
batch_size = 128  # batch size
epochs = 999  # maximum number of training epochs
verbose = 2  # training progress display
patience = 1  # epochs with no improvement before stopping
callbacks = [EarlyStopping('val_acc', patience=patience)]
DISCARD_FLAG = {'c', 'd', 'e', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM'}  # POS tags to discard
C = ['science', 'car', 'finance', 'sports', 'military', 'medicine', 'entertainment', 'politics', 'education', 'fashion']
num_classes = len(C)


def cut1(text):
    for clause in clean.SEP45(text):
        for word in tk.cut(clause):
            if tk.get_flag(word) not in DISCARD_FLAG:
                yield word


"""Load data"""
x, (x1, x2, y1, y2) = load_xy()
y1 = to_categorical([C.index(i) for i in y1], num_classes)
y2 = to_categorical([C.index(i) for i in y2], num_classes)
# word-to-index encoding: keep the N most frequent words, indices start at 1 (0 is the padding value)
w2i = {wf[0]: e for e, wf in enumerate(collections.Counter(w for t in x for w in cut1(t)).most_common(N), 1)}
# pad
x1 = pad_sequences([[w2i[w] for w in cut1(t) if w in w2i] for t in x1], maxlen, dtype='float')
x2 = pad_sequences([[w2i[w] for w in cut1(t) if w in w2i] for t in x2], maxlen, dtype='float')
# hold out a validation set
validation_size = .1
x11, x12, y11, y12 = train_test_split(x1, y1, test_size=validation_size)


def experiment():
    for layer, layer_name in (
        (LSTM(units), 'LSTM'),
        (GRU(units), 'GRU'),
        (Bidirectional(LSTM(units)), 'BiLSTM'),
        (Bidirectional(GRU(units)), 'BiGRU'),
    ):
        tk.cyan(layer_name)
        # build the model
        model = Sequential()
        model.add(Embedding(input_dim, output_dim, input_length=maxlen, input_shape=(maxlen,)))
        model.add(layer)
        model.add(Dense(units=num_classes, activation='softmax'))
        model.compile('adam', 'categorical_crossentropy', ['acc'])
        # train and evaluate
        history = model.fit(x11, y11, batch_size, epochs, verbose, callbacks, validation_data=(x12, y12))
        e = len(history.history['acc'])
        print(model.evaluate(x2, y2, batch_size, verbose), e)
        # fold the validation set back into training for a few extra epochs, then evaluate again
        model.fit(x12, y12, batch_size, int(e * validation_size) + 1, verbose, callbacks)
        tk.yellow(model.evaluate(x2, y2, batch_size, verbose))
    # CNN
    tk.cyan('CNN')
    model = Sequential()
    model.add(Embedding(input_dim, output_dim, input_length=maxlen, input_shape=(maxlen,)))
    model.add(Conv1D(units, kernel_size * 2, padding='same', activation='relu'))
    model.add(MaxPool1D(pool_size=2))  # strides defaults to pool_size
    model.add(Conv1D(units * 2, kernel_size, padding='same', activation='relu'))
    model.add(GlobalMaxPool1D())  # global max pooling over the time dimension
    model.add(Dense(num_classes, activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', ['acc'])
    # train and evaluate
    history = model.fit(x11, y11, batch_size, epochs, verbose, callbacks, validation_data=(x12, y12))
    e = len(history.history['acc'])
    print(model.evaluate(x2, y2, batch_size, verbose), e)
    # fold the validation set back into training, then evaluate again
    model.fit(x12, y12, batch_size, int(e * validation_size) + 1, verbose, callbacks)
    tk.yellow(model.evaluate(x2, y2, batch_size, verbose))


experiment()

Word vectors + [CNN, GRU, LSTM, bidirectional GRU, bidirectional LSTM]

from gensim.models import Word2Vec
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Bidirectional, Conv1D, MaxPool1D, GlobalMaxPool1D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np, warnings
from math import log10
from segment import tk, clean
from data10 import load_xy
warnings.filterwarnings('ignore')  # suppress warnings

"""Configuration"""
size = 150  # word vector dimension
window = kernel_size = 7  # word2vec window / convolution kernel size
units = filters = 64  # number of RNN units / convolution filters
maxlen = 200  # sequence length
batch_size = 128  # batch size
epochs = 999  # maximum number of training epochs
verbose = 2  # training progress display
patience = 1  # epochs with no improvement before stopping
callbacks = [EarlyStopping('val_acc', patience=patience)]
DISCARD_FLAG = {'c', 'd', 'e', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM'}
C = ['science', 'car', 'finance', 'sports', 'military', 'medicine', 'entertainment', 'politics', 'education', 'fashion']
num_classes = len(C)


def cut1(text):
    for word in tk.cut(text):
        if tk.get_flag(word) not in DISCARD_FLAG:
            yield word


"""Load data"""
x, (x1, x2, y1, y2) = load_xy()
y1 = to_categorical([C.index(i) for i in y1], num_classes)
y2 = to_categorical([C.index(i) for i in y2], num_classes)

"""Word vectors"""
sentences = [list(cut1(s)) for t in x for s in clean.text2clause(t)]  # split texts into clauses and tokenize
wv = Word2Vec(sentences, size=size, window=window, min_count=3, sg=1).wv
vectors = wv.vectors
texts = [set(cut1(t)) for t in x]
lent = len(texts)
vectors = vectors * np.array([[log10(lent / (sum((w in t) for t in texts) + 1))] for w in wv.index2word])  # idf
w2i = {w: i for i, w in enumerate(wv.index2word)}
# pad
x1 = pad_sequences([[vectors[w2i[w]] for w in cut1(t) if w in w2i] for t in x1], maxlen, dtype='float')
x2 = pad_sequences([[vectors[w2i[w]] for w in cut1(t) if w in w2i] for t in x2], maxlen, dtype='float')
# hold out a validation set
validation_size = .1
x11, x12, y11, y12 = train_test_split(x1, y1, test_size=validation_size)


def experiment():
    for layer, layer_name in (
        (LSTM(units), 'LSTM'),
        (GRU(units), 'GRU'),
        (Bidirectional(LSTM(units), input_shape=(maxlen, size)), 'BiLSTM'),
        (Bidirectional(GRU(units), input_shape=(maxlen, size)), 'BiGRU'),
    ):
        tk.cyan(layer_name)
        # build the model
        model = Sequential()
        model.add(layer)
        model.add(Dense(units=num_classes, activation='softmax'))
        model.compile('adam', 'categorical_crossentropy', ['acc'])
        # train and evaluate
        history = model.fit(x11, y11, batch_size, epochs, verbose, callbacks, validation_data=(x12, y12))
        e = len(history.history['acc'])
        print(model.evaluate(x2, y2, batch_size, verbose), e)
        # fold the validation set back into training for a few extra epochs, then evaluate again
        model.fit(x12, y12, batch_size, int(e * validation_size) + 1, verbose, callbacks)
        tk.yellow(model.evaluate(x2, y2, batch_size, verbose))
    # CNN
    tk.cyan('CNN')
    model = Sequential()
    model.add(Conv1D(units, kernel_size * 2, padding='same', activation='relu'))
    model.add(MaxPool1D(pool_size=2))  # strides defaults to pool_size
    model.add(Conv1D(units * 2, kernel_size, padding='same', activation='relu'))
    model.add(GlobalMaxPool1D())  # global max pooling over the time dimension
    model.add(Dense(num_classes, activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', ['acc'])
    # train and evaluate
    history = model.fit(x11, y11, batch_size, epochs, verbose, callbacks, validation_data=(x12, y12))
    tk.yellow(model.evaluate(x2, y2, batch_size, verbose))
    # fold the validation set back into training, then evaluate again
    model.fit(x12, y12, batch_size, len(history.history['acc']) - 1, verbose, callbacks)
    tk.yellow(model.evaluate(x2, y2, batch_size, verbose))


experiment()

Unsupervised

Word vectors + predefined rules

from gensim.models import Word2Vec
from collections import Counter
from segment import tk, clean

DISCARD_FLAG = {'c', 'd', 'e', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM'}  # POS tags to discard
THRESHOLD = .5  # similarity threshold for keyword expansion


def cut(text):
    for word in tk.cut(text):
        if tk.get_flag(word) not in DISCARD_FLAG:
            yield word


def texts2sentences(texts):
    return [list(cut(s)) for t in texts for s in clean.text2clause(t)]


class Model:
    def __init__(self, texts, keywords, themes):
        # add the seed keywords to the tokenizer's dictionary
        for word in keywords:
            tk.add_word(word)
        # train word vectors
        self.wv = Word2Vec(texts2sentences(texts), window=11, sg=1).wv
        # expand the keyword set
        self.dt = {w: {w: 1} for w in keywords}
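        # self.dt maps every word (a seed keyword, or a word whose similarity to a seed
        # keyword is >= THRESHOLD) to {seed keyword: weight}, with weight = similarity ** 2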
        for word in keywords:
            for w, s in self.wv.similar_by_word(word, 99):
                if s < THRESHOLD:
                    break
                self.dt[w] = dict({word: s ** 2}, **self.dt.get(w, dict()))
        print(self.dt)
        self.themes = themes

    def ner(self, text):
        # count occurrences of the expanded keywords
        c1 = Counter(w for w in cut(text) if w in self.dt)
        # map expanded keywords back to the original seed keywords, weighted
        c2 = Counter()
        for k1, v1 in c1.items():
            for k2, v2 in self.dt[k1].items():
                c2[k2] += v1 * v2
        return c2.most_common()

    def extract_theme(self, text):
        themes = Counter()
        for w, f in self.ner(text):
            themes[self.themes[w]] += f
        return themes and themes.most_common()[0][0]


# load data
from pandas import read_excel
ay = read_excel('data10/data10.xlsx').values
x, y = ay[:, 0], ay[:, 1]
WORDS = {
    '车': 'car', '车型': 'car', '汽车': 'car',
    '教育': 'education', '学生': 'education', '学校': 'education', '考生': 'education',
    '娱乐': 'entertainment', '观众': 'entertainment', '饰演': 'entertainment', '演员': 'entertainment',
    '电影': 'entertainment', '角色': 'entertainment', '影片': 'entertainment', '导演': 'entertainment',
    '肌肤': 'fashion', '时尚': 'fashion', '搭配': 'fashion', '穿': 'fashion', '裙': 'fashion',
    '珠宝': 'fashion', '装修': 'fashion', '保湿': 'fashion', '时髦': 'fashion', '面膜': 'fashion', '香水': 'fashion',
    '比赛': 'sports', '球队': 'sports', '队': 'sports', '球员': 'sports', '体育': 'sports', '赛季': 'sports',
    '症状': 'medicine', '治疗': 'medicine', '临床': 'medicine',
    '疾病': 'medicine', '患者': 'medicine', '综合征': 'medicine',
    '科技': 'science', '研究': 'science', 'iPhone': 'science', '手机': 'science',
}
# run the model
model = Model(x, WORDS.keys(), WORDS)
y_predict = [model.extract_theme(i) for i in x]  # predict a category for every text
print(sum(y == y_predict) / len(y))
# interactive inspection
while True:
    try:
        t = x[int(input('Enter an index: ').strip())]
        print(t, '\n\033[033m{}\033[0m'.format(model.ner(t)))
    except (ValueError, IndexError):
        pass

Topic models

LDA is garbage.
Topic models are garbage.

from gensim import corpora, models
from segment import tk, clean
import numpy as np

DISCARD_FLAG = {'a', 'ad', 'c', 'd', 'e', 'f', 'i', 'l', 'nb', 'm', 'o', 'p', 'r', 'u', 'uv', 'y', 'NUM', 'NA'}


def cut(text):
    for clause in clean.text2clause(text):
        for word in tk.cut(clause):
            if (tk.get_flag(word) not in DISCARD_FLAG) and (len(word) > 1):
                yield word


def experiment(texts, themes):
    words_ls = [list(cut(t)) for t in texts]
    # build the dictionary
    dictionary = corpora.Dictionary(words_ls)
    # use the dictionary to turn each document's words into a sparse bag-of-words vector
    corpus = [dictionary.doc2bow(words) for words in words_ls]
    # LDA model; num_topics sets the number of topics
    lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    # print every topic, showing 20 words per topic
    topics = lda.print_topics(num_words=20)
    for topic in topics:
        print(topic)
    # map each topic number to a class label (entered manually)
    dt = dict()
    _themes = ['science', 'car', 'finance', 'sports', 'military', 'medicine',
               'entertainment', 'politics', 'education', 'fashion']
    for i in range(10):
        print(topics[i])
        j = int(input('{}'.format([k for k in enumerate(_themes)])).strip())
        dt[i] = _themes[j]
        del _themes[j]

    # topic inference
    inference = np.argmax(lda.inference(corpus)[0], axis=1)
    inference = [dt[i] for i in inference]
    print(np.mean(themes == inference))


# load data
from pandas import read_excel
ay = read_excel('data10/data10.xlsx').values
x, y = ay[:, 0], ay[:, 1]
experiment(x, y)

Results and conclusions

  • Logistic regression is the best overall (2nd-highest peak accuracy, highest average accuracy, stable results, acceptable speed)
  • Bag-of-words (one-hot) features beat word vectors
  • For bag-of-words features, TF-IDF beats raw counts
  • Linear SVM takes too long to train and its results are unstable (see the sketch below)
  • Bag-of-words TF-IDF + linear SVM gives the single best accuracy
  • The right number of training epochs for deep learning is hard to pin down, which makes its results unstable
  • With relatively little data, deep learning is less accurate than classic machine learning
  • Word vectors amount to dimensionality reduction; in that low-dimensional space, logistic regression and linear SVM are mediocre, while RBF-kernel SVM does better but is unstable
  • Among the unsupervised approaches, word vectors + an expert rule system reach 76% accuracy
  • Topic models are garbage! LDA is garbage
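
On the slow linear SVM: a possible workaround, not part of the experiments above, is scikit-learn's LinearSVC, which is liblinear-based and usually much faster than SVC(kernel='linear') on high-dimensional sparse TF-IDF features. A minimal sketch, assuming the x_train / x_test TF-IDF matrices and y_train / y_test labels from the first experiment:

from sklearn.svm import LinearSVC

clf = LinearSVC(max_iter=2000)  # liblinear solver; typically much faster than SVC(kernel='linear') on sparse text features
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))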