Sogou News Text Classification: Complete Code / Dataset / Experiment Report

Dataset: http://t.csdnimg.cn/rP6qx

Experiment report: http://t.csdnimg.cn/1NuSa

import os
import re

import pandas as pd
import numpy as np
import jieba
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from wordcloud import WordCloud
from prettytable import PrettyTable

matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use the Chinese font "SimHei"
matplotlib.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly in saved figures



# Read the data
data_path = 'CH10data/搜狗文本分类语料库迷你版'  # change to your dataset path
categories = ['体育', '健康', '军事', '教育', '汽车']
df = pd.DataFrame()

# DataFrame.append was removed in pandas 2.0, so rows are accumulated with pd.concat instead
for idx, category in enumerate(categories):
    for filename in os.listdir(os.path.join(data_path, category)):
        with open(os.path.join(data_path, category, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            new_row = pd.DataFrame({'text': [text], 'label': [category]})
            df = pd.concat([df, new_row], ignore_index=True)
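
# --- Added sanity check (not in the original script) ---
# Confirm that documents from all five categories were loaded and inspect
# the class balance before any preprocessing.
print(df.shape)
print(df['label'].value_counts())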
# Load the stopword list
stopwords_path = '百度停用词表.txt'
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stopwords = set(line.strip() for line in file)

# Text cleaning: strip extraction artifacts and normalize date/time expressions
def clean_text(text):
    # Remove non-breaking and full-width spaces (HTML &nbsp;-style residue)
    text = text.replace('\xa0', '').replace('\u3000', '')

    # Optionally remove special symbols such as <, >, &
    # text = re.sub(r'[<>&]', '', text)

    # Optionally collapse redundant whitespace, including leading/trailing spaces
    # text = re.sub(r'\s+', ' ', text).strip()

    # Normalize date and time expressions; the patterns below should be
    # adapted to whatever formats actually occur in the corpus
    text = re.sub(r'\b\d{4}-\d{1,2}-\d{1,2}\b', 'DATE', text)  # e.g. YYYY-MM-DD dates
    text = re.sub(r'\b\d{1,2}:\d{1,2}\b', 'TIME', text)        # e.g. HH:MM times

    return text


# Clean the raw text first, then segment with jieba and drop stopwords.
# (The original segmented before cleaning and then had to segment a second
# time; cleaning first means the text is tokenized only once.)
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in jieba.cut(x) if word not in stopwords))
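
# --- Added illustration of the preprocessing pipeline; the sample sentence
# --- below is made up and is not from the Sogou corpus.
sample = '2024-01-01 12:30 这是一条用于演示的测试新闻'
cleaned = clean_text(sample)
print(cleaned)  # the date becomes DATE, the time becomes TIME
print(' '.join(w for w in jieba.cut(cleaned) if w not in stopwords))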


# TF-IDF feature extraction
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(df['text'])

# TF (raw term-count) feature extraction
vectorizer_tf = CountVectorizer()
X_tf = vectorizer_tf.fit_transform(df['text'])
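
# --- Added inspection of the extracted features ---
# Both matrices are sparse, with one row per document and one column per
# vocabulary term; the exact shapes depend on the corpus.
print('TF matrix shape:', X_tf.shape)
print('TF-IDF matrix shape:', X_tfidf.shape)
print('Vocabulary size:', len(vectorizer_tf.vocabulary_))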

# Encode the string labels as integers
le = LabelEncoder()
y = le.fit_transform(df['label'])
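
# --- Added check of the label-to-id mapping ---
# LabelEncoder assigns ids in sorted order of the class names; printing the
# mapping makes the later inverse_transform calls easier to follow.
print(dict(zip(le.classes_, le.transform(le.classes_))))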

# Train/test split (the same random_state keeps the TF and TF-IDF splits aligned)
X_train_tf, X_test_tf, y_train, y_test = train_test_split(X_tf, y, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Model training and evaluation
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, multi_class='ovr', random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    # "SVM": SVC(kernel='linear', probability=True, decision_function_shape='ovr', random_state=42)
}

def train_and_evaluate_models(models, X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred, average='weighted'),
            'AUC': roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        }
    return results



results_tf = train_and_evaluate_models(models, X_train_tf, X_test_tf, y_train, y_test)
results_tfidf = train_and_evaluate_models(models, X_train_tfidf, X_test_tfidf, y_train, y_test)

# Collect the results in a table
table = PrettyTable()
table.field_names = ["Model", "TF Accuracy", "TF F1 Score", "TF AUC",
                     "TF-IDF Accuracy", "TF-IDF F1 Score", "TF-IDF AUC"]

for model_name in results_tf:
    table.add_row([
        model_name,
        f"{results_tf[model_name]['Accuracy']:.4f}",
        f"{results_tf[model_name]['F1 Score']:.4f}",
        f"{results_tf[model_name]['AUC']:.4f}",
        f"{results_tfidf[model_name]['Accuracy']:.4f}",
        f"{results_tfidf[model_name]['F1 Score']:.4f}",
        f"{results_tfidf[model_name]['AUC']:.4f}"
    ])

# Print the table (the original left this call commented out, so the
# table was built but never shown)
print(table)


# Optional visualization: generate a word cloud for each category
# for category in categories:
#     # Collect all texts belonging to the current category
#     text_list = df[df['label'] == category]['text'].tolist()
#     # Join them into a single string
#     text = ' '.join(text_list)
#
#     # Build the word cloud; fix random_state for reproducibility and
#     # disable collocations so bigrams are not counted as single words
#     wordcloud = WordCloud(width=800, height=400, background_color='white',
#                           font_path='E:\\python_demo2023\\自然语言处理3qimo\\CH10sentimentanalysis\\SimHei.ttf',
#                           random_state=42, collocations=False).generate(text)
#
#     # Display the word cloud
#     plt.figure(figsize=(10, 5))
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis('off')  # hide the axes
#     plt.title(f'Word Cloud for Category: {category}')
#     plt.show()

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad/truncate so every sequence has the same length
max_length = 100  # adjust as appropriate for the corpus
X = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
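
# --- Added look at tokenization and padding for the first document ---
print('First 10 token ids:', sequences[0][:10])
print('Length after padding/truncation:', len(X[0]))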

# One-hot encode the labels; pd.get_dummies orders its columns by sorted
# label, which matches the integer ids assigned by the LabelEncoder above
labels = pd.get_dummies(df['label']).values

# Split the data, also keeping each test row's DataFrame index so that
# misclassified samples can be traced back to their original texts later
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, labels, df.index.values, test_size=0.2, random_state=42)

# Build the LSTM model
embedding_dim = 100  # adjust as needed
lstm_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(labels.shape[1], activation='softmax')  # one output neuron per class
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
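
# --- Added: print the layer-by-layer architecture and parameter counts
# --- before training, as a quick configuration check.
lstm_model.summary()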
# Train the model
history = lstm_model.fit(X_train, y_train, epochs=9, batch_size=64, validation_split=0.2)
# Evaluate on the held-out test set
_, accuracy = lstm_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Predict class probabilities once, then derive the most likely class
# (the original called predict twice for the same result)
y_pred_proba = lstm_model.predict(X_test)
y_pred_classes = np.argmax(y_pred_proba, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Macro-averaged F1 score
f1_macro = f1_score(y_true_classes, y_pred_classes, average='macro')
print(f"Macro F1 Score: {f1_macro:.4f}")

# One-vs-rest AUC over the predicted class probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print(f"AUC: {roc_auc:.4f}")

# Plot training loss and accuracy
plt.figure(figsize=(12, 4))

# Loss curves
plt.subplot(1, 2, 1)  # first plot of a 1x2 grid
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Accuracy curves
plt.subplot(1, 2, 2)  # second plot of a 1x2 grid
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


# Error analysis on the misclassified test samples
# (predictions and true classes were already computed above)
true_labels = y_true_classes

# Count the misclassified samples
incorrect_count = np.sum(y_pred_classes != true_labels)
print(f"Number of misclassified samples: {incorrect_count}")

# Collect information about each misclassified sample; idx indexes into the
# test set, and idx_test maps it back to the corresponding row of df
# (the original used df.iloc[idx], which pointed at the wrong rows because
# train_test_split shuffles the data)
incorrect_samples = []
for idx in np.where(y_pred_classes != true_labels)[0]:
    true_label_idx = true_labels[idx]
    pred_label_idx = y_pred_classes[idx]
    incorrect_samples.append({
        'index': idx,
        'true_label': le.inverse_transform([true_label_idx])[0],
        'predicted_label': le.inverse_transform([pred_label_idx])[0],
        'text': df.loc[idx_test[idx], 'text']
    })

# Review the misclassified samples
def review_incorrect_samples(incorrect_samples):
    print("Misclassified samples and their labels:")
    for sample in incorrect_samples:
        print(f"Test-set index: {sample['index']}")
        print(f"Text: {sample['text']}")
        print(f"True label: {sample['true_label']}, predicted label: {sample['predicted_label']}")
        print("-" * 60)

review_incorrect_samples(incorrect_samples)
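
# --- Added (optional): persist the misclassified samples for offline review;
# --- the output filename here is illustrative.
pd.DataFrame(incorrect_samples).to_csv('incorrect_samples.csv', index=False, encoding='utf-8-sig')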

# Possible deeper error analysis: keyword extraction, feature-importance inspection, etc.
