LSTM 文本分类(PyTorch)

本文介绍如何利用PyTorch的LSTM模型进行文本分类任务,详细阐述了从数据预处理到模型训练的全过程,适合对深度学习和自然语言处理感兴趣的读者。
摘要由CSDN通过智能技术生成
# -*- coding: utf-8 -*-

import re
import string
import jieba
import copy
import time
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
    
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from torchtext import data
from torchtext.vocab import Vectors
import pandas as pd

# Load the whole corpus from CSV; the first row is the header.
# NOTE(review): this rebinds `data`, hiding the `torchtext.data` module that is
# imported under the same name above — confirm nothing later needs the module.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (successor: `on_bad_lines='skip'`) — confirm the pandas version in use.
data = pd.read_csv(
    'all_data.csv',
    header=0,
    encoding='utf-8',
    error_bad_lines=False,
)

# Shuffle every row. reset_index() renumbers the rows from 0 and keeps the
# pre-shuffle labels in a new 'index' column.
shuffle_df = data.sample(frac=1).reset_index()

# Row positions that cut the shuffled frame into 70% / 15% / 15%.
n_rows = len(shuffle_df)
train_end = int(n_rows * 0.7)
val_end = int(n_rows * 0.85)

train_df = shuffle_df.iloc[:train_end, :]
val_df = shuffle_df.iloc[train_end:val_end, :]
test_df = shuffle_df.iloc[val_end:, :]

# Read the stop-word file: one entry per line, surrounding whitespace removed.
# The `with` block guarantees the handle is closed once the list is built.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm it matches how stopwords.txt was saved.
with open("stopwords.txt") as f:
    stop_word_list = [line.strip() for line in f]

import re
import jieba

# Set form of the stop-word list, so each token check is a hash lookup.
stop_word = set(stop_word_list)


def chinese_pre(text_data):
    """Clean one Chinese text and return it as space-separated tokens.

    The text is lower-cased, runs of digits are deleted, and the remainder is
    segmented with jieba in accurate mode (``cut_all=False``). Every token is
    stripped of surrounding whitespace; tokens that are empty after stripping
    or that appear in ``stop_word`` are dropped.

    Args:
        text_data: The raw text to clean (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Lower-case any Latin letters and delete digit runs.
    text_data = text_data.lower()
    text_data = re.sub(r"\d+", "", text_data)
    # Strip BEFORE filtering: jieba yields whitespace as tokens of its own, so
    # filtering first would let them through as empty strings (doubled spaces
    # in the joined result) and would miss stop words padded with whitespace.
    tokens = (word.strip() for word in jieba.cut(text_data, cut_all=False))
    kept = [word for word in tokens if word and word not in stop_word]
    # Join the remaining tokens into one space-separated string.
    return " ".join(kept)

# Segment the training texts; the row position is printed as a progress trace.
train_test_list = list(train_df['text'])
train_cutword_list = []
for i, raw_text in enumerate(train_test_list):
    print(i)
    train_cutword_list.append(chinese_pre(raw_text))

# Same treatment for the validation texts.
val_text_list = list(val_df['text'])
val_cutword_list = []
for i, raw_text in enumerate(val_text_list):
    print(i)
    val_cutword_list.append(chinese_pre(raw_text))

# Same treatment for the test texts.
test_text_list = list(test_df['text'])
test_cutword_list = []
for i, raw_text in enumerate(test_text_list):
    print(i)
    test_cutword_list.append(chinese_pre(raw_text))

# Attach the segmented text to the training rows. The list is in row order,
# so assigning it directly pairs each segmented text with its own row.
train_df['cutword'] = train_cutword_list

# The validation slice still carries the row labels of the shuffled frame and
# the 'index' column created by the shuffle step: discard that column and
# renumber the rows from 0. `axis` is passed by keyword because pandas 2.0
# removed the positional form.
val_df = val_df.drop('index', axis=1).reset_index(drop=True)
val_df['cutword'] = val_cutword_list
# Bare expression: displays the frame when run as a notebook cell.
val_df


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值