导入必要的库
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import seaborn as sns
from wordcloud import WordCloud
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors, GloVe
定义加载文本数据的函数
def load_text_data(path):
    """Load IMDB-style reviews stored under *path*.

    Expects two subdirectories, ``pos`` and ``neg``, each containing
    ``*.txt`` files (one review per file).

    Parameters
    ----------
    path : str
        Root directory containing the ``pos`` and ``neg`` subfolders.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Review texts and integer labels (1 = positive, 0 = negative),
        in the order ``pos`` files first, then ``neg`` files.
    """
    text_data = []
    label = []
    for dset in ["pos", "neg"]:
        path_dset = os.path.join(path, dset)
        # Decide the label once per folder instead of per file.
        tag = 1 if dset == "pos" else 0
        for fname in os.listdir(path_dset):
            if fname.endswith(".txt"):
                filename = os.path.join(path_dset, fname)
                # Explicit encoding: the original relied on the platform
                # default locale, which breaks on non-UTF-8 systems.
                with open(filename, encoding="utf-8") as f:
                    text_data.append(f.read())
                label.append(tag)
    return np.array(text_data), np.array(label)
读取训练集和测试集
# Load the IMDB training and test splits from disk.
# NOTE(review): these paths use "data/chap" while the CSVs below are
# written under "data/chap6" — confirm which directory is intended.
train_path = "data/chap/imdb/train"
test_path = "data/chap/imdb/test"
train_text, train_label = load_text_data(train_path)
test_text, test_label = load_text_data(test_path)
print(len(train_text), len(train_label))
print(len(test_text), len(test_label))
文本预处理
def text_preprocess(text_data):
    """Clean raw review strings ahead of tokenisation.

    For each text: remove HTML line breaks (``<br /><br />``),
    lowercase, strip digits, and delete all punctuation except the
    apostrophe (kept so contractions like "don't" survive until
    tokenisation), then trim surrounding whitespace.

    Parameters
    ----------
    text_data : iterable of str
        Raw review texts.

    Returns
    -------
    np.ndarray of str
        Cleaned texts, one per input.
    """
    # Build the punctuation-deletion table once, not once per text.
    punct_table = str.maketrans("", "", string.punctuation.replace("'", ""))
    text_pre = []
    for text1 in text_data:
        text1 = re.sub("<br /><br />", " ", text1)
        text1 = text1.lower()
        # Raw string: "\d" is an invalid escape in a plain string
        # literal (SyntaxWarning on modern Python).
        text1 = re.sub(r"\d+", "", text1)
        text1 = text1.translate(punct_table).strip()
        text_pre.append(text1)
    return np.array(text_pre)
# Apply the cleaning step to both splits.
train_text_pre = text_preprocess(train_text)
test_text_pre = text_preprocess(test_text)
去除停用词和词干提取
def stop_stem_word(datalist, stop_words):
    """Tokenise each text and drop stop words and contraction leftovers.

    NOTE(review): despite the name, no stemming is performed —
    ``PorterStemmer`` is imported at the top of the file but never used
    here. Confirm whether stemming was intended.

    Parameters
    ----------
    datalist : iterable of str
        Cleaned review texts.
    stop_words : set of str
        Lower-case stop words to remove.

    Returns
    -------
    list of list of str
        Token lists, one per input text.
    """
    datalist_pre = []
    for text in datalist:
        # Single pass over the tokens (the original filtered twice):
        # keep a token only if it is not a stop word and contains no
        # apostrophe (drops "n't", "'s" etc. left by the tokenizer).
        words = [
            w for w in word_tokenize(text)
            if w.lower() not in stop_words and "'" not in w
        ]
        datalist_pre.append(words)
    return datalist_pre
# Tokenise both splits and remove English stop words.
stop_words = set(stopwords.words("english"))
train_text_pre2 = stop_stem_word(train_text_pre, stop_words)
test_text_pre2 = stop_stem_word(test_text_pre, stop_words)
# Show one processed example from each split.
# Fix: the original printed the *pre-tokenisation* train text
# (train_text_pre) next to the tokenised test text, so the two
# outputs were not comparable.
print(train_text_pre2[10000])
print("=" * 10)
print(test_text_pre2[10000])
保存预处理后的文本到CSV
## Persist the processed splits as CSV files (tokens re-joined by spaces).
train_texts = [" ".join(words) for words in train_text_pre2]
traindatasave = pd.DataFrame({"text": train_texts, "label": train_label})
traindatasave.to_csv("data/chap6/imdb_train.csv")
test_texts = [" ".join(words) for words in test_text_pre2]
testdatasave = pd.DataFrame({"text": test_texts, "label": test_label})
testdatasave.to_csv("data/chap6/imdb_test.csv")
## Collect the raw text, token lists and labels into one table.
traindata = pd.DataFrame({
    "train_text": train_text,
    "train_word": train_text_pre2,
    "train_label": train_label,
})
## Number of tokens kept per review.
train_word_num = [len(words) for words in train_text_pre2]
traindata["train_word_num"] = train_word_num
## Plot the distribution of review lengths (in tokens).
plt.figure(figsize=(8, 5))
plt.hist(train_word_num, bins=100)
plt.xlabel("word number")
plt.ylabel("Freq")
plt.show()
数据可视化
## Word clouds contrasting vocabulary frequency between the two sentiments.
plt.figure(figsize=(16, 10))
for sentiment in np.unique(train_label):
    ## Gather every token of all reviews carrying this label.
    word_lists = np.array(traindata.train_word[traindata.train_label == sentiment])
    corpus = " ".join(np.concatenate(word_lists))
    plt.subplot(1, 2, sentiment + 1)
    ## Render the word cloud for this sentiment.
    cloud = WordCloud(
        margin=5,
        width=1800,
        height=1000,
        max_words=500,
        min_font_size=5,
        background_color="white",
        max_font_size=150,
    )
    cloud.generate_from_text(corpus)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Positive" if sentiment == 1 else "Negative")
plt.subplots_adjust(wspace=0.05)
plt.show()