This article covers the following steps:
1. Download the dataset; this time we use the [IMDB] dataset provided by Stanford
2. Preprocess the dataset
3. Build the network model
4. Train and test
1. Dataset Preprocessing
First, load the dataset. My project directory layout is as follows:
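The code below assumes the standard aclImdb directory structure, unpacked into a folder named Imdb (this layout is inferred from the paths the code builds with os.path.join):

Imdb/
├── train/
│   ├── pos/    # positive reviews, one .txt file per review
│   └── neg/    # negative reviews
└── test/
    ├── pos/
    └── neg/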
import os
import re

# Read the text data; data has the form [[['hello', 'word'], label], ...]
def load_data(path, flag='train'):
    labels = ['pos', 'neg']
    data = []
    for label in labels:
        files = os.listdir(os.path.join(path, flag, label))
        # regex used to strip punctuation
        r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n。!,]+'
        for file in files:
            with open(os.path.join(path, flag, label, file), 'r', encoding='utf8') as rf:
                temp = rf.read().replace('\n', '')
                temp = temp.replace('<br /><br />', ' ')
                temp = re.sub(r, '', temp)
                temp = temp.split(' ')
                temp = [word.lower() for word in temp if word != '']
                if label == 'pos':
                    data.append([temp, 1])
                elif label == 'neg':
                    data.append([temp, 0])
    return data
train_data = load_data('Imdb', 'train')
test_data = load_data('Imdb', 'test')
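As a quick sanity check, each split of the standard aclImdb release contains 25,000 labeled reviews; a minimal sketch (the expected counts are an assumption about which release you downloaded):

# Sanity check; counts assume the standard aclImdb release (25,000 reviews per split).
print(len(train_data))   # expected: 25000
print(len(test_data))    # expected: 25000
print(train_data[0][0][:10], train_data[0][1])  # first 10 tokens and the label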
With the dataset loaded, the next step is to count word frequencies and build a vocabulary.
def fit(sentence):
    """
    Count word frequencies.
    :param sentence: a tokenized sentence (a list of words)
    :return:
    """
    for word in sentence:
        # dict.get(key, default=None) returns the value for key,
        # or the default value if key is not in the dictionary
        word_count[word] = word_count.get(word, 0) + 1
def build_vocab(min_count=5, max_count=1000, max_features=25000):
    """
    :param min_count: minimum word frequency (words appearing min_count times or fewer are dropped)
    :param max_count: maximum word frequency (words appearing more often are dropped)
    :param max_features: maximum vocabulary size
    :return:
    """
    global word_count
    global word_idx
    if min_count is not None:
        word_count = {
            word: count for word, count in word_count.items() if count > min_count}
    if max_count is not None:
        word_count = {
            word: count for word, count in word_count.items() if count <= max_count}
    if max_features is not None:
        # sort by frequency and keep the max_features most frequent words
        word_count = dict(sorted(word_count.items(), key=lambda x: x[-1], reverse=True)[:max_features])
    for word in word_count:
        # map each word to its own id, handed out sequentially
        word_idx[word] = len(word_idx)
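fit and build_vocab read and write two module-level dictionaries, word_count and word_idx, which must exist before either function is called. A minimal driver sketch (the dictionary names come from the code above; building the vocabulary from the training split only is an assumption):

word_count = {}  # word -> frequency, filled by fit()
word_idx = {}    # word -> integer id, filled by build_vocab()

# Assumption: the vocabulary is built from the training split only.
for sentence, label in train_data:
    fit(sentence)
build_vocab(min_count=5, max_count=1000, max_features=25000)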
Then the sentences in the dataset need to be converted from word form into number form.
def transform(sentence, max_len=200):
    """
    Convert a sentence into a sequence of numbers.
    :param sentence:
    :param max_len: maximum sentence length
    :return: