数据源:imdb.com
预处理
分词
词的数字化表示方法与词嵌入
更合理的方案
http://word2vec.googlecode.com/svn/trunk/ (Google Code 已停止服务,存档地址:https://code.google.com/archive/p/word2vec/)
https://nlp.stanford.edu/projects/glove/
https://nlp.stanford.edu/projects/glove/
IMDB数据集获取与处理(非TF集成模式)
数据读取
import numpy as np
import tensorflow as tf
import os
import re
def remove_tags(text):
    """Strip HTML-style tags such as ``<br />`` from *text*.

    Args:
        text: The raw string to clean.

    Returns:
        ``text`` with every ``<...>`` span (one or more non-``>`` characters
        between angle brackets) removed.
    """
    # The ``+`` quantifier is essential: without it the pattern only matches
    # tags with exactly one character between the brackets, so multi-character
    # tags like ``<br />`` would be left in the text.
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)
def read_files(filetype, path='./aclImdb/'):
    """Load the review texts and one-hot labels for one split of the dataset.

    Every file under ``<path>/<filetype>/pos`` is read first, followed by
    every file under ``<path>/<filetype>/neg``. A short summary of the file
    counts is printed.

    Args:
        filetype: Name of the split sub-directory, e.g. ``"train"`` or
            ``"test"``.
        path: Root directory of the dataset. Defaults to ``'./aclImdb/'``.

    Returns:
        A tuple ``(all_labels, all_texts)``. ``all_labels`` holds one
        ``[1, 0]`` per positive file followed by one ``[0, 1]`` per negative
        file; ``all_texts`` holds the tag-stripped file contents in the same
        order.
    """
    # Collect the paths of the positive reviews, then the negative ones.
    positive_path = os.path.join(path, filetype, 'pos')
    pos_files = [os.path.join(positive_path, f)
                 for f in os.listdir(positive_path)]
    negative_path = os.path.join(path, filetype, 'neg')
    neg_files = [os.path.join(negative_path, f)
                 for f in os.listdir(negative_path)]

    file_list = pos_files + neg_files
    pos_files_num = len(pos_files)
    neg_files_num = len(neg_files)
    print('read', filetype, 'files:', len(file_list))
    print(pos_files_num, 'POS FILES IN', filetype, 'files')
    print(neg_files_num, 'neg files in', filetype, 'files')

    # One-hot labels. Each row is built separately: ``[[1, 0]] * n`` would
    # repeat the *same* list object, so mutating one label would change all.
    all_labels = ([[1, 0] for _ in range(pos_files_num)]
                  + [[0, 1] for _ in range(neg_files_num)])

    # Read every file and strip markup such as '<br />' with remove_tags.
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf-8') as file_input:
            all_texts.append(remove_tags(" ".join(file_input.readlines())))
    return all_labels, all_texts
# Load the labels and texts of the training and test splits.
train_labels, train_texts = read_files("train")
test_labels, test_tests = read_files("test")

# Preview the samples at index 0 and index 12500 of each split.
# NOTE(review): the captions treat index 12500 as a negative review, which
# presumably relies on each split listing 12,500 positive files first - confirm.
print("训练数据")
for _caption, _index in (("正面评价:", 0), ("负面评价:", 12500)):
    print(_caption)
    print(train_texts[_index])
    print(train_labels[_index])

print('==' * 50)

print("测试数据")
for _caption, _index in (("正面评价:", 0), ("负面数据:", 12500)):
    print(_caption)
    print(test_tests[_index])
    print(test_labels[_index])