TensorFlow Exercise 1: Classifying Movie Reviews

#-*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
import random
import pickle
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
"""
'I'm super man'
tokenize:
['I', ''m', 'super','man' ] 
"""
from nltk.stem import WordNetLemmatizer
"""
词形还原(lemmatizer),即把一个任何形式的英语单词还原到一般形式,与词根还原不同(stemmer),后者是抽取一个单词的词根。
"""
nltk.download('wordnet')  # data needed by WordNetLemmatizer
nltk.download('punkt')    # tokenizer model needed by word_tokenize
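# Quick illustration of the difference: the lemmatizer maps an inflected form back to its
# dictionary form, while a stemmer such as PorterStemmer only strips suffixes.
from nltk.stem import PorterStemmer
print(WordNetLemmatizer().lemmatize('cats'))   # -> 'cat'
print(PorterStemmer().stem('running'))         # -> 'run'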
pos_file = 'pos.txt'
neg_file = 'neg.txt'
 
# Build the vocabulary
def create_lexicon(pos_file, neg_file):
    lex = []

    # Read one file and collect its tokens
    def process_file(txt_file):
        words = []
        with open(txt_file, 'r') as f:
            for line in f.readlines():
                try:
                    # tokenize the line into a list of words
                    words += word_tokenize(line.lower())
                except Exception:
                    pass  # skip lines that cannot be decoded or tokenized
        return words

    lex += process_file(pos_file)
    lex += process_file(neg_file)  # lex now holds every token from both files
    #print(len(lex))

    # Lemmatization: reduce each word to its dictionary form (e.g. cats -> cat)
    lemmatizer = WordNetLemmatizer()
    nlex = []
    for word in lex:
        try:
            nlex.append(lemmatizer.lemmatize(word))
        except Exception:
            pass
    lex = nlex
    word_count = Counter(lex)  # count how often each word occurs
    #print(word_count)
    # e.g. {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940, ...}
    # Drop very common words (the, a, and, ...) and very rare ones; neither kind helps to
    # decide whether a review is positive or negative.
    # (Zipf's law -- verifying the Zipf distribution of text with Python:
    #  http://blog.topspeedsnail.com/archives/9546)
    lex = []
    # keep only words whose frequency falls inside a fixed band
    for word in word_count:
        if 20 < word_count[word] < 2000:  # hard-coded thresholds; a percentage cutoff would likely work too
            lex.append(word)
    return lex
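# An alternative sketch for the frequency filter above: instead of the hard-coded 20/2000
# counts, keep a band of the frequency-ranked vocabulary (in the spirit of the Zipf's-law
# link). The function name and band values here are illustrative choices, not from the
# original post, and it is not called by the rest of the script.
def lexicon_by_rank(word_count, skip_top=0.03, keep_until=0.5):
    # sort words from most to least frequent, drop the very top (stop words, punctuation)
    # and the long rare tail, keep the middle band
    ranked = [word for word, count in word_count.most_common()]
    return ranked[int(len(ranked) * skip_top):int(len(ranked) * keep_until)]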
 
lex = create_lexicon(pos_file, neg_file)
# lex holds the words that appear in the text (after the frequency filter above).

# Convert each review into a vector. The idea:
# Suppose lex is ['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is'] (the real lexicon is much larger).
# The review 'i think this movie is great' becomes [0,1,0,0,0,0,0,1]: every word of the review
# that appears in lex is marked 1 at its position, everything else stays 0.
def normalize_dataset(lex):
    dataset = []

    # lex: vocabulary; review: one review; clf: its label, [1,0] = positive, [0,1] = negative.
    # Turn one line of review text into a feature vector.
    def string_to_vector(lex, review, clf):
        words = word_tokenize(review.lower())
        lemmatizer = WordNetLemmatizer()
        nwords = []
        for word in words:
            try:
                nwords.append(lemmatizer.lemmatize(word))
            except Exception:
                pass
        words = nwords

        features = np.zeros(len(lex))
        for word in words:
            if word in lex:
                features[lex.index(word)] = 1  # a word may occur more than once; += 1 would also work, with little difference
        return [features, clf]

    with open(pos_file, 'r') as f:
        for line in f.readlines():
            try:
                one_sample = string_to_vector(lex, line, [1, 0])  # [array([0., 1., 0., ..., 0., 0., 0.]), [1,0]]
                dataset.append(one_sample)
            except Exception:
                pass
    with open(neg_file, 'r') as f:
        for line in f.readlines():
            try:
                one_sample = string_to_vector(lex, line, [0, 1])  # [array([0., 0., 0., ..., 0., 0., 0.]), [0,1]]
                dataset.append(one_sample)
            except Exception:
                pass

    #print(len(dataset))
    return dataset
 
dataset = normalize_dataset(lex)  # the full set of reviews, each represented as a vector
random.shuffle(dataset)  # shuffle the list in place

 
# Hold out 10% of the samples as test data
test_size = int(len(dataset) * 0.1)
dataset = np.array(dataset)
# dataset layout: [[X1, Y1], [X2, Y2], ...]
train_dataset = dataset[:-test_size]
test_dataset = dataset[-test_size:]
 
# Feed-forward neural network
# Number of 'neurons' in each layer
n_input_layer = len(lex)  # input layer

n_layer_1 = 1000  # hidden layer
n_layer_2 = 1000  # hidden layer ('hidden' sounds mysterious, but it simply means any layer between input and output)

n_output_layer = 2  # output layer
# Define the neural network to be trained
def neural_network(data):
    # weights and biases of the first layer; w: n_input_layer x n_layer_1, b: n_layer_1
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])), 'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # weights and biases of the second layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])), 'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])), 'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w*x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # activation: negative elements are set to 0
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)  # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output
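# For comparison, an equivalent sketch of the same architecture using tf.layers.dense
# (available in TF 1.x). It is not wired into the rest of the script; the hand-rolled
# neural_network() above is what gets trained below.
def neural_network_dense(data):
    hidden_1 = tf.layers.dense(data, n_layer_1, activation=tf.nn.relu)
    hidden_2 = tf.layers.dense(hidden_1, n_layer_2, activation=tf.nn.relu)
    return tf.layers.dense(hidden_2, n_output_layer, activation=None)  # raw logits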
 
# Train with 50 samples per batch
batch_size = 50

X = tf.placeholder('float', [None, len(train_dataset[0][0])])
# [None, len(lex)] gives the height and width of the input matrix; with an explicit shape
# TensorFlow raises an error if the fed data does not match, but the shape can also be left unspecified.
Y = tf.placeholder('float')
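# Shapes at this point:
#   X: (batch_size, len(lex)) -- one bag-of-words vector per review
#   Y: (batch_size, 2)        -- one-hot label, [1,0] = positive, [0,1] = negative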
# Train the network on the data
def train_neural_network(X, Y):
    predict = neural_network(X)  # the model we are training
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # default learning rate 0.001

    epochs = 20
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            np.random.shuffle(train_dataset)  # shuffle along the first axis; layout: [[X1, Y1], [X2, Y2], ...]
            train_x = train_dataset[:, 0]
            train_y = train_dataset[:, 1]
            i = 0
            epoch_loss = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                _, c = session.run([optimizer, cost_func], feed_dict={X: list(batch_x), Y: list(batch_y)})
                epoch_loss += c
                i += batch_size
            print(epoch, ' : ', epoch_loss)

        test_x = test_dataset[:, 0]
        test_y = test_dataset[:, 1]
        # tf.argmax(predict, 1) returns, for each sample, the index of the larger of the two
        # output scores; tf.argmax(Y, 1) does the same for the one-hot labels ([1,0] / [0,1]).
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))  # predict is the model output for the fed X
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy: ', accuracy.eval({X: list(test_x), Y: list(test_y)}))

train_neural_network(X, Y)
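# A sketch of how a new review could be scored with the trained model. This is not part of
# the original post; it assumes it is called while the TensorFlow session is still open
# inside train_neural_network() (or after the variables have been restored, e.g. with
# tf.train.Saver).
def classify_review(session, predict, review_text):
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in word_tokenize(review_text.lower())]
    features = np.zeros(len(lex))
    for word in words:
        if word in lex:
            features[lex.index(word)] = 1
    scores = session.run(predict, feed_dict={X: [features]})
    # column 0 corresponds to the [1,0] (positive) label, column 1 to [0,1] (negative)
    return 'positive' if np.argmax(scores[0]) == 0 else 'negative'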

Dataset download links:

neg.txt: 5331 negative movie reviews (http://blog.topspeedsnail.com/wp-content/uploads/2016/11/neg.txt)
pos.txt: 5331 positive movie reviews (http://blog.topspeedsnail.com/wp-content/uploads/2016/11/pos.txt)

Training output:

(0, ' : ', 9017.1933832168579)
(1, ' : ', 4066.41361951828)
(2, ' : ', 4068.9778349399567)
(3, ' : ', 3021.8649171590805)
(4, ' : ', 2755.1698242425919)
(5, ' : ', 3285.6446791887283)
(6, ' : ', 2699.1985047459602)
(7, ' : ', 3897.1043330430984)
(8, ' : ', 3554.3585470914841)
(9, ' : ', 2428.1245975494385)
(10, ' : ', 3937.8592742085457)
(11, ' : ', 2276.5575492374992)
(12, ' : ', 2440.6624698638916)
(13, ' : ', 1014.4160533390241)
(14, ' : ', 1225.6515423953533)
(15, ' : ', 892.63348872936263)
(16, ' : ', 822.74370710202493)
(17, ' : ', 454.85909063366125)
(18, ' : ', 165.94665694236755)
(19, ' : ', 3.1606742052643568)
('Accuracy: ', 0.51361501)

An accuracy only marginally better than blind guessing? I could cry. The problem is probably in the dataset, though.


Original article: http://blog.topspeedsnail.com/archives/10399

The original code contains quite a few errors; I have corrected some of them here.



