TensorFlow可被用于语音识别或图像识别等多项机器学习与深度学习领域,它可在小到手机、大到数千台服务器的各种设备上运行。前段时间在做有关情感分类的实验,利用了神经网络对数据进行分类,效果还不错,准确率达到80%以上。
数据集来源:评论数据集,中文的,很不容易,感谢作者!
pos数据
neg数据
数据处理:
import random

import pandas as pd


def loadfile():
    """Load the positive and negative review corpora, label them, merge them,
    and dump the texts to data/data.txt and the labels to data/label.txt
    (one review / one 0-1 label per line, in the same order).
    """
    # header=None: the spreadsheets have no header row; column 0 is the text.
    # NOTE(review): the original passed index=None, which is not a
    # read_excel parameter (probably a typo for index_col) — dropped.
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)  # corpora loaded

    pos['mark'] = 1  # positive label
    neg['mark'] = 0  # negative label
    pn = pd.concat([pos, neg], ignore_index=True)  # merge the two corpora

    print(len(pn[0].values), len(pn['mark'].values))

    # One review text per line.
    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            # str() guards against non-string cells in the spreadsheet.
            f.write(str(x) + '\n')
    # One label per line, aligned with data.txt.
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')


loadfile()  # load and merge the data
-------------------------------------------------
# Tokenisation and stop-word removal.
import jieba
import numpy as np

# Load the stop-word list once.  A set gives O(1) membership tests during
# the per-token filtering below (the original list made each test O(n)).
with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
def split_word():
    """Segment every line of data/data.txt with jieba and write the
    space-joined tokens to data/split_data.txt, one review per line.

    Stop-word removal is NOT done here; it happens in a later pass that
    reads split_data.txt back.
    """
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # jieba.cut yields tokens lazily; join with single spaces so the file
    # can later be re-split with str.split(' ').
    segmented = [' '.join(jieba.cut(line.strip())) for line in lines]
    with open('data/split_data.txt', 'w', encoding='utf-8') as out:
        for line in segmented:
            out.write(line + '\n')
# Remove stop words from the segmented corpus: read data/split_data.txt
# (tokens space-separated, one review per line), drop every token found in
# `stopwords`, and write the result to data_clean.txt in the same format.
with open('data/split_data.txt', 'r', encoding='utf-8') as f:
    line_list = []
    for raw in f:
        tokens = raw.strip().split(' ')
        line_list.append([w for w in tokens if w not in stopwords])
with open('data_clean.txt', 'w', encoding='utf-8') as f1:
    for tokens in line_list:
        f1.write(" ".join(tokens) + "\n")
停用词表(stopwords):
"
..
>>
/
...
8
二
<
@
]
、
,
“
”
。
-
&
《
》
…
?
^
_
(
)
#
啊
此
这
呢
哦
仅
*
+
=
0
1
2
3
4
5
6
7
8
9
@
$
【
】
[
]
矣
兮
~
>
<
{
}
了
个
呵
的
」
「
&#
;
%
.
.
:
—
TWILIGHT
,
\
;
.....
创建词典:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
# Special vocabulary symbols, always prepended to a generated vocabulary.
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end-of-dialogue marker
UNK = "__UNK__"  # marker for characters absent from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
# Fixed ids of the special symbols (their positions in START_VOCABULART).
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# Path of the cleaned, space-tokenised training corpus.
dataset_path_1 = 'data_clean.txt'
#dataset_path_2 = "data/sentiment_XS_test.txt"


def set_dataset_path(path):
    """Point the module at a different training corpus file.

    The original version assigned a function-local variable, which had no
    effect whatsoever; rebind the module-level path instead.
    """
    global dataset_path_1
    dataset_path_1 = path


# Abort early if the training corpus is missing.
if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()
#gen_vocabulary(生成字典)
def gen_vocabulary_file(input_file, output_file, vocab_size, input_file2=None):
    """Build a frequency-ordered vocabulary file from a tokenised corpus.

    Parameters:
        input_file  -- UTF-8 text file, one example per line, tokens
                       separated by single spaces.
        output_file -- path the vocabulary is written to, one token per
                       line, special symbols first.
        vocab_size  -- maximum vocabulary size, counting the 4 special
                       symbols in START_VOCABULART.
        input_file2 -- unused; kept only for backward compatibility.
    """
    # Count token frequencies in a single streaming pass.  The original
    # buffered the whole file into a list first and used a bare
    # open()/close() pair, leaking the handle on any exception.
    vocabulary = {}
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            for word in line.strip().split(' '):
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # Special symbols first, then tokens by descending frequency
    # (ties keep first-seen order, as sorted() is stable).
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # Truncate to the most frequent vocab_size entries.
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]
    print(input_file, " 词汇表大小:", len(vocabulary_list))
    with open(output_file, "w", encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')
print ("vocabulary start convert...:")
# Build a 20000-token vocabulary (including the 4 special symbols) from
# the cleaned corpus and write it to "train_set_vocabulary".
gen_vocabulary_file(dataset_path_1,"train_set_vocabulary",20000)
句子转换id:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS &#