TensorFlow可被用于语音识别或图像识别等多项机器学习与深度学习领域,它可在小到手机、大到数千台服务器的各种设备上运行。前段时间在做有关情感分类的实验,利用了神经网络对数据进行分类,效果还不错,准确率达到80%以上。
数据集来源:评论数据集,中文的,很不容易,感谢作者!
pos数据
neg数据
数据处理:
import random

import pandas as pd


def loadfile():
    """Load the positive and negative review corpora, label them, merge them,
    and dump the texts to data/data.txt and the labels to data/label.txt
    (one review / one 0-1 label per line, in the same order).
    """
    # header=None: the spreadsheets have no header row; column 0 is the text.
    # NOTE(review): the original passed index=None, which is not a
    # read_excel parameter (probably a typo for index_col) — dropped.
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)  # corpora loaded

    pos['mark'] = 1  # positive label
    neg['mark'] = 0  # negative label
    pn = pd.concat([pos, neg], ignore_index=True)  # merge the two corpora

    print(len(pn[0].values), len(pn['mark'].values))

    # One review text per line.
    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            # str() guards against non-string cells in the spreadsheet.
            f.write(str(x) + '\n')
    # One label per line, aligned with data.txt.
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')


loadfile()  # load and merge the data
-------------------------------------------------
# Tokenisation and stop-word removal.
import jieba
import numpy as np

# Load the stop-word list once.  A set gives O(1) membership tests during
# the per-token filtering below (the original list made each test O(n)).
with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
def split_word():
    """Segment every line of data/data.txt with jieba and write the
    space-joined tokens to data/split_data.txt, one review per line.

    Stop-word removal is NOT done here; it happens in a later pass that
    reads split_data.txt back.
    """
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # jieba.cut yields tokens lazily; join with single spaces so the file
    # can later be re-split with str.split(' ').
    segmented = [' '.join(jieba.cut(line.strip())) for line in lines]
    with open('data/split_data.txt', 'w', encoding='utf-8') as out:
        for line in segmented:
            out.write(line + '\n')
# Remove stop words from the segmented corpus: read data/split_data.txt
# (tokens space-separated, one review per line), drop every token found in
# `stopwords`, and write the result to data_clean.txt in the same format.
with open('data/split_data.txt', 'r', encoding='utf-8') as f:
    line_list = []
    for raw in f:
        tokens = raw.strip().split(' ')
        line_list.append([w for w in tokens if w not in stopwords])
with open('data_clean.txt', 'w', encoding='utf-8') as f1:
    for tokens in line_list:
        f1.write(" ".join(tokens) + "\n")
停用词表(stopwords):
"
..
>>
/
...
8
二
<
@
]
、
,
“
”
。
-
&
《
》
…
?
^
_
(
)
#
啊
此
这
呢
哦
仅
*
+
=
0
1
2
3
4
5
6
7
8
9
@
$
【
】
[
]
矣
兮
~
>
<
{
}
了
个
呵
的
」
「
&#
;
%
.
.
:
—
TWILIGHT
,
\
;
.....
创建词典:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
# Special vocabulary symbols, always prepended to a generated vocabulary.
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end-of-dialogue marker
UNK = "__UNK__"  # marker for characters absent from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
# Fixed ids of the special symbols (their positions in START_VOCABULART).
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# Path of the cleaned, space-tokenised training corpus.
dataset_path_1 = 'data_clean.txt'
#dataset_path_2 = "data/sentiment_XS_test.txt"


def set_dataset_path(path):
    """Point the module at a different training corpus file.

    The original version assigned a function-local variable, which had no
    effect whatsoever; rebind the module-level path instead.
    """
    global dataset_path_1
    dataset_path_1 = path


# Abort early if the training corpus is missing.
if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()
#gen_vocabulary(生成字典)
def gen_vocabulary_file(input_file, output_file, vocab_size, input_file2=None):
    """Build a frequency-ordered vocabulary file from a tokenised corpus.

    Parameters:
        input_file  -- UTF-8 text file, one example per line, tokens
                       separated by single spaces.
        output_file -- path the vocabulary is written to, one token per
                       line, special symbols first.
        vocab_size  -- maximum vocabulary size, counting the 4 special
                       symbols in START_VOCABULART.
        input_file2 -- unused; kept only for backward compatibility.
    """
    # Count token frequencies in a single streaming pass.  The original
    # buffered the whole file into a list first and used a bare
    # open()/close() pair, leaking the handle on any exception.
    vocabulary = {}
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            for word in line.strip().split(' '):
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # Special symbols first, then tokens by descending frequency
    # (ties keep first-seen order, as sorted() is stable).
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # Truncate to the most frequent vocab_size entries.
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]
    print(input_file, " 词汇表大小:", len(vocabulary_list))
    with open(output_file, "w", encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')
print ("vocabulary start convert...:")
# Build a 20000-token vocabulary (including the 4 special symbols) from
# the cleaned corpus and write it to "train_set_vocabulary".
gen_vocabulary_file(dataset_path_1,"train_set_vocabulary",20000)
句子转换id:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS &#