TensorFlow Exercise 1: Classification with a Neural Network

TensorFlow is used across many areas of machine learning and deep learning, such as speech and image recognition, and it runs on everything from a single phone to clusters of thousands of servers. A while ago I was experimenting with sentiment classification and used a neural network to classify the data; the results were decent, reaching over 80% accuracy.
Dataset source: a Chinese review dataset, which is not easy to come by, so thanks to its author!
pos data
neg data

Data preprocessing:

import pandas as pd

def loadfile():
    # Read the raw training corpora
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)
    pos['mark'] = 1
    neg['mark'] = 0  # label the training corpora
    pn = pd.concat([pos, neg], ignore_index=True)  # merge the corpora

    print(len(pn[0].values), len(pn['mark'].values))

    # Write the texts and labels out line by line, in the same order
    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            f.write(x + '\n')
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')

loadfile()  # load and merge the data
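Since data.txt and label.txt are written line by line in the same order, it is worth confirming they stay aligned before moving on. A minimal sanity-check sketch (not part of the original script) might look like this:

# Sketch of a sanity check: both files should have one line per sample, in matching order.
with open('data/data.txt', encoding='utf-8') as f_data, \
        open('data/label.txt', encoding='utf-8') as f_label:
    texts = f_data.read().splitlines()
    labels = f_label.read().splitlines()

assert len(texts) == len(labels), "texts and labels are out of sync"
print(len(texts), 'samples:', labels.count('1'), 'positive,', labels.count('0'), 'negative')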

-------------------------------------------------

# Word segmentation and stop-word removal
import jieba

# Load the stop-word list, one token per line
with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

def split_word():
    # Segment each raw sentence with jieba and join the tokens with spaces
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        lines_1 = []
        for line in lines:
            line = ' '.join(jieba.cut(line.strip()))
            lines_1.append(line)
        with open('data/split_data.txt', 'w', encoding='utf-8') as f1:
            for line in lines_1:
                f1.write(line + '\n')

split_word()

# Remove stop words from the segmented sentences
with open('data/split_data.txt', 'r', encoding='utf-8') as f:
    line_list = []
    for line in f.readlines():
        line = line.strip().split(' ')
        line_1 = [word for word in line if word not in stopwords]
        line_list.append(line_1)
    with open('data_clean.txt', 'w', encoding='utf-8') as f1:
        for line in line_list:
            f1.write(' '.join(line) + '\n')

Stop-word list (stopwords):

"
..
>>

/
...

8
二
<
@
]
、
,
“
”
。
-
&
《
》
…
?
^
_
(
)
#
啊
此
这
呢
哦
仅
*
+
=
0
1
2
3
4
5
6
7
8
9
@
$
【
】
[
]
矣
兮
~
>
<
{
}
了
个
呵
的
」
「
&#
;
%
.
.
:
—
TWILIGHT
,
\
;
.....

Building the vocabulary:

# coding=utf-8
import os

"""
Preprocessing: build the vocabulary
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of sequence
UNK = "__UNK__"  # marks tokens not in the vocabulary
START_VOCABULARY = [PAD, GO, EOS, UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data_clean.txt'
#dataset_path_2 = "data/sentiment_XS_test.txt"

def set_dataset_path(path):
    # Point the scripts at a different training file
    global dataset_path_1
    dataset_path_1 = path

if not os.path.exists(dataset_path_1):
    print('training dataset is missing')
    exit()

# Build a frequency-ranked vocabulary file from the cleaned corpus
def gen_vocabulary_file(input_file, output_file, vocab_size):
    with open(input_file, encoding='utf-8') as f:
        train_set_x = [line.strip() for line in f]

    vocabulary = {}
    for line in train_set_x:
        tokens = line.strip().split(' ')
        for word in tokens:
            if word in vocabulary:   # already in the vocabulary: increment its count
                vocabulary[word] += 1
            else:                    # first occurrence
                vocabulary[word] = 1

    # Special tokens first, then words sorted by frequency (descending)
    vocabulary_list = START_VOCABULARY + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # Keep only the vocab_size most frequent entries
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]

    print(input_file, " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w", encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')


print("building vocabulary...")
gen_vocabulary_file(dataset_path_1, "train_set_vocabulary", 20000)
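The vocabulary file has one token per line, so a token's line index is its ID, with the four special tokens occupying IDs 0 through 3. A minimal sketch of loading it back into a word-to-ID dictionary (load_vocabulary is an illustrative helper, not part of the original scripts):

def load_vocabulary(vocab_file):
    """Read the vocabulary file (one token per line) into a token -> id dict."""
    with open(vocab_file, encoding='utf-8') as f:
        return {line.strip(): idx for idx, line in enumerate(f)}

vocab = load_vocabulary('train_set_vocabulary')
print(len(vocab), vocab["__PAD__"], vocab["__UNK__"])  # expect 20000 0 3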

Converting sentences to IDs:

#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime
"""
***yuchuli
"""
PAD = "__PAD__"
GO = "__GO__"
EOS &#
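With the cleaned corpus and the vocabulary file in place, the conversion itself can be sketched as follows; this is a minimal sketch assuming the files produced above, and the helper name convert_to_ids and output file data_ids.txt are illustrative, with out-of-vocabulary words falling back to UNK_ID:

UNK_ID = 3  # same convention as in the vocabulary script above

def convert_to_ids(sentence_file, vocab_file, output_file):
    # Map each space-separated token to its vocabulary ID; unknown tokens become UNK_ID
    with open(vocab_file, encoding='utf-8') as f:
        vocab = {line.strip(): idx for idx, line in enumerate(f)}
    with open(sentence_file, encoding='utf-8') as fin, \
            open(output_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            ids = [str(vocab.get(word, UNK_ID)) for word in line.strip().split(' ')]
            fout.write(' '.join(ids) + '\n')

convert_to_ids('data_clean.txt', 'train_set_vocabulary', 'data_ids.txt')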