CNN Text Classification

A character-level CNN text classifier for the cnews dataset: parse the XML corpus, build a 5000-character vocabulary, pad every article to a fixed length, and train a small conv1d network with TensorFlow 1.x.

from lxml import etree
import numpy as np

# Parse the cnews training corpus (local path from the original setup).
xml_text = etree.parse('/home/wyy/PycharmProjects/wyy1/classification/data/cnews/cnews.train.xml')
root = xml_text.getroot()

# root[1] holds the articles: the text body plus a 'class' attribute for the label.
content = []
categ = []
for article in root[1]:
    content.append(article.text)
    categ.append(article.get('class'))

# root[0] holds the category definitions; collect every category id.
categories = []
for group in root[0]:
    for category in group:
        categories.append(category.get('id'))
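
The actual element names are not visible from the access pattern alone; if you are unsure what the file looks like, a quick inspection of the already-parsed tree shows what root[0] and root[1] contain:

print(root[0].tag, [child.tag for child in root[0]])
print(root[1].tag, root[1][0].attrib)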
# Build a character-level vocabulary from the 5000 most frequent characters
# ('<PAD>' takes id 0, leaving 4999 slots for real characters).
words = []
for i in content:
    words.extend(list(i))
# print(words)  # the full character stream is huge

from collections import Counter
counter = Counter(words)
# print(counter.most_common(10))
pairs = counter.most_common(4999)
chars, _ = zip(*pairs)
word_list = ['<PAD>'] + list(chars)
# print(word_list)
word_to_id = dict(zip(word_list, range(len(word_list))))
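
A quick sanity check of the mapping (the sample string here is made up for illustration; the printed ids depend on corpus frequencies):

sample = '今天的体育新闻'
print([word_to_id[ch] for ch in sample if ch in word_to_id])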


# Map each category name to an integer id.
cat_to_id = dict(zip(categories, range(len(categories))))
# print(cat_to_id)

# Encode each article as a sequence of character ids; characters outside
# the 5000-character vocabulary are simply dropped.
data_id = []
for article_text in content:
    data_id.append([word_to_id[ch] for ch in article_text if ch in word_to_id])

# Encode the labels the same way (unknown labels are skipped).
label_id = [cat_to_id[c] for c in categ if c in cat_to_id]
# print(data_id, '\n', label_id)
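
Because unknown labels are silently skipped, it is worth asserting that articles and labels stayed aligned before padding:

assert len(data_id) == len(label_id), 'article/label count mismatch'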
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# Pad/truncate every article to 600 character ids; one-hot the labels.
x_train = pad_sequences(data_id, maxlen=600)
y_train = to_categorical(label_id, num_classes=15)
print(x_train)
print(y_train)
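
pad_sequences pads with 0, which is exactly the id reserved for '<PAD>' above; by default it both pads and truncates at the front of each sequence. A minimal illustration:

print(pad_sequences([[1, 2, 3]], maxlen=5))  # [[0 0 1 2 3]]
print(pad_sequences([[1, 2, 3]], maxlen=2))  # [[2 3]]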
import tensorflow as tf

# Placeholders: 600 character ids per article, 15-way one-hot labels.
input_x = tf.placeholder(dtype=tf.int32, shape=[None, 600], name='input_x')
input_y = tf.placeholder(dtype=tf.float32, shape=[None, 15], name='input_y')

# Embedding layer: map each of the 5000 character ids to a 64-dim vector.
embedding = tf.get_variable('embedding', [5000, 64])
embedding_inputs = tf.nn.embedding_lookup(embedding, input_x)

# CNN layer: 256 filters of width 5 over the character embeddings.
conv = tf.layers.conv1d(inputs=embedding_inputs, filters=256, kernel_size=5, name='conv')

# Global max pooling over the time dimension.
gmp = tf.reduce_max(conv, axis=1, name='gmp')

# Fully connected layer with dropout (note: tf.contrib.layers.dropout takes
# keep_prob, and its is_training flag defaults to True, so dropout is also
# active when accuracy is evaluated below).
fc = tf.layers.dense(gmp, 128, name='fc1')
fc = tf.contrib.layers.dropout(fc, 0.6)
fc = tf.nn.relu(fc)

# Classification head: logits over the 15 classes.
logits = tf.layers.dense(fc, 15, name='fc2')
y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)
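
To see how the tensor shapes evolve (the valid-padded width-5 convolution shortens the 600-step sequence to 596), print the static shapes:

print(embedding_inputs.shape)  # (?, 600, 64)
print(conv.shape)              # (?, 596, 256)
print(gmp.shape)               # (?, 256)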

# Cross-entropy loss, Adam optimizer, and per-batch accuracy.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
loss = tf.reduce_mean(cross_entropy)

optim = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
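
Because the contrib dropout above is always on, the accuracy printed during training is measured with dropout active. A common fix (a sketch, not part of the original script) is to route keep_prob through a placeholder:

keep_prob = tf.placeholder(tf.float32, name='keep_prob')
fc_dropped = tf.nn.dropout(fc, keep_prob)  # feed 0.6 while training, 1.0 while evaluating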

sess = tf.Session()
sess.run(tf.global_variables_initializer())

def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of (x, y)."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    # Shuffle once per call so each epoch sees a new ordering.
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)  # last batch may be smaller
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]



# Train for 1000 epochs; batch_iter reshuffles the data at the start of each one.
for epoch in range(1000):
    batch_train = batch_iter(x_train, y_train, batch_size=64)
    for x_batch, y_batch in batch_train:
        sess.run(optim, feed_dict={input_x: x_batch, input_y: y_batch})
        print(sess.run(acc, feed_dict={input_x: x_batch, input_y: y_batch}))
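
After training you will usually want to persist the weights; a minimal sketch with tf.train.Saver (the checkpoint path is just an example, not from the original post):

saver = tf.train.Saver()
saver.save(sess, './textcnn.ckpt')  # hypothetical path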