from lxml import etree
import numpy as np
# Load the training corpus and pull out each article's text and class label.
# NOTE(review): root[1] is assumed to be the container of <article>-like
# elements carrying a 'class' attribute -- confirm against the XML schema.
xml_text = etree.parse('/home/wyy/PycharmProjects/wyy1/classification/data/cnews/cnews.train.xml')
root = xml_text.getroot()
content = [article.text for article in root[1]]
categ = [article.get('class') for article in root[1]]
# Collect every category id two levels below root[0].
# NOTE(review): presumably <categories><group><category id=...> -- the
# original shadowed its own loop variable here; verify the XML layout.
categories = []
for group in root[0]:
    categories.extend(node.get('id') for node in group)
# Flatten every document into one stream of characters (character-level model).
words = [ch for doc in content for ch in doc]
print(words)
from collections import Counter
# Build the vocabulary: the 4999 most frequent characters plus '<PAD>' at
# index 0 -- exactly 5000 entries, matching the embedding table size below.
char_counts = Counter(words)
top_chars = [ch for ch, _ in char_counts.most_common(4999)]
word_list = ['<PAD>'] + top_chars
word_to_id = {ch: idx for idx, ch in enumerate(word_list)}
# Map each category id string to an integer class label.
cat_to_id = {cat: idx for idx, cat in enumerate(categories)}
# print(cat_to_id)
# print(content)
# print(categ)
# Encode each document as a sequence of character ids; characters outside
# the 5000-character vocabulary are silently dropped.
data_id = [
    [word_to_id[ch] for ch in doc if ch in word_to_id]
    for doc in content
]
# Encode each category string as its integer label.
# NOTE(review): unknown categories are skipped, which would misalign labels
# with data_id -- assumes every entry of `categ` appears in cat_to_id.
label_id = [cat_to_id[cat] for cat in categ if cat in cat_to_id]
print(data_id, '\n', label_id)
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
# Pad/truncate every id sequence to length 600 and one-hot encode labels.
# NOTE(review): num_classes=15 must equal the real number of categories
# (len(cat_to_id)) -- confirm; a mismatch silently widens the label space.
x_train = pad_sequences(data_id, maxlen=600)
y_train = to_categorical(label_id, num_classes=15)
for arr in (x_train, y_train):
    print(arr)
import tensorflow as tf
# --- TensorFlow 1.x graph: character-level TextCNN ---
# Inputs: padded id sequences (batch, 600) and one-hot labels (batch, 15).
input_x=tf.placeholder(dtype=tf.int32, shape=[None,600], name='input_x')
input_y=tf.placeholder(dtype=tf.float32, shape=[None,15],name='input_y')
# embedding layer: 5000-entry vocab -> 64-dim vectors (trainable, default init)
embedding = tf.get_variable('embeding',[5000,64])
embedding_inputs= tf.nn.embedding_lookup(embedding, input_x)
# cnn layer: 1-D convolution over the sequence, 256 filters of width 5
conv=tf.layers.conv1d(inputs=embedding_inputs,filters=256,kernel_size=5,name='conv')
# global max-pool over the time axis -> (batch, 256)
gmp=tf.reduce_max(conv, reduction_indices=[1],name='gmp')
# fully connected head: 256 -> 128 -> 15 logits
fc = tf.layers.dense(gmp,128, name='fc1')
# NOTE(review): tf.contrib.layers.dropout's second positional arg is
# keep_prob (0.6 keeps 60%), not a drop rate -- confirm that was intended.
fc = tf.contrib.layers.dropout(fc, 0.6)
fc = tf.nn.relu(fc)
logits = tf.layers.dense(fc, 15, name='fc2')
# predicted class index per example
y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)
# softmax cross-entropy against the one-hot labels, averaged over the batch
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
loss=tf.reduce_mean(cross_entropy)
optim=tf.train.AdamOptimizer(learning_rate=0.001,).minimize(loss)
# batch accuracy: fraction of examples whose argmax matches the label
correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
sess=tf.Session()
sess.run(tf.global_variables_initializer())
def batch_iter(x, y, batch_size=64):
    """Yield shuffled (x, y) mini-batches covering one pass over the data.

    Args:
        x: indexable array of inputs (e.g. a 2-D numpy array).
        y: array of labels aligned with ``x`` (same length).
        batch_size: maximum samples per batch; the final batch may be smaller.

    Yields:
        Tuples ``(x_batch, y_batch)`` drawn with a fresh random permutation,
        so pairs stay aligned. Fix over the original: empty input now yields
        nothing instead of one empty batch.
    """
    data_len = len(x)
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    # range() with a step naturally handles the short final batch and
    # produces zero batches when data_len == 0.
    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
# Train for 1000 epochs: each epoch reshuffles the data, runs one optimizer
# step per mini-batch, and prints that batch's accuracy.
for epoch in range(1000):
    for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
        feed = {input_x: x_batch, input_y: y_batch}
        sess.run(optim, feed_dict=feed)
        print(sess.run(acc, feed_dict=feed))
# CNN text classification (cnn文本分类)
# Blog footer residue: "latest recommended article published 2024-05-05 14:44:55"