An attempt at rumor detection with TensorFlow

The data is a Chinese rumor dataset from Tsinghua University.

I took only 1,000 rumor texts, segmented them with jieba, computed TF-IDF features, attached labels, and used them as the training set. The results were not very good, and there is a lot that still needs optimizing. I'm a beginner and still learning.
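For orientation, here is a minimal sketch of the feature pipeline described above (jieba segmentation followed by TF-IDF). It is only an illustration under assumptions, not the actual script below; the file name rumors_sample.txt is a placeholder.

# -*- coding: utf-8 -*-
# Minimal sketch: one text per line in a hypothetical rumors_sample.txt
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

with open("rumors_sample.txt", encoding="utf-8") as f:
    texts = [line.strip() for line in f if line.strip()]

# Segment each text with jieba and join the tokens with spaces so that
# TfidfVectorizer can treat every token as a separate term.
segmented = [" ".join(jieba.cut(t)) for t in texts]

vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf = vectorizer.fit_transform(segmented)  # sparse matrix: documents x vocabulary
print(tfidf.shape)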
- This is the data processing program. Because it was modified and debugged over and over, the code is messy and contains many leftover comments and parts I'm not happy with.

# -*- coding: utf-8 -*-
import jieba
import numpy
import json
import copy
import os
import re
import time
import logging
import csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA,KernelPCA

######### Configure logging for convenient output #############

LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"

logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

logger = logging.getLogger(__name__)

#------------------ Data processing -----------------------#


# stopword_list = []
# rumor_corpus = []
# unrumor_corpus = []
# training_data = []
# validation_data = []
# test_data = []
# bag_of_word_count = {}


def readrumorfile(filename, bag_of_word_count, stopword_list, rumor_corpus):
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # line.encode('utf-8')
            text_json = json.loads(line)
            s = jieba.cut(re.sub(r"[A-Za-z0-9\!\%\[\]\,\。]", "", text_json["rumorText"]))  # filter digits, letters and some punctuation out of the sentence
            line_list = list(s)

            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            rumor_corpus.append(",".join(cp_line))
            if len(rumor_corpus) >= 1000:
                break

def readnewsfile(filename, bag_of_word_count, stopword_list, unrumor_corpus):
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if len(line) <= 43:  # skip short, meaningless texts; e.g. len("①柔道冠军称父亲深夜被带走真相是这样http://t.cn/EJuoyyO") = 38
                continue
            s = jieba.cut(re.sub(r"[A-Za-z0-9\!\%\[\]\,\。]", "", line.strip('\n')))
            line_list = list(s)

            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            unrumor_corpus.append(",".join(cp_line))
            if len(unrumor_corpus) >= 1000:
                break

# Drop words whose bag-of-words count is below frequ

def removeWord(rumor_corpus, unrumor_corpus, bag_of_word_count, frequ):
    rumor_cor = []
    unrumor_cor = []
    for s_r,s_u in zip(rumor_corpus,unrumor_corpus):
        list_s_r = s_r.split(",")
        list_s_u = s_u.split(",")

        list_r = copy.deepcopy(list_s_r)
        list_u = copy.deepcopy(list_s_u)

        for w in list_s_r:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_r.remove(w)
        for w in list_s_u:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_u.remove(w)

        if list_s_r:
            rumor_cor.append(",".join(list_r))
        if list_s_u:
            unrumor_cor.append(",".join(list_u))

    return rumor_cor,unrumor_cor



def getdata(stopword_list, bag_of_word_count, rumor_corpus, unrumor_corpus):
    # remove stopwords from list_corpus

    with open("../data/stopword.txt", "r", encoding="utf-8") as fp:
        for line in fp:
            stopword_list.append(line[:-1])
        fp.close()

    logger.info("读取停用词,构造stopword_list集合")

    # Rumors
    # Data processing: list_corpus = [rumorText, rumorText, rumorText, ...]
    readrumorfile("../data/rumors_v170613.json", bag_of_word_count, stopword_list, rumor_corpus)

    logger.info("从 rumors_v170613.json 谣言文本中获取 %g条数据" % (len(rumor_corpus)))
    # 非谣言
    readnewsfile("../data/news20190407-214236.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    if len(unrumor_corpus) < 1000:
        readnewsfile("../data/news20190407-214412.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    # clear the stopword list (no longer needed)
    stopword_list.clear()

    logger.info("Got %d non-rumor texts from news20190407-214236.txt | news20190407-214412.txt" % (len(unrumor_corpus)))
    logger.info("Bag-of-words size: %s" % (len(bag_of_word_count)))
    corpus = rumor_corpus + unrumor_corpus

    return corpus,bag_of_word_count, rumor_corpus, unrumor_corpus


def Sklearn_getfeature(corpus):
    # Vectorize all the short texts in the corpus and build the bag of words
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')

    X = vectorizer.fit_transform(corpus)

    # Compute TF-IDF
    tfidf_transformer = TfidfTransformer()

    tfidf = tfidf_transformer.fit_transform(X)

    logger.info("用sklearn构建词袋,TFIDF计算完成")
    # logger.info(tfidf[0][0])
    # logger.info(type(tfidf.toarray()))

    # Build the label array (leftover test code):
    # label = numpy.zeros((1000, 2))
    # for i in range(0, 500):
    #     label[i][0] = 1
    # for i in range(500, 1000):
    #     label[i][1] = 1
    # label = numpy.asarray(label)
    data_tfidf = tfidf.toarray()

    with open('../data/roumordataset.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_tfidf)

    # Dimensionality reduction with PCA; 841 = 29*29 components so each sample can later be reshaped into a 29x29 "image"
    pca = PCA(n_components=841)
    data_pca = pca.fit_transform(data_tfidf)
    with open('../data/roumordatasetPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_pca)

    # Dimensionality reduction with kernel PCA (RBF kernel)
    kpca = KernelPCA(kernel="rbf")
    data_kpca = kpca.fit_transform(data_tfidf)
    with open('../data/roumordatasetKPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_kpca)
    return tfidf


def gensim_getfeature(corpus):
    return


# Function used during testing -- completely useless
def WriteFile(data, target):
    if os.path.exists(target):
        path, suffix = os.path.splitext(target)
        s = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        target = path + s + suffix
    with open(target, 'w', encoding="utf-8") as fp:
        for text in data:
            fp.write(text)
            fp.write("\n")

# Build the datasets with a training:validation ratio of 4:1 (not actually implemented here; see the sketch after this script)

if __name__ == '__main__':
    stopword_list = []
    rumor_corpus = []
    unrumor_corpus = []
    training_data = []
    validation_data = []
    test_data = []
    bag_of_word_count = {}
    frequ = 2

    corpus,bag_of_word_count,rumor_corpus, unrumor_corpus = getdata(stopword_list,bag_of_word_count,rumor_corpus,unrumor_corpus)
    logger.info(len(rumor_corpus))
    logger.info(len(unrumor_corpus))

    rumor_cor,unrumor_cor = removeWord(rumor_corpus,unrumor_corpus,bag_of_word_count,frequ)
    logger.info(len(rumor_cor))
    logger.info(len(unrumor_cor))

    with open("../data/bag_word.json","w",encoding='utf-8') as f:
        json.dump(bag_of_word_count,f,ensure_ascii=False)

    Sklearn_getfeature(rumor_cor+unrumor_cor)
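The 4:1 training/validation split mentioned in the comment above is never actually performed in this script. Below is a minimal sketch of how it could be added with scikit-learn's train_test_split, assuming the 2,000-row feature matrix data_tfidf produced in Sklearn_getfeature (rumors first, then non-rumors); the variable names are illustrative, not from the original code.

import numpy as np
from sklearn.model_selection import train_test_split

# 1 = rumor, 0 = non-rumor, matching the row order rumor_corpus + unrumor_corpus
labels = np.array([1] * 1000 + [0] * 1000)

# training : validation = 4 : 1  ->  test_size=0.2; stratify keeps the class ratio equal in both parts
X_train, X_val, y_train, y_val = train_test_split(
    data_tfidf, labels, test_size=0.2, stratify=labels, random_state=42)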

- This is the training program; the results are poor.

"""
X: 841 = 29*29 dimensional text features   Y: label, [1., 0.] or [0., 1.]
Layer 1: convolution    input is a 29*29*1 feature map
    filter size 3*3, depth 5, no zero padding, stride 1
    output 29-3+1 = 27*27, depth 5
    parameters: w = 3*3*1*5, b = 5
Layer 2: pooling        input is a 27*27*5 tensor
    filter size 3*3, stride 3
    output 9*9*5
Layer 3: convolution    input is a 9*9*5 tensor
    filter size 2*2, depth 12, no zero padding, stride 1
    parameters: w = 2*2*5*12, b = 12
    output 9-2+1 = 8*8, depth 12
Layer 4: pooling        input 8*8*12
    filter size 2*2, stride 2
    output 4*4*12
Layer 5: fully connected    input 4*4*12
    filter size 4*4*80, no zero padding, stride 1
    parameters: w = 4*4*12*80, b = 80
    output 1*1*80
Layer 6: fully connected
    input 80
    w = 80*56, b = 56
    output 56
Output layer:
    input 56
    w = 56*2, b = 2
    output 2

"""
import tensorflow as tf
import numpy as np
import csv
import logging
######### Configure logging for convenient output #############

LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"

logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

logger = logging.getLogger(__name__)


num_input = 841
num_classes = 2
dropout = 0.5

learning_rate = 0.001
batch_size = 100
num_steps = 10000
display_step = 10

X = tf.placeholder(tf.float32, [None, num_input])
Y = tf.placeholder(tf.float32, [None, num_classes])


X_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32)
Y_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32)
# Weights and biases
weights = {
    "w1":tf.Variable(tf.random_normal([3, 3, 1, 5])),
    "w2":tf.Variable(tf.random_normal([2, 2, 5, 12])),
    "w3":tf.Variable(tf.random_normal([4*4*12,80])),
    "w4":tf.Variable(tf.random_normal([80,56])),
    "w5":tf.Variable(tf.random_normal([56,2]))
}
bias = {
    "b1":tf.Variable(tf.random_normal([5])),
    "b2":tf.Variable(tf.random_normal([12])),
    "b3":tf.Variable(tf.random_normal([80])),
    "b4":tf.Variable(tf.random_normal([56])),
    "b5":tf.Variable(tf.random_normal([2]))
}

def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],padding='VALID')

# Define the network
def conv_net(x, weights, biases, dropout):
    x = tf.reshape(x, shape=[-1, 29, 29, 1])

    conv1 = conv2d(x,weights['w1'],biases['b1'])
    conv1 = maxpool2d(conv1,k=3)

    conv2 = conv2d(conv1, weights['w2'], biases['b2'])
    conv2 = maxpool2d(conv2, k=2)

    fc3 = tf.reshape(conv2,[-1,weights['w3'].get_shape().as_list()[0]])
    fc3 = tf.add(tf.matmul(fc3, weights['w3']), biases['b3'])
    fc3 = tf.nn.relu(fc3)
    fc3 = tf.nn.dropout(fc3, dropout)

    fc4 = tf.add(tf.matmul(fc3, weights['w4']), biases['b4'])
    fc4 = tf.nn.relu(fc4)
    fc4 = tf.nn.dropout(fc4, dropout)

    fc5 = tf.add(tf.matmul(fc4, weights['w5']), biases['b5'])
    # fc5 = tf.nn.relu(fc5)

    return fc5

# Construct model
logits = conv_net(X, weights, bias, dropout)
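# Note (added comment): dropout is a plain Python constant (0.5) here, so it stays active
# even when accuracy is evaluated below; a keep_prob placeholder would normally be used
# to switch dropout off at evaluation time.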

prediction = tf.nn.softmax(logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

# Build the labels: [0, 1] means rumor, [1, 0] means non-rumor
r = np.zeros((1000, 1), dtype=float)
c = np.ones((1000, 1), dtype=float)
a = np.hstack((r, c))
b = np.hstack((c, r))
label = np.vstack((a, b))

count =0
with tf.Session() as sess:
    logger.info("-----------")
    s = []
    sess.run(init)
    with open("D:/WorkSpace/pyWorkspace/deepLearning/GradientDescent/data/roumordatasetPCA.csv", "r") as f:

        csv_reader = csv.reader(f)
        for line in csv_reader:
            s.append(line)
        line = np.array([list(map(float, x)) for x in s])
        data_x = np.reshape(line, (-1, 841))
        for step in range(1,num_steps + 1):
                data_y = np.reshape(label, (-1, 2))

                sess.run(train_op, feed_dict={X: data_x, Y: data_y})

                if step % 100 == 0 or step == 1:
                    loss, acc = sess.run([loss_op, accuracy], feed_dict={X: data_x, Y: data_y})
                    print("Step " + str(step) + ", Minibatch Loss= " + \
                          "{:.4f}".format(loss) + ", Training Accuracy= " + \
                          "{:.3f}".format(acc))

- Finally, here are the results. I had actually wanted to build three datasets (training, validation, and test), but since running on all the data gave such poor results from the start, I lost the motivation to continue. I'll improve it later and figure out where the problem is; it may simply be that the dataset is too small, which is probably the root cause.

Step 1, Minibatch Loss= 1520.1038, Training Accuracy= 0.491
Step 100, Minibatch Loss= 262.6792, Training Accuracy= 0.549
Step 200, Minibatch Loss= 23.2395, Training Accuracy= 0.686
Step 300, Minibatch Loss= 2.4426, Training Accuracy= 0.615
Step 400, Minibatch Loss= 2.0057, Training Accuracy= 0.592
Step 500, Minibatch Loss= 1.4990, Training Accuracy= 0.586
Step 600, Minibatch Loss= 1.2961, Training Accuracy= 0.589
Step 700, Minibatch Loss= 1.1176, Training Accuracy= 0.576
Step 800, Minibatch Loss= 1.0612, Training Accuracy= 0.584
Step 900, Minibatch Loss= 0.8949, Training Accuracy= 0.595
Step 1000, Minibatch Loss= 0.8426, Training Accuracy= 0.595
Step 1100, Minibatch Loss= 0.8437, Training Accuracy= 0.592
Step 1200, Minibatch Loss= 0.7606, Training Accuracy= 0.576
Step 1300, Minibatch Loss= 0.7009, Training Accuracy= 0.609
Step 1400, Minibatch Loss= 0.6559, Training Accuracy= 0.609
Step 1500, Minibatch Loss= 0.6449, Training Accuracy= 0.598
Step 1600, Minibatch Loss= 0.6219, Training Accuracy= 0.603
Step 1700, Minibatch Loss= 0.5932, Training Accuracy= 0.608
Step 1800, Minibatch Loss= 0.5810, Training Accuracy= 0.623
Step 1900, Minibatch Loss= 0.5983, Training Accuracy= 0.608
Step 2000, Minibatch Loss= 0.5709, Training Accuracy= 0.607
Step 2100, Minibatch Loss= 0.5430, Training Accuracy= 0.626
Step 2200, Minibatch Loss= 0.5401, Training Accuracy= 0.642
Step 2300, Minibatch Loss= 0.5308, Training Accuracy= 0.630
Step 2400, Minibatch Loss= 0.5409, Training Accuracy= 0.627
Step 2500, Minibatch Loss= 0.5284, Training Accuracy= 0.638
Step 2600, Minibatch Loss= 0.5743, Training Accuracy= 0.627
Step 2700, Minibatch Loss= 0.5084, Training Accuracy= 0.649
Step 2800, Minibatch Loss= 0.5221, Training Accuracy= 0.643
Step 2900, Minibatch Loss= 0.5110, Training Accuracy= 0.651
Step 3000, Minibatch Loss= 0.5214, Training Accuracy= 0.663
Step 3100, Minibatch Loss= 0.4978, Training Accuracy= 0.663
Step 3200, Minibatch Loss= 0.5084, Training Accuracy= 0.647
Step 3300, Minibatch Loss= 0.4945, Training Accuracy= 0.677
Step 3400, Minibatch Loss= 0.4991, Training Accuracy= 0.660
Step 3500, Minibatch Loss= 0.4948, Training Accuracy= 0.667
Step 3600, Minibatch Loss= 0.4915, Training Accuracy= 0.660
Step 3700, Minibatch Loss= 0.4986, Training Accuracy= 0.670
Step 3800, Minibatch Loss= 0.4813, Training Accuracy= 0.674
Step 3900, Minibatch Loss= 0.5162, Training Accuracy= 0.682
Step 4000, Minibatch Loss= 0.5086, Training Accuracy= 0.680
Step 4100, Minibatch Loss= 0.4827, Training Accuracy= 0.677
Step 4200, Minibatch Loss= 0.4798, Training Accuracy= 0.686
Step 4300, Minibatch Loss= 0.4738, Training Accuracy= 0.682
Step 4400, Minibatch Loss= 0.4889, Training Accuracy= 0.679
Step 4500, Minibatch Loss= 0.4631, Training Accuracy= 0.690
Step 4600, Minibatch Loss= 0.4766, Training Accuracy= 0.681
Step 4700, Minibatch Loss= 0.4778, Training Accuracy= 0.686
Step 4800, Minibatch Loss= 0.4525, Training Accuracy= 0.704
Step 4900, Minibatch Loss= 0.4552, Training Accuracy= 0.692
Step 5000, Minibatch Loss= 0.4411, Training Accuracy= 0.701
Step 5100, Minibatch Loss= 0.4653, Training Accuracy= 0.694
Step 5200, Minibatch Loss= 0.4400, Training Accuracy= 0.709
Step 5300, Minibatch Loss= 0.4426, Training Accuracy= 0.698
Step 5400, Minibatch Loss= 0.4385, Training Accuracy= 0.705
Step 5500, Minibatch Loss= 0.4365, Training Accuracy= 0.705
Step 5600, Minibatch Loss= 0.4332, Training Accuracy= 0.711
Step 5700, Minibatch Loss= 0.4404, Training Accuracy= 0.708
Step 5800, Minibatch Loss= 0.4188, Training Accuracy= 0.715
Step 5900, Minibatch Loss= 0.4118, Training Accuracy= 0.722
Step 6000, Minibatch Loss= 0.4032, Training Accuracy= 0.713
Step 6100, Minibatch Loss= 0.4179, Training Accuracy= 0.711
Step 6200, Minibatch Loss= 0.4081, Training Accuracy= 0.714
Step 6300, Minibatch Loss= 0.4038, Training Accuracy= 0.713
Step 6400, Minibatch Loss= 0.4081, Training Accuracy= 0.719
Step 6500, Minibatch Loss= 0.3908, Training Accuracy= 0.742
Step 6600, Minibatch Loss= 0.3901, Training Accuracy= 0.735
Step 6700, Minibatch Loss= 0.3915, Training Accuracy= 0.729
Step 6800, Minibatch Loss= 0.3782, Training Accuracy= 0.721
Step 6900, Minibatch Loss= 0.3917, Training Accuracy= 0.712
Step 7000, Minibatch Loss= 0.3819, Training Accuracy= 0.734
Step 7100, Minibatch Loss= 0.3765, Training Accuracy= 0.738
Step 7200, Minibatch Loss= 0.3544, Training Accuracy= 0.749
Step 7300, Minibatch Loss= 0.3634, Training Accuracy= 0.748
Step 7400, Minibatch Loss= 0.3551, Training Accuracy= 0.758
Step 7500, Minibatch Loss= 0.3613, Training Accuracy= 0.746
Step 7600, Minibatch Loss= 0.3574, Training Accuracy= 0.753
Step 7700, Minibatch Loss= 0.3532, Training Accuracy= 0.758
Step 7800, Minibatch Loss= 0.3456, Training Accuracy= 0.762
Step 7900, Minibatch Loss= 0.3695, Training Accuracy= 0.747
Step 8000, Minibatch Loss= 0.3646, Training Accuracy= 0.768
Step 8100, Minibatch Loss= 0.3573, Training Accuracy= 0.756
Step 8200, Minibatch Loss= 0.3461, Training Accuracy= 0.760
Step 8300, Minibatch Loss= 0.3557, Training Accuracy= 0.759
Step 8400, Minibatch Loss= 0.3514, Training Accuracy= 0.756
Step 8500, Minibatch Loss= 0.3472, Training Accuracy= 0.768
Step 8600, Minibatch Loss= 0.3538, Training Accuracy= 0.757
Step 8700, Minibatch Loss= 0.3424, Training Accuracy= 0.763
Step 8800, Minibatch Loss= 0.3516, Training Accuracy= 0.754
Step 8900, Minibatch Loss= 0.3555, Training Accuracy= 0.762
Step 9000, Minibatch Loss= 0.3448, Training Accuracy= 0.767
Step 9100, Minibatch Loss= 0.3467, Training Accuracy= 0.761
Step 9200, Minibatch Loss= 0.3319, Training Accuracy= 0.777
Step 9300, Minibatch Loss= 0.3444, Training Accuracy= 0.765
Step 9400, Minibatch Loss= 0.3430, Training Accuracy= 0.762
Step 9500, Minibatch Loss= 0.3375, Training Accuracy= 0.766
Step 9600, Minibatch Loss= 0.3355, Training Accuracy= 0.768
Step 9700, Minibatch Loss= 0.3285, Training Accuracy= 0.780
Step 9800, Minibatch Loss= 0.3374, Training Accuracy= 0.772
Step 9900, Minibatch Loss= 0.3304, Training Accuracy= 0.781
Step 10000, Minibatch Loss= 0.3401, Training Accuracy= 0.768

