GCN (Graph Convolutional Network) for Chinese Text Classification - PyTorch

The code is based on the Text-GCN authors' paper and the Bible-based implementation. The original author information is kept; for the full code, please go to the original authors' repositories:
https://github.com/plkmo/Bible_Text_GCN
https://github.com/yao8839836/text_gcn

This is a record of learning GCN from scratch. I didn't understand a lot of the Python usage at first, so I added comments to parts of the code, hoping it gives beginners like me a bit of help.

Problems beginners are likely to run into:

  1. What parameters are actually trained? In AXW, A is the graph structure, X is the feature matrix (the node features, analogous to word embeddings), and W is the layer's weight matrix, i.e. the values updated by gradient descent. In PyTorch this means defining a weight Parameter; the code below initializes it with the normal_ method, although many lectures treat the choice of initialization as optional. (There is a short sketch of this right after this list.)
  2. In Kipf's paper, X is taken to be the identity matrix. In this text-classification task, TF-IDF (document-word) and PMI (word-word) values are used as edge weights of the adjacency matrix, so edge information is folded into the learning. How well this fusion works is mathematically rather abstract, but in my own training, 300-500 epochs already gave a decent result, with accuracy above 80%, which is pretty solid. I don't understand the deeper details myself.
  3. What is the difference between Chinese and English tokenization? In Chinese the meaningful unit is the word (often several characters), not a space-separated token as in English, so the text has to be segmented first. The code below uses jieba's cut function inside a small cut_words helper. jieba is great.
  4. Where does GCN come from, and what is the underlying idea? See the video made by TA 姜成瀚 for Prof. 李宏毅 (Hung-yi Lee)'s course at NTU; it's on Bilibili and explains things quite clearly.
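To make point 1 concrete, here is a minimal toy-sized sketch (my own illustration, not from the original repos) of a single GCN layer H = ReLU(A_hat · X · W): only W receives gradients, while A_hat (the graph) and X (the node features) stay fixed.

import torch
import torch.nn.functional as F

n_nodes, n_feats, n_hidden = 4, 4, 2                 # toy sizes, purely for illustration
A_hat = torch.eye(n_nodes)                           # stand-in for the normalized adjacency; not trained
X = torch.eye(n_nodes)                               # identity features, as in Kipf's setting; not trained
W = torch.nn.Parameter(torch.empty(n_feats, n_hidden).normal_(0, 0.1))  # the trainable weight

H = F.relu(A_hat @ X @ W)                            # one GCN layer: H = ReLU(A_hat X W)
H.sum().backward()
print(W.grad is not None)                            # True  -> W is what gradient descent updates
print(A_hat.requires_grad, X.requires_grad)          # False False -> the graph and the features are fixed
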
# The key parts of the code below are annotated. If you find it helpful, feel free to give me a free like to cheer me up.
# -*- coding: utf-8 -*-
"""
Created on Thu May  9 10:28:24 2019

@author: WT
"""
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import networkx as nx
from collections import OrderedDict
from itertools import combinations
import math
from tqdm import tqdm
import logging
import jieba

logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)  # configure logging output format
logger = logging.getLogger(__file__)

stop = [line.strip() for line in open('stop_words.txt', encoding='utf-8').readlines()]  # stop-word list, one word per line


def cut_words(text):
    # segment Chinese text with jieba, then drop stop words
    text = str(text)
    words = list(jieba.cut(text))
    # filter with a list comprehension; removing items from a list while
    # iterating over it would silently skip elements
    return [word for word in words if word not in stop]

def load_pickle(filename):
    completeName = os.path.join("./data/", \
                                filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data


def save_as_pickle(filename, data):
    completeName = os.path.join("./data/", \
                                filename)
    with open(completeName, 'wb') as output:
        pickle.dump(data, output)


def nCr(n, r):
    f = math.factorial  # factorial
    return f(n) // (f(r) * f(n - r))  # integer division keeps the result exact for a large vocabulary


# remove stop words and meaningless symbols/punctuation
def filter_tokens(tokens, stopwords):
    tokens1 = []
    for token in tokens:
        if (token not in stopwords) and (token not in [".", ",", ";", "&", "'s", ":", "?", "!", "(", ")", \
                                                       "'", "'m", "'no", "***", "--", "...", "[", "]"]):
            tokens1.append(token)
    return tokens1


def dummy_fun(doc):
    # identity function: the documents are already tokenized lists,
    # so TfidfVectorizer's tokenizer/preprocessor just passes them through
    return doc


def word_word_edges(p_ij):  # build the word-word edges from the PMI matrix
    word_word = []  # list of (word1, word2, {"weight": pmi}) tuples
    cols = list(p_ij.columns)
    cols = [str(w) for w in cols]  # make sure every column name is a string
    '''
    # old, inefficient but maybe more instructive code
    dum = []; counter = 0
    for w1 in tqdm(cols, total=len(cols)):
        for w2 in cols:
            #if (counter % 300000) == 0:
            #    print("Current Count: %d; %s %s" % (counter, w1, w2))
            if (w1 != w2) and ((w1,w2) not in dum) and (p_ij.loc[w1,w2] > 0):
                word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]})); dum.append((w2,w1))
            counter += 1
    '''
    for w1, w2 in tqdm(combinations(cols, 2), total=nCr(len(cols), 2)):
        if (p_ij.loc[w1, w2] > 0):
            word_word.append((w1, w2, {"weight": p_ij.loc[w1, w2]}))
    return word_word


def generate_text_graph(window=10):
    """ generates graph based on text corpus; window = sliding window size to calculate point-wise mutual information between words """     # 用滑动窗口的方式统计词与词间的同时出现概率
    logger.info("Preparing data...")
    datafolder = "./data/"  # 基于文本生成图的函数,窗口用于计算文字点与点之间的相互关系
    df = pd.read_csv(os.path.join(datafolder, "data3.csv"))  # pandas读取csv文件的一个基本函数


    # keep only the columns we need: t = Text, c = Chapter, b = Book;
    # the verse column "v" is dropped because verse-level granularity is not needed here
    df = df[["t", "c", "b"]]


    # one chapter per document, labelled by book
    # column meanings: id = verse ID, b = Book, c = Chapter, v = Verse, t = Text
    # df_data = pd.DataFrame(columns=["c", "b"])   # original English version kept for reference: one row per chapter, columns c (chapter text) and b (book label)
    # for book in df["b"].unique():                # unique() returns all distinct values of the column as a numpy.ndarray
    #     dum = pd.DataFrame(columns=["c", "b"])
    #     dum["c"] = df[df["b"] == book].groupby("c").apply(lambda x: (" ".join(x["t"])).lower())
    #     dum["b"] = book
    #     df_data = pd.concat([df_data,dum], ignore_index=True)         # concat is a basic way to merge dataframes
    # del df
    df_data = pd.DataFrame(columns=["c", "b"])
    for book in df["b"].unique():
        dum = pd.DataFrame(columns=["c", "b"])
        dum["c"] = df[df["b"] == book].groupby("c").apply(lambda x: (" ".join(x["t"])))
        dum["c"] = dum["c"].apply(cut_words)
        dum["b"] = book
        df_data = pd.concat([df_data, dum], ignore_index=True)  # concat is a basic way to merge dataframes
    del df

    save_as_pickle("df_data.pkl", df_data)
    # each row of df_data is now one tokenized document (chapter) with its book label

    # Tfidf               # weight each word by its importance to each document
    logger.info("Calculating Tf-idf...")  # sklearn's TfidfVectorizer converts the (already tokenized) documents into a TF-IDF feature matrix
    vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=dummy_fun, preprocessor=dummy_fun)
    vectorizer.fit(df_data["c"])  # learn the vocabulary and IDF values from the corpus
    df_tfidf = vectorizer.transform(df_data["c"])  # transform the documents into a document-term matrix
    df_tfidf = df_tfidf.toarray()
    vocab = vectorizer.get_feature_names()  # list of feature (word) names; on newer sklearn versions use get_feature_names_out() instead
    vocab = np.array(vocab)
    df_tfidf = pd.DataFrame(df_tfidf, columns=vocab)  # TF-IDF matrix as a DataFrame: rows = documents, columns = vocabulary

    # PMI between words (point-wise mutual information)
    names = vocab
    # names are the vocabulary words from the TF-IDF step
    n_i = OrderedDict((name, 0) for name in names)
    word2index = OrderedDict((name, index) for index, name in enumerate(names))
    # OrderedDict is Python's order-preserving dict; n_i counts the windows containing each word, word2index maps word -> index

    occurrences = np.zeros((len(names), len(names)), dtype=np.int32)
    # Find the co-occurrences:
    no_windows = 0
    logger.info("Calculating co-occurences...")
    for l in tqdm(df_data["c"], total=len(df_data["c"])):  # brute-force count of co-occurrences inside each sliding window
        for i in range(len(l) - window):  # note: documents shorter than the window contribute no windows
            no_windows += 1
            d = set(l[i:(i + window)])  # the set of distinct words inside the current window

            for w in d:
                n_i[w] += 1  # increment the window count of each word present
            for w1, w2 in combinations(d, 2):  # combinations(d, 2) yields every unordered pair of words in the window
                i1 = word2index[w1]
                i2 = word2index[w2]

                occurrences[i1][i2] += 1
                occurrences[i2][i1] += 1

    logger.info("Calculating PMI*...")
    # convert to PMI
    p_ij = pd.DataFrame(occurrences, index=names, columns=names) / no_windows
    p_i = pd.Series(n_i, index=n_i.keys()) / no_windows

    del occurrences
    del n_i
    for col in p_ij.columns:
        p_ij[col] = p_ij[col] / p_i[col]
    for row in p_ij.index:
        p_ij.loc[row, :] = p_ij.loc[row, :] / p_i[row]
    p_ij = p_ij + 1E-9
    for col in p_ij.columns:
        p_ij[col] = p_ij[col].apply(lambda x: math.log(x))
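    # A hypothetical worked example (made-up counts) of what the PMI values above mean:
    # if two words co-occur in 5 of 1000 sliding windows and each appears in 50 windows on its own,
    # then PMI = log( (5/1000) / ((50/1000) * (50/1000)) ) = log(2.0) ≈ 0.69 > 0,
    # so word_word_edges() below will create an edge of weight ≈ 0.69 between them;
    # pairs with PMI <= 0 get no edge.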

    # Build the graph with the networkx module
    logger.info("Building graph (No. of document, word nodes: %d, %d)..." % (len(df_tfidf.index), len(vocab)))
    G = nx.Graph()
    logger.info("Adding document nodes to graph...")
    G.add_nodes_from(df_tfidf.index)  # document nodes, identified by row index
    logger.info("Adding word nodes to graph...")
    G.add_nodes_from(vocab)  # word nodes
    # build edges between document-word pairs
    logger.info("Building document-word edges...")  # this step is fairly slow
    document_word = [(doc, w, {"weight": df_tfidf.loc[doc, w]}) for doc in
                     tqdm(df_tfidf.index, total=len(df_tfidf.index)) \
                     for w in df_tfidf.columns]  # TF-IDF weighted edges between documents and words

    logger.info("Building word-word edges...")
    word_word = word_word_edges(p_ij)  # PMI weighted edges between word pairs
    save_as_pickle("word_word_edges.pkl", word_word)
    logger.info("Adding document-word and word-word edges...")
    G.add_edges_from(document_word)  # add the edges built above
    G.add_edges_from(word_word)
    save_as_pickle("text_graph.pkl", G)  # save as a pickle, ready to be fed to the Text-GCN training code
    logger.info("Done and saved!")


if __name__ == "__main__":
    generate_text_graph()
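The script above only saves the raw weighted graph. Before training, it still has to be turned into the normalized adjacency A_hat = D^(-1/2)(A + I)D^(-1/2) that the model below multiplies with; the original repos' training pipeline handles this step. A minimal sketch of it (assuming the pickle path used above) might look like:

import pickle
import numpy as np
import networkx as nx

with open("./data/text_graph.pkl", "rb") as f:   # graph saved by generate_text_graph()
    G = pickle.load(f)

A = nx.to_numpy_array(G, weight="weight")        # weighted adjacency matrix A
A = A + np.eye(len(G.nodes()))                   # add self-loops: A + I
deg = A.sum(axis=1)                              # degree of each node (row sums of A + I)
D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))         # D^(-1/2)
A_hat = D_inv_sqrt @ A @ D_inv_sqrt              # symmetric normalization: D^(-1/2)(A + I)D^(-1/2)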

Below is the network model code.

# -*- coding: utf-8 -*-
"""
Created on Wed Jul  3 10:58:01 2019

@author: WT
"""
import torch
import torch.nn as nn
import torch.nn.functional as F

class gcn(nn.Module):
    def __init__(self, X_size, A_hat, args, bias=True): # X_size = num features
        super(gcn, self).__init__()   # call nn.Module's __init__
        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()  # normalized adjacency; fixed, not trained
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, args.hidden_size_1))  # first-layer weight W1
        var = 2./(self.weight.size(1)+self.weight.size(0))
        self.weight.data.normal_(0,var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1, args.hidden_size_2))  # second-layer weight W2
        var2 = 2./(self.weight2.size(1)+self.weight2.size(0))
        # the weights are initialized with normal_(mean, std), which fills the tensor in place
        # with samples from a normal distribution; here 2/(fan_in + fan_out) is passed as the std,
        # a Glorot-style initialization
        self.weight2.data.normal_(0,var2)
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1))
            self.bias.data.normal_(0,var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_2))
            self.bias2.data.normal_(0,var2)
        else:
            # register both biases as None so that forward() can still check them when bias=False
            self.register_parameter("bias", None)
            self.register_parameter("bias2", None)
        self.fc1 = nn.Linear(args.hidden_size_2, args.num_classes)   # final fully-connected layer -> class scores
        
    def forward(self, X): ### 2-layer GCN architecture
        X = torch.mm(X, self.weight)        # torch.mm(a, b) is the matrix product of a and b: here X W1
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))  # first GCN layer: H1 = ReLU(A_hat (X W1 + b1)), fed to the next layer as its node features
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))  # second GCN layer: H2 = ReLU(A_hat (H1 W2 + b2))
        return self.fc1(X)                   # per-node class scores
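
Roughly speaking, training is transductive: the whole graph (document nodes plus word nodes) is fed through the network at once, and the cross-entropy loss is computed only on the labelled document nodes in the training split. Below is a rough sketch of such a loop; the hyper-parameter values, labels and train_idx are dummy placeholders (not taken from the code above), and A_hat is the normalized adjacency built earlier.

import torch
import torch.nn as nn
import torch.optim as optim
from argparse import Namespace

args = Namespace(hidden_size_1=330, hidden_size_2=130, num_classes=66)  # placeholder sizes

X = torch.eye(A_hat.shape[0]).float()        # identity features, one row per node (documents + words)
net = gcn(X.shape[1], A_hat, args)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)

# dummy placeholders: in the real pipeline these come from the train/test split of the document nodes
labels = torch.randint(0, args.num_classes, (X.shape[0],))   # class id per node (only document nodes matter)
train_idx = torch.arange(0, 100)                             # indices of labelled training documents

for epoch in range(300):                     # 300-500 epochs already gave decent accuracy in my runs
    optimizer.zero_grad()
    output = net(X)                          # forward pass over every node in the graph
    loss = criterion(output[train_idx], labels[train_idx])   # loss only on labelled training documents
    loss.backward()
    optimizer.step()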