【毕设】将mysql中的数据存储到neo4j中

最新推荐文章于 2023-03-20 15:00:45 发布

稳得一笔

最新推荐文章于 2023-03-20 15:00:45 发布

阅读量764

点赞数

分类专栏：项目实战、踩坑经验　文章标签：mysql、数据库、python、知识图谱

本文链接：https://blog.csdn.net/qq_42907802/article/details/115220452

版权

项目实战同时被 2 个专栏收录

9 篇文章 0 订阅

订阅专栏

踩坑经验

4 篇文章 1 订阅

订阅专栏

前几天从知网爬取的相关数据已经存到 MySQL，sql 文件已经放在了码云。就在昨天晚上用 selenium 爬了一晚上知网，早上才发现好多都是重复的数据，3000 多条数据里基本上能用的就 200 多条。然后不甘心，又试了试多线程，由于不太熟悉，导致 IP 貌似被知网限制，用家里 WiFi 已经访问不了知网了，只能用手机开热点才能访问。计划开学后再完善爬虫吧，这几天先往后做一下知识图谱。

-- Remove duplicate authors: for every NAME that occurs more than once,
-- keep the row with the smallest id and delete all the others.
-- A row is deleted exactly when some other row shares its NAME and has a lower id.
DELETE dup
FROM author AS dup
INNER JOIN author AS keep
        ON keep.NAME = dup.NAME
       AND keep.id < dup.id

我从知网爬下的数据目前是这样存储的，以后可能会更多

把数据存入 neo4j 的流程也极其简单（这里指导入步骤，不是数据库里的“存储过程”）：

连接mysql数据库，neo4j图数据库
从mysql取出实体，创建到neo4j中的节点
从mysql取出关系，匹配neo4j中的节点
创建关系
关闭数据库

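需要的第三方库有 pymysql、py2neo。下面的代码片段省略了导入语句，运行前需要补上：import time、import pymysql、from py2neo import Graph, Node, Relationship, NodeMatcher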

def save_article(cursor, graph):
    """Copy every row of the MySQL ``article`` table into Neo4j as an ``Article`` node.

    A row that raises while being converted or created is reported and
    counted as a failure; the remaining rows are still processed.

    :param cursor: MySQL cursor used to run the SELECT
    :param graph: Neo4j graph connection the nodes are created on
    """
    print("正在存储文章节点，请稍等...")
    # Node property names, listed in the same order as the selected columns.
    # Note that the ``keyss`` column is stored under the ``keys`` property.
    prop_names = ('url', 'title', 'summary', 'keys', 'funds', 'doi', 'album', 'special', 'classNo')
    cursor.execute('SELECT url, title, summary, keyss, funds, doi, album, special, classNo FROM article')
    stored = failed = 0
    for record in cursor.fetchall():
        try:
            props = {name: record[pos] for pos, name in enumerate(prop_names)}
            graph.create(Node('Article', **props))
        except Exception as exc:
            print('【失败】存储文章节点', exc)
            failed += 1
        else:
            stored += 1
    print('所有文章节点存储完毕，成功存储{}个，失败{}个\n'.format(stored, failed))


def save_author(cursor, graph):
    """Copy every row of the MySQL ``author`` table into Neo4j as an ``Author`` node.

    A row that raises while being converted or created is reported and
    counted as a failure; the remaining rows are still processed.

    :param cursor: MySQL cursor used to run the SELECT
    :param graph: Neo4j graph connection the nodes are created on
    """
    print("正在存储作者节点，请稍等...")
    # Node property names, listed in the same order as the selected columns.
    prop_names = ('url', 'name', 'major', 'sum_publish', 'sum_download', 'fields')
    cursor.execute('SELECT url, name, major, sum_publish, sum_download, fields FROM author')
    stored = failed = 0
    for record in cursor.fetchall():
        try:
            props = {name: record[pos] for pos, name in enumerate(prop_names)}
            graph.create(Node('Author', **props))
        except Exception as exc:
            print('【失败】存储作者节点', exc)
            failed += 1
        else:
            stored += 1
    print('所有作者节点存储完毕，成功存储{}个，失败{}个\n'.format(stored, failed))
    
def save_re_article_author(cursor, graph):
    """Create the article -> author relationships in Neo4j.

    Reads every (url_article, url_author) pair from the MySQL
    ``re_article_author`` table, looks up the ``Article`` and ``Author``
    nodes whose ``url`` property matches, and links them with a ``作者``
    relationship. Other entity relationships can be built the same way.

    A pair is counted as failed when either node is missing from the graph
    or when the lookup/creation raises.

    :param cursor: MySQL cursor object
    :param graph: Neo4j database connection
    """
    print("正在存储文献作者关系，请稍等...")
    sql = 'SELECT url_article,url_author FROM re_article_author'
    cursor.execute(sql)
    rows = cursor.fetchall()
    # The matcher does not depend on the row, so build it once.
    matcher = NodeMatcher(graph)
    success, fail = 0, 0
    for row in rows:
        url_article = row[0]
        url_author = row[1]
        try:
            # Match on the property value instead of formatting the URL into a
            # Cypher string, so URLs containing quotes or backslashes still work.
            node_article = matcher.match('Article', url=url_article).first()
            node_author = matcher.match('Author', url=url_author).first()
            # first() yields None when nothing matched; test for that explicitly
            # rather than relying on the truthiness of a node object.
            if node_article is not None and node_author is not None:
                relation = Relationship(node_article, '作者', node_author)
                graph.create(relation)
                success += 1
            else:
                fail += 1
        except Exception as e:
            print('【失败】文章作者关系', e)
            # Count errors too, so the summary below adds up to len(rows).
            fail += 1

    print('所有文献作者关系存储完毕，成功存储{}个，失败{}个\n'.format(success, fail))

if __name__ == '__main__':
    # Rebuild the Neo4j graph from the MySQL `cnki` database and report the elapsed time.
    print('主程序开始执行，当前时间：{}\n'.format(time.strftime('%H:%M:%S', time.localtime())))
    start = time.time()

    # NOTE(review): credentials are hard-coded; move them to configuration
    # before using this outside a local experiment.
    db = pymysql.connect(host='localhost', user='root', passwd='123456', db='cnki', port=3306, charset='utf8')
    try:
        # The cursor is closed when the with-block exits.
        with db.cursor() as curr:
            # Initialise the graph database.
            g = Graph(auth=('neo4j', '123456'))
            # WARNING: this deletes every node and relationship before re-importing.
            g.run('match(n) detach delete n')
            save_article(curr, g)
            save_author(curr, g)
            save_re_article_author(curr, g)
    finally:
        # Release the MySQL connection even if the import aborts part-way.
        db.close()

    # Round the total once up front so the seconds field can never print as "60".
    t = round(time.time() - start)
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    print("程序耗时 {:.0f}时 {:.0f}分 {:.0f}秒".format(h, m, s))