增加了对爬取到的论文信息整合进一个csv中并去除重复行的功能
import pandas as pd
import os
def merge_csv_files(data_dir=r'.\data', out_path=r"./data.csv"):
    """Merge every CSV under *data_dir* into one frame, drop duplicate rows,
    and write the result to *out_path*.

    Each input file is read with its first column as the index (they were
    produced by ``DataFrame.to_csv`` with the default index). The output is
    encoded utf-8-sig so Excel opens the Chinese text correctly.

    Returns the merged, de-duplicated DataFrame.
    """
    frames = []
    for fname in os.listdir(data_dir):
        path = os.path.join(data_dir, fname)
        frames.append(pd.read_csv(path, index_col=0))
    # DataFrame.append was removed in pandas 2.0; concat once instead of
    # appending inside the loop (which was also quadratic).
    data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    data = data.drop_duplicates()
    data.to_csv(out_path, encoding='utf-8-sig')
    return data


if __name__ == "__main__":
    merge_csv_files()
将csv导入Neo4j数据库中
在这里插入代码片 def __init__(self):
"""建立连接"""
link = Graph("http://localhost//:7474", username="neo4j", password="2922610627")
self.graph = link
# 定义label
self.Paper = '文章'
self.Author = '作者'
self.Organ = '来源单位'
self.Keyword = '关键词'
self.lists={'Author':'作者','Organ':'来源单位','Keyword':'关键词'}
self.graph.delete_all()
def create_node(self, node_list_paper, node_list_author,node_list_organ,node_list_keyword):
"""建立节点"""
#对文章,属性包括名字,作者,来源,关键词
for line in node_list_paper:
paper = Node(self.Paper, title=line[0],author=line[1],organ=line[2],keyword=line[3])
self.graph.create(paper)
#对作者
for name in node_list_author:
value_node = Node(self.Author, name=name)
self.graph.create(value_node)
#对机构
for name2 in node_list_organ:
value2_node = Node(self.Organ, name=name2)
self.graph.create(value2_node)
#对关键词
for name3 in node_list_keyword:
value3_node = Node(self.Keyword, name=name3)
self.graph.create(value3_node)
# NOTE(review): this fragment sits below create_node but its enclosing `def`
# (and the definition of df_data) fall outside this excerpt — presumably a
# create_relation(self, df_data) method; confirm against the full file.
# It walks a relation table row by row and links each paper node to its
# author / organization / keyword node.
for m in range(0, len(df_data)):
# relation kind for this row: 'Author', 'Organ' or 'Keyword' (keys of self.lists)
index = df_data['relation'][m]
if index=='Keyword':
# Keywords get edges in BOTH directions; MERGE avoids duplicating them on re-runs.
# NOTE(review): values are %-interpolated straight into the Cypher text — a
# quote in a title or name breaks the query (injection risk); Cypher
# parameters would be safer.
query1 = "match(p:%s),(q:%s) where p.title='%s'and q.name='%s' merge (p)-[rel:%s]->(q)" % (
self.Paper, self.lists[index], df_data['name'][m], df_data['name2'][m], index)
query2= "match(q:%s),(p:%s) where p.title='%s'and q.name='%s' merge (q)-[rel:%s]->(p)" % (
self.lists[index],self.Paper, df_data['name'][m], df_data['name2'][m], index)
self.graph.run(query1)
self.graph.run(query2)
else:
# Author/Organ edges go one way only and use CREATE, so re-running the
# import duplicates them — presumably acceptable after delete_all; verify.
query = "match(p:%s),(q:%s) where p.title='%s'and q.name='%s' create (p)-[rel:%s]->(q)" % (
self.Paper, self.lists[index], df_data['name'][m], df_data['name2'][m], index)
try:
self.graph.run(query)
except Exception as e:
# Best-effort: log the failure and continue with the next row.
print(e)