Completing the attributes of Neo4j knowledge-graph entities with a Python crawler

This article is original; please give attribution if reposting.

The general idea: pull entity A and its ID out of the Neo4j database, pass A into a crawler that searches Baidu Baike and collects the result data, then write the returned data back to Neo4j as property values of entity A. During the write-back, matching entity A by its ID greatly speeds up the MATCH.
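To make that ID-based write-back concrete, here is a minimal sketch contrasting the two ways of locating the node (the label `orgnames` and property `descri` come from the code below; the ID value and text are placeholders). Looking the node up by `id(e)` goes straight to the node, while matching on a property without an index forces a scan:

```
# Fast: locate the node by its internal ID (what this post does)
fast = "MATCH (e) WHERE id(e) = %s SET e.descri = '%s'" % (42, 'summary text')

# Slower: locate it by property; without an index on :orgnames(name),
# Neo4j has to scan every orgnames node for each write
slow = "MATCH (e:orgnames {name: '%s'}) SET e.descri = '%s'" % ('some org', 'summary text')
```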

```
import requests
import urllib.parse
from bs4 import BeautifulSoup
from neo4j.v1 import GraphDatabase, basic_auth, TRUST_ALL_CERTIFICATES

# Connect to the database with the neo4j.v1 (1.x) Bolt driver
driver = GraphDatabase.driver("bolt://111111111:7687", auth=basic_auth("neo4j", "neo4j"))


class Get_datas():
    def __init__(self):
        self.session = driver.session()  # open a session on the driver
        # Request headers for Baidu Baike, used by the crawler below.
        self.header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'baike.baidu.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_neo4j_orgname(self):
        '''Pull every orgnames entity and its ID out of Neo4j, pass the name to the
        get_spiders_data crawler to search Baidu Baike, then write the returned data
        back to Neo4j as properties of that entity. Matching the entity by its ID
        during the write-back greatly speeds up the MATCH.'''
        dis = 0
        coms = self.session.run("MATCH (a:orgnames) RETURN a.name,id(a)").values()
        set_list = []
        for c in coms[dis:]:
            dis += 1
            print("count:", dis)
            orgids = c[1]
            orgname = c[0]
            datad = self.get_spiders_data(orgname)
            if datad:
                # Clean the scraped data and write it back
                huzs = str(datad).replace('\\n', '').replace('xa0', '').replace('[3]\\', '')
                huz = huzs.split(',')
                huzd = huz[0].replace("['", "").replace("']", "").replace(" ", "")
                btion = str(huz[1:]).replace("[", "").replace("]", "").replace("}\\''", "")
                query = "match(e) where id(e)=%s set e.descri = '%s', e.btion = '%s' " % (orgids, huzd, btion)  # write into neo4j
                self.session.run(query)
                # print("successfully filled in this organisation's properties")
        self.session.close()  # the class has no self.browser; close the session instead

    def get_spiders_data(self, orgname):
        # Percent-encode the Chinese name and build the request URL
        # (see the short encoding example after this code block).
        new = urllib.parse.quote(orgname)
        urls = 'https://baike.baidu.com/item/' + new  # assemble the URL
        response = requests.get(urls, headers=self.header)  # request the page
        soups = BeautifulSoup(response.text, 'lxml')  # parse the HTML (.text, not the Response object)
        try:
            data1 = soups.find_all("div", class_="lemma-summary")  # tags that hold the summary
            shuju1 = [d.text for d in data1]  # strip the tags, keep the text
            datad = soups.find_all("div", class_="basic-info cmn-clearfix")
            dataleft = soups.find('dl', class_="basicInfo-block basicInfo-left")
            ltdata = str(dataleft).replace('<dl class="basicInfo-block basicInfo-left">\n', '{')  # data clean-up
            dataright = soups.find('dl', class_="basicInfo-block basicInfo-right")
            rtdata = str(dataright).replace('<dl class="basicInfo-block basicInfo-right">\n', '{')  # data clean-up
            if shuju1:
                return shuju1, ltdata, rtdata
        except Exception as e:
            print(e)

    def mains(self):
        self.get_neo4j_orgname()


if __name__ == '__main__':
    get = Get_datas()
    get.mains()
```
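As a side note on the encoding step in `get_spiders_data`: `urllib.parse.quote` percent-encodes the UTF-8 bytes of the entity name so it can be appended to the Baike URL. A quick illustration (the entity name is just an example):

```
from urllib.parse import quote

name = '北京大学'  # example entity name
print(quote(name))  # %E5%8C%97%E4%BA%AC%E5%A4%A7%E5%AD%A6
print('https://baike.baidu.com/item/' + quote(name))
```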

Below is example code, implemented in Python, that builds a knowledge graph from crawled data:

```
import requests
from bs4 import BeautifulSoup
from py2neo import Graph, Node, Relationship

# Scrape a Zhihu search-results page for questions about Python
response = requests.get('https://www.zhihu.com/search?type=content&q=python')
soup = BeautifulSoup(response.content, 'html.parser')
question_tags = soup.find_all('div', class_='ContentItem-head')

# Extract each question's title and link
questions = []
for tag in question_tags:
    question = {}
    a_tag = tag.find('a')
    question['title'] = a_tag.get_text()
    question['link'] = 'https://www.zhihu.com' + a_tag['href']
    questions.append(question)

# Build the knowledge graph
graph = Graph('http://localhost:7474/db/data/', username='neo4j', password='password')
for question in questions:
    # Create the question node
    question_node = Node('Question', title=question['title'], link=question['link'])
    graph.create(question_node)

    # Fetch the question page and extract its description and answers
    response = requests.get(question['link'])
    soup = BeautifulSoup(response.content, 'html.parser')
    description_tag = soup.find('div', class_='QuestionDetail-main')
    if description_tag:
        description = description_tag.get_text().strip()
        if description:
            description_node = Node('Description', content=description)
            graph.create(description_node)
            graph.create(Relationship(question_node, 'HAS_DESCRIPTION', description_node))
    answer_tags = soup.find_all('div', class_='ContentItem AnswerItem')
    for tag in answer_tags:
        answer_node = Node('Answer', content=tag.find('div', class_='RichContent-inner').get_text().strip())
        graph.create(answer_node)
        graph.create(Relationship(question_node, 'HAS_ANSWER', answer_node))
```

Note: this example scrapes Zhihu questions and requires the py2neo and BeautifulSoup libraries. You also need Neo4j installed and running, and the connection details in the code changed to match your database.
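One caveat about the example above: `graph.create` inserts a new node on every run, so repeating the scrape will duplicate questions. A minimal sketch of deduplicating with py2neo's `merge` instead (assuming py2neo v4 or later and the same placeholder connection details):

```
from py2neo import Graph, Node

graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))

question_node = Node('Question', title='example title', link='https://www.zhihu.com/question/...')
# Merge on the Question label keyed by title: re-running updates the
# existing node instead of creating a duplicate.
graph.merge(question_node, 'Question', 'title')
```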