抓取爱问知识人问题，保存至数据库。

最新推荐文章于 2022-07-15 23:11:33 发布

Waterkong

最新推荐文章于 2022-07-15 23:11:33 发布

阅读量474

点赞数 1

分类专栏： python与爬虫 mysql 文章标签： python MYSQL

本文链接：https://blog.csdn.net/Waterkong/article/details/73017325

版权

python与爬虫同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

mysql

2 篇文章 0 订阅

订阅专栏

新手上路，欢迎批评。博客暂时只为记录学习过程。

抓取结果：

整体思路：

1. 连接数据库并建好一张表，用于存储以下信息：问题、回答者、回答时间、回答内容。

def createtable():
    """(Re)create the ``goodAnswer`` table in the ``likelearn`` database.

    Any existing ``goodAnswer`` table is dropped first, so rows stored by a
    previous run are lost. The new table has four NOT NULL columns:
    ``question``, ``a_name``, ``a_time`` and ``a_content``.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="sql123",
        database="likelearn",
        charset="utf8",
    )
    try:
        cursor = db.cursor()
        # Start from a clean table on every run.
        cursor.execute("DROP TABLE IF EXISTS goodAnswer")
        sql = """CREATE TABLE goodAnswer(
         question text not null,
         a_name varchar(20) not null,
         a_time varchar(20) not null,
         a_content text not null) CHARSET=utf8 """
        cursor.execute(sql)
    finally:
        # Release the connection even when one of the statements fails.
        db.close()

2.抓取一个问题汇总页面 http://iask.sina.com.cn/c/74-goodAnswer-1-new.html 写一个函数geturls()获取该页面上指向其他问题汇总页面的链接，存储在全局变量 pageurls 中。

注释掉的最后一行用于递归抓取全部 100 页问题汇总页面的链接；这里我只抓取了第 1、2、3、100 这四个页面。

def geturls(url):
    """Collect pagination links from one question-list page into ``pageurls``.

    Args:
        url: Site-relative path of the list page, e.g.
            ``"/c/74-goodAnswer-1-new.html"``.

    Every ``href`` found inside the ``<div class="page mt30">`` element that
    is not already in the global ``pageurls`` list is appended to it. If the
    page has no such element, nothing is added.
    """
    global pageurls
    # Drop leading slashes so the joined URL has exactly one "/" after the host.
    url = "http://iask.sina.com.cn/" + str(url).lstrip("/")
    print(url)
    req = getpage(url)
    soup = BeautifulSoup(req.text, "html.parser")
    pager = soup.find("div", {"class": "page mt30"})
    if pager is None:
        # No pagination block (error page or changed layout): nothing to collect.
        return
    for anchor in pager.find_all("a"):
        href = anchor.attrs.get("href")
        if href is not None and href not in pageurls:
            pageurls.append(href)
            # Enable the call below to follow every newly found pagination
            # link recursively and collect all list pages:
            # geturls(href)

3. 两层循环：第一层遍历 pageurls，每次获得一个问题汇总页面，并用函数 getlinks() 获得该页面上所有具体问题的链接。

第二层抓取具体问题的回答页面，并用函数 getdetail() 获得最佳回答的各个字段。

# 获取页面的所有问题链接
def getlinks(soup):
    """Return the link target of every ``question-title`` block on a page.

    Args:
        soup: Parsed list page; any object exposing BeautifulSoup's
            ``find_all`` works.

    Returns:
        list: ``href`` values in document order. Title blocks that contain
        no ``<a>`` element, or whose ``<a>`` has no ``href``, are skipped
        instead of raising.
    """
    links = []
    for title in soup.find_all("div", {"class": "question-title"}):
        anchor = title.a
        href = anchor.get("href") if anchor is not None else None
        if href is not None:
            links.append(href)
    return links

# 获取问题的回答者，回答时间回答内容 
def getdetail(soup):
    """Extract the question and its displayed answer from a question page.

    Args:
        soup: Parsed question page (BeautifulSoup document).

    Returns:
        list: ``[question, answerer name, answer time, answer content]``,
        each value passed through ``tool``. An element that is absent from
        the page is replaced by a blank placeholder before cleaning.
    """
    missing = " "  # placeholder handed to tool() for absent elements

    # The answerer's name (<a>) and the answer time (<span>) are both read
    # from the same "answer_tip" block, so look it up only once.
    tip = soup.find("div", {"class": "answer_tip"})
    a_name = tip.a if tip is not None else None
    a_time = tip.span if tip is not None else None

    a_content = soup.find("div", {"class": "answer_text"})
    if a_content is not None:
        # Prefer the nested hidden div when there is one.
        # NOTE(review): presumably it holds the full text of a folded
        # answer -- confirm against the live page markup.
        hidden = a_content.find("div", {"style": "display: none"})
        if hidden is not None:
            a_content = hidden

    ques = soup.find("div", {"class": "question_text"})

    # Substitute the placeholder so that a missing element is never stored
    # as the literal text "None".
    parts = (ques, a_name, a_time, a_content)
    return [tool(missing if part is None else part) for part in parts]

4. 连接数据库，将抓取到的各个字段存储在已经建好的表 goodAnswer 中。

插入方法参考： http://www.runoob.com/python3/python3-mysql.html

以下是全部代码：

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import time
# Pagination hrefs of the question-list pages; filled by geturls() and
# iterated by main().
pageurls = []

# 获取一个页面
def getpage(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent header.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled connection would block forever.

    Returns:
        The ``requests`` response object. It is returned whatever the
        status code is; a non-200 status only prints a warning.
    """
    headers = {'user-agent':  'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    req = requests.get(url, headers=headers, timeout=timeout)
    if req.status_code != 200:
        print("获取失败，请检查URL:", url)
    return req

# 获取页面的所有问题链接
def getlinks(soup):
    """Return the link target of every ``question-title`` block on a page.

    Args:
        soup: Parsed list page; any object exposing BeautifulSoup's
            ``find_all`` works.

    Returns:
        list: ``href`` values in document order. Title blocks that contain
        no ``<a>`` element, or whose ``<a>`` has no ``href``, are skipped
        instead of raising.
    """
    links = []
    for title in soup.find_all("div", {"class": "question-title"}):
        anchor = title.a
        href = anchor.get("href") if anchor is not None else None
        if href is not None:
            links.append(href)
    return links

# 获取所有的页面链接
def geturls(url):
    """Collect pagination links from one question-list page into ``pageurls``.

    Args:
        url: Site-relative path of the list page, e.g.
            ``"/c/74-goodAnswer-1-new.html"``.

    Every ``href`` found inside the ``<div class="page mt30">`` element that
    is not already in the global ``pageurls`` list is appended to it. If the
    page has no such element, nothing is added.
    """
    global pageurls
    # Drop leading slashes so the joined URL has exactly one "/" after the host.
    url = "http://iask.sina.com.cn/" + str(url).lstrip("/")
    print(url)
    req = getpage(url)
    soup = BeautifulSoup(req.text, "html.parser")
    pager = soup.find("div", {"class": "page mt30"})
    if pager is None:
        # No pagination block (error page or changed layout): nothing to collect.
        return
    for anchor in pager.find_all("a"):
        href = anchor.attrs.get("href")
        if href is not None and href not in pageurls:
            pageurls.append(href)
            # Enable the call below to follow every newly found pagination
            # link recursively and collect all list pages:
            # geturls(href)

# 获取问题的回答者，回答时间回答内容 
def getdetail(soup):
    """Extract the question and its displayed answer from a question page.

    Args:
        soup: Parsed question page (BeautifulSoup document).

    Returns:
        list: ``[question, answerer name, answer time, answer content]``,
        each value passed through ``tool``. An element that is absent from
        the page is replaced by a blank placeholder before cleaning.
    """
    missing = " "  # placeholder handed to tool() for absent elements

    # The answerer's name (<a>) and the answer time (<span>) are both read
    # from the same "answer_tip" block, so look it up only once.
    tip = soup.find("div", {"class": "answer_tip"})
    a_name = tip.a if tip is not None else None
    a_time = tip.span if tip is not None else None

    a_content = soup.find("div", {"class": "answer_text"})
    if a_content is not None:
        # Prefer the nested hidden div when there is one.
        # NOTE(review): presumably it holds the full text of a folded
        # answer -- confirm against the live page markup.
        hidden = a_content.find("div", {"style": "display: none"})
        if hidden is not None:
            a_content = hidden

    ques = soup.find("div", {"class": "question_text"})

    # Substitute the placeholder so that a missing element is never stored
    # as the literal text "None".
    parts = (ques, a_name, a_time, a_content)
    return [tool(missing if part is None else part) for part in parts]

# 清理不必要的标签和特殊符号
# Clean-up patterns used by tool(), applied in this order. Compiled once at
# import time; raw strings keep "\s" and "\W" from being read as (invalid)
# string escape sequences.
_TOOL_PATTERNS = (
    re.compile(r'<pre>|</pre>'),
    re.compile(r'<span.*?>|</span>" "*?|</span>'),
    re.compile(r'<div.*?>|</div>'),
    re.compile(r'<a.*?>|</a>'),
    re.compile(r'\s+'),
    re.compile(r'\W'),
)


def tool(x):
    """Reduce ``x`` to its word characters, dropping markup and punctuation.

    Args:
        x: Any object; it is converted with ``str()`` first (so ``None``
            becomes the text ``"None"``).

    Returns:
        str: The text with ``<pre>``, ``<span>``, ``<div>`` and ``<a>`` tags
        removed, then all whitespace, then every remaining non-word
        character (punctuation included). Tags of other kinds are not
        removed as a unit, so their names survive as text (``<p>`` leaves
        ``p``).
    """
    text = str(x)
    for pattern in _TOOL_PATTERNS:
        text = pattern.sub("", text)
    return text

def createtable():
    """(Re)create the ``goodAnswer`` table in the ``likelearn`` database.

    Any existing ``goodAnswer`` table is dropped first, so rows stored by a
    previous run are lost. The new table has four NOT NULL columns:
    ``question``, ``a_name``, ``a_time`` and ``a_content``.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="sql123",
        database="likelearn",
        charset="utf8",
    )
    try:
        cursor = db.cursor()
        # Start from a clean table on every run.
        cursor.execute("DROP TABLE IF EXISTS goodAnswer")
        sql = """CREATE TABLE goodAnswer(
         question text not null,
         a_name varchar(20) not null,
         a_time varchar(20) not null,
         a_content text not null) CHARSET=utf8 """
        cursor.execute(sql)
    finally:
        # Release the connection even when one of the statements fails.
        db.close()

# 逻辑控制主函数
def main():
    """Crawl the question-list pages and store each answer in MySQL.

    Collects the pagination links found on the first list page, visits
    every collected list page, opens each question linked from it and
    inserts the fields returned by ``getdetail`` into the ``goodAnswer``
    table. Each row is committed on its own; a failed insert is rolled back
    and reported, and the crawl continues with the next question.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="sql123",
        database="likelearn",
        charset="utf8",
    )
    # Placeholders let the driver quote and escape the scraped values rather
    # than splicing them into the statement text.
    sql = ("INSERT INTO goodAnswer(question, a_name, a_time, a_content) "
           "VALUES (%s, %s, %s, %s)")
    try:
        cursor = db.cursor()

        # Only the links on the first list page are collected; according to
        # the original author these are pages 1, 2, 3 and 100.
        geturls("/c/74-goodAnswer-1-new.html")
        for url in pageurls:
            page_url = "http://iask.sina.com.cn" + str(url)
            page = getpage(page_url)
            soup = BeautifulSoup(page.text, "html.parser")
            print("获得该页面所有问题的链接", page_url)
            for link in getlinks(soup):
                print("获得goodAnswer页面详情", link)
                # Drop leading slashes so the URL has a single "/" after the host.
                a_page = getpage("http://iask.sina.com.cn/" + str(link).lstrip("/"))
                a_soup = BeautifulSoup(a_page.text, "html.parser")
                print("获得问题最好回答的各个信息段")
                a_item = getdetail(a_soup)
                try:
                    print("# 执行 sql 语句")
                    cursor.execute(sql, tuple(a_item))
                    print("# 提交到数据库执行")
                    db.commit()
                except pymysql.MySQLError as err:
                    # Report the database error instead of hiding it, then
                    # undo the failed statement and carry on.
                    print("# 发生错误回滚", err)
                    db.rollback()
                # Pause between questions (presumably to avoid hammering
                # the site).
                time.sleep(2)
    finally:
        # Close the connection even if the crawl aborts part-way.
        db.close()
# Script entry point: recreate the goodAnswer table, then run the crawl.
# NOTE(review): these calls also run when the module is imported; consider
# wrapping them in an `if __name__ == "__main__":` guard.
createtable()
main()

Waterkong

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
抓取爱问知识人问题，保存至数据库。

新手上路，欢迎批评。博客暂时只为记录学习过程。抓取结果：整体思路：1.链接数据库并建好一张表。要存储以下信息，问题，回答者，回答时间，回答内容。def createtable(): # 打开数据库连接 db = pymysql.connect("localhost","root","sql123","likelearn", charset='utf8' ) # 使用 curs
复制链接

扫一扫