Approach: use Selenium's webdriver to click, scroll down and otherwise drive the page. Once the page has fully loaded, use BeautifulSoup to find the relevant tags. Put the data into lists, then create an Excel workbook with xlwt and write the data into it.
Drawbacks: performance problems are easy to run into. A topic can have a great many answers, and writing rows into Excel starts to stall once you get past ten thousand or so. Also, this code does not close a page after it has finished with it; it simply opens the next page and keeps going, and only after every page has been processed are they closed, stack-style, from last to first, which eats a lot of memory. (I later found out that webdriver has two ways to exit: close and quit. close only closes the current tab and does not clear the cache, while quit closes the whole browser. If you are crawling a lot of pages, remember to use quit.)
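A minimal sketch of the close/quit difference described above (assuming a local chromedriver; the URL is just a placeholder):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com')
driver.close()  # closes only the current tab/window; the browser process and its cache stay alive

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com')
driver.quit()   # shuts down the whole browser and the chromedriver process, freeing the memory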
Unsolved problem: I never managed to get the data into MySQL; it kept throwing all kinds of errors. The database-related code is at the end; if anyone can see what is wrong, please point it out!
The error message looks roughly like this:
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near "inset into answerInfo(answerer,answer,upvoteCount,commentCount) values ('%s','%s','%s','%s')"""%(pymysql.escape_string(answerer),answer,voteCount,commentCount) at(xxx line 1")
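The message itself hints at the likely cause: MySQL reports the syntax error right at "inset into", i.e. the keyword insert is misspelled. Beyond that, it is safer to let pymysql bind the values instead of building the SQL with % formatting. A minimal sketch of how the insert could look, assuming the answerInfo table from the error message; the connection parameters and the four values are placeholders (in the crawler they would come from the answer-parsing loop below):

import pymysql

# placeholder values; in the crawler these come from the answer-parsing loop
answerer, answer, upvoteCount, commentCount = 'someone', 'some answer', 10, 2

conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='your_db', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        # note the correct keyword INSERT; the %s placeholders are bound by execute(), which escapes the values
        sql = 'insert into answerInfo(answerer, answer, upvoteCount, commentCount) values (%s, %s, %s, %s)'
        cursor.execute(sql, (answerer, answer, upvoteCount, commentCount))
    conn.commit()
finally:
    conn.close()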
beautifulsoup4 documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
Code
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
import xlwt
class ZhiHu():
    def __init__(self, topicURL):  # class initialisation
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}  # request header that mimics the Chrome browser
        self.topicURL = topicURL  # the topic URL to visit
        self.title = ''    # question title, filled in by getHtml
        self.pubDate = ''  # question publish date, filled in by getHtml
    def getHtml(self, url):
        driver = webdriver.Chrome()
        driver.get(url)
        # click the "view all answers" button
        driver.find_element_by_class_name('QuestionMainAction').click()
        time.sleep(3)
        bs = BeautifulSoup(driver.page_source, 'lxml')
        # keep scrolling down until the button at the bottom of the answer list appears
        while True:
            b = bs.find('button', {'class': 'Button QuestionAnswers-answerButton Button--blue Button--spread'})
            if b is not None:
                break
            else:
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                bs = BeautifulSoup(driver.page_source, 'lxml')
        self.title = bs.find('h1', class_='QuestionHeader-title').string
        self.pubDate = json.loads(bs.find('script', {'type': 'application/ld+json'}).get_text())["pubDate"][0:10]
        html = bs.find_all('div', {'class': 'List-item'})
        print(self.title + "\t:\tthis question has %d answers in total" % len(html))
        return html
    def downLoadToTxt(self, html, path):
        for tag in html:
            content = []
            content.append(self.title)
            content.append(self.pubDate)
            # answer body
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answer author, from the JSON held in the data-zop attribute
            answerer = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date (the last 10 characters of the timestamp text)
            answerTime = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count, from the JSON held in data-za-extra-module
            upvoteCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            with open(path, 'a', encoding='utf-8') as f:  # 'a' appends; the file is created automatically if it does not exist
                for field in content:
                    f.write(field + '\t')
                f.write('\n')
            print(answerer + '\n' + str(upvoteCount) + '\n' + str(commentCount) + '\n\n\n')
    def downLoadToExcel(self, html):
        result = []
        head = ['question', 'publish date', 'answerer', 'answer date', 'upvotes', 'comments', 'answer body']
        result.append(head)
        for tag in html:
            content = []
            content.append(self.title)
            content.append(self.pubDate)
            # answer body
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answer author
            answerer = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date
            answerTime = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count
            upvoteCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div', class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            result.append(content)
        workbook = xlwt.Workbook(encoding='utf-8')
        booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
        for i, row in enumerate(result):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        workbook.save(self.title + '.xls')
    def getAnswerItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        # number of scroll-downs
        i = 5
        # scroll down in a loop
        while i > 0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i = i - 1
        bs = BeautifulSoup(driver.page_source, 'lxml')
        # all the answer items
        AnswerItems = bs.find_all('div', class_='ContentItem AnswerItem')
        AnswerItemURLs = []
        preURL = "https://www.zhihu.com"
        for item in AnswerItems:
            tailURL = item.find('a')['href']
            URL = preURL + tailURL
            AnswerItemURLs.append(URL)
            print(URL)
        print("Collected %d question links in total!" % len(AnswerItemURLs))
        return AnswerItemURLs
    def getArticleItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        # number of scroll-downs
        i = 5
        # scroll down in a loop
        while i > 0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i = i - 1
        bs = BeautifulSoup(driver.page_source, 'lxml')
        # all the column-article items
        ArticleItems = bs.find_all('div', class_='ContentItem ArticleItem')
        ArticleItemURLs = []
        preURL = "https:"
        for item in ArticleItems:
            tailURL = item.find('a')['href']
            URL = preURL + tailURL
            ArticleItemURLs.append(URL)
            print(URL)
        print("Collected %d article links in total!" % len(ArticleItemURLs))
        return ArticleItemURLs
zhihu = ZhiHu("topic URL")  # fill in the topic page URL here
AnswerItemURLs = zhihu.getAnswerItemURLs()
for url in AnswerItemURLs:
    html = zhihu.getHtml(url)
    zhihu.downLoadToExcel(html)
print("ok")
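The text export works on the same parsed list, so it can be dropped into the loop in place of (or next to) the Excel call; the output path is just an example:

for url in AnswerItemURLs:
    html = zhihu.getHtml(url)
    zhihu.downLoadToTxt(html, 'answers.txt')  # tab-separated lines, appended per answer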